From e5f89249fa2ac24dd8cc57b5d1f022025c9d2819 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Thu, 14 Jan 2021 13:27:18 +0000
Subject: [PATCH] Apply clang-format on math_brute_force (#1104)

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 .../math_brute_force/FunctionList.cpp         |   79 +-
 .../math_brute_force/FunctionList.h           |  103 +-
 test_conformance/math_brute_force/Sleep.cpp   |  139 +-
 test_conformance/math_brute_force/Sleep.h     |    8 +-
 test_conformance/math_brute_force/Utility.cpp |   96 +-
 test_conformance/math_brute_force/Utility.h   |  217 +-
 test_conformance/math_brute_force/binary.cpp  | 1889 +++---
 .../math_brute_force/binaryOperator.cpp       | 1809 ++++--
 .../math_brute_force/binary_i.cpp             | 1555 +++--
 .../math_brute_force/binary_two_results_i.cpp | 1395 ++--
 test_conformance/math_brute_force/i_unary.cpp |  665 +-
 .../math_brute_force/macro_binary.cpp         | 1412 +++--
 .../math_brute_force/macro_unary.cpp          | 1063 ++--
 test_conformance/math_brute_force/mad.cpp     | 1646 ++---
 test_conformance/math_brute_force/main.cpp    | 1748 ++---
 .../math_brute_force/reference_math.cpp       | 5625 +++++++++--------
 .../math_brute_force/reference_math.h         |  378 +-
 test_conformance/math_brute_force/ternary.cpp | 1703 +++--
 test_conformance/math_brute_force/unary.cpp   | 1260 ++--
 .../math_brute_force/unary_two_results.cpp    | 1115 ++--
 .../math_brute_force/unary_two_results_i.cpp  |  865 ++-
 test_conformance/math_brute_force/unary_u.cpp |  745 ++-
 22 files changed, 14745 insertions(+), 10770 deletions(-)

diff --git a/test_conformance/math_brute_force/FunctionList.cpp b/test_conformance/math_brute_force/FunctionList.cpp
index a07fa069..c5185c6f 100644
--- a/test_conformance/math_brute_force/FunctionList.cpp
+++ b/test_conformance/math_brute_force/FunctionList.cpp
@@ -16,13 +16,13 @@
 #include "FunctionList.h"
 #include "reference_math.h"
 
-#define FTZ_ON  1
+#define FTZ_ON 1
 #define FTZ_OFF 0
-#define EXACT    0.0f
+#define EXACT 0.0f
 #define RELAXED_ON 1
 #define RELAXED_OFF 0
 
-#define STRINGIFY( _s)                  #_s
+#define STRINGIFY(_s) #_s
 
 // Only use ulps information in spir test
 #ifdef FUNCTION_LIST_ULPS_ONLY
@@ -51,25 +51,25 @@
         STRINGIFY(_name), _operator, { NULL }, { NULL }, { NULL }, _ulp, _ulp, \
             _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type      \
     }
-#define unaryF                NULL
-#define i_unaryF              NULL
-#define unaryF_u              NULL
-#define macro_unaryF          NULL
-#define binaryF               NULL
-#define binaryF_nextafter     NULL
-#define binaryOperatorF       NULL
-#define binaryF_i             NULL
-#define macro_binaryF         NULL
-#define ternaryF              NULL
-#define unaryF_two_results    NULL
-#define unaryF_two_results_i  NULL
+#define unaryF NULL
+#define i_unaryF NULL
+#define unaryF_u NULL
+#define macro_unaryF NULL
+#define binaryF NULL
+#define binaryF_nextafter NULL
+#define binaryOperatorF NULL
+#define binaryF_i NULL
+#define macro_binaryF NULL
+#define ternaryF NULL
+#define unaryF_two_results NULL
+#define unaryF_two_results_i NULL
 #define binaryF_two_results_i NULL
-#define mad_function          NULL
+#define mad_function NULL
 
-#define reference_sqrt        NULL
-#define reference_sqrtl       NULL
-#define reference_divide      NULL
-#define reference_dividel     NULL
+#define reference_sqrt NULL
+#define reference_sqrtl NULL
+#define reference_divide NULL
+#define reference_dividel NULL
 #define reference_relaxed_divide NULL
 
 #else // FUNCTION_LIST_ULPS_ONLY
@@ -102,24 +102,27 @@
             _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type      \
     }
 
-extern const vtbl _unary;               // float foo( float )
-extern const vtbl _unary_u;             // float foo( uint ),  double foo( ulong )
-extern const vtbl _i_unary;             // int foo( float )
-extern const vtbl _macro_unary;         // int foo( float ),  returns {0,1} for scalar, { 0, -1 } for vector
-extern const vtbl _binary;              // float foo( float, float )
-extern const vtbl _binary_nextafter;    // float foo( float, float ), special handling for nextafter
-extern const vtbl _binary_operator;     // float .op. float
-extern const vtbl _macro_binary;        // int foo( float, float ), returns {0,1} for scalar, { 0, -1 } for vector
-extern const vtbl _binary_i;            // float foo( float, int )
-extern const vtbl _ternary;             // float foo( float, float, float )
-extern const vtbl _unary_two_results;   // float foo( float, float * )
+extern const vtbl _unary; // float foo( float )
+extern const vtbl _unary_u; // float foo( uint ),  double foo( ulong )
+extern const vtbl _i_unary; // int foo( float )
+extern const vtbl _macro_unary; // int foo( float ),  returns {0,1} for scalar,
+                                // { 0, -1 } for vector
+extern const vtbl _binary; // float foo( float, float )
+extern const vtbl _binary_nextafter; // float foo( float, float ), special
+                                     // handling for nextafter
+extern const vtbl _binary_operator; // float .op. float
+extern const vtbl _macro_binary; // int foo( float, float ), returns {0,1} for
+                                 // scalar, { 0, -1 } for vector
+extern const vtbl _binary_i; // float foo( float, int )
+extern const vtbl _ternary; // float foo( float, float, float )
+extern const vtbl _unary_two_results; // float foo( float, float * )
 extern const vtbl _unary_two_results_i; // float foo( float, int * )
 extern const vtbl _binary_two_results_i; // float foo( float, float, int * )
-extern const vtbl _mad_tbl;             // float mad( float, float, float )
+extern const vtbl _mad_tbl; // float mad( float, float, float )
 
 #define unaryF &_unary
 #define i_unaryF &_i_unary
-#define unaryF_u  &_unary_u
+#define unaryF_u &_unary_u
 #define macro_unaryF &_macro_unary
 #define binaryF &_binary
 #define binaryF_nextafter &_binary_nextafter
@@ -127,10 +130,10 @@ extern const vtbl _mad_tbl;             // float mad( float, float, float )
 #define binaryF_i &_binary_i
 #define macro_binaryF &_macro_binary
 #define ternaryF &_ternary
-#define unaryF_two_results  &_unary_two_results
-#define unaryF_two_results_i  &_unary_two_results_i
-#define binaryF_two_results_i  &_binary_two_results_i
-#define mad_function        &_mad_tbl
+#define unaryF_two_results &_unary_two_results
+#define unaryF_two_results_i &_unary_two_results_i
+#define binaryF_two_results_i &_binary_two_results_i
+#define mad_function &_mad_tbl
 
 #endif // FUNCTION_LIST_ULPS_ONLY
 
@@ -325,4 +328,4 @@ const Func functionList[] = {
     OPERATOR_ENTRY(not, "!", 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
 };
 
-const size_t functionListCount = sizeof( functionList ) / sizeof( functionList[0] );
+const size_t functionListCount = sizeof(functionList) / sizeof(functionList[0]);
diff --git a/test_conformance/math_brute_force/FunctionList.h b/test_conformance/math_brute_force/FunctionList.h
index c22bceeb..e47eb729 100644
--- a/test_conformance/math_brute_force/FunctionList.h
+++ b/test_conformance/math_brute_force/FunctionList.h
@@ -22,80 +22,77 @@
 #include <unistd.h>
 #endif
 
-#if defined( __APPLE__ )
-    #include <OpenCL/opencl.h>
+#if defined(__APPLE__)
+#include <OpenCL/opencl.h>
 #else
-    #include <CL/cl.h>
+#include <CL/cl.h>
 #endif
 
 #include "harness/mt19937.h"
 
-typedef union fptr
-{
-    void    *p;
-    double  (*f_f)(double);
-    double  (*f_u)(cl_uint);
-    int     (*i_f)(double);
-    int     (*i_f_f)(float);
-    float   (*f_ff_f)(float, float);
-    double  (*f_ff)(double, double);
-    int     (*i_ff)(double, double);
-    double  (*f_fi)(double, int);
-    double  (*f_fpf)(double, double*);
-    double  (*f_fpI)(double, int*);
-    double  (*f_ffpI)(double, double, int*);
-    double  (*f_fff)(double, double, double );
-    float   (*f_fma)(float, float, float, int);
-}fptr;
+typedef union fptr {
+    void *p;
+    double (*f_f)(double);
+    double (*f_u)(cl_uint);
+    int (*i_f)(double);
+    int (*i_f_f)(float);
+    float (*f_ff_f)(float, float);
+    double (*f_ff)(double, double);
+    int (*i_ff)(double, double);
+    double (*f_fi)(double, int);
+    double (*f_fpf)(double, double *);
+    double (*f_fpI)(double, int *);
+    double (*f_ffpI)(double, double, int *);
+    double (*f_fff)(double, double, double);
+    float (*f_fma)(float, float, float, int);
+} fptr;
 
-typedef union dptr
-{
-    void            *p;
-    long double     (*f_f)(long double);
-    long double     (*f_u)(cl_ulong);
-    int             (*i_f)(long double);
-    long double     (*f_ff)(long double, long double);
-    int             (*i_ff)(long double, long double);
-    long double     (*f_fi)(long double, int);
-    long double     (*f_fpf)(long double, long double*);
-    long double     (*f_fpI)(long double, int*);
-    long double     (*f_ffpI)(long double, long double, int*);
-    long double     (*f_fff)(long double, long double, long double);
-}dptr;
+typedef union dptr {
+    void *p;
+    long double (*f_f)(long double);
+    long double (*f_u)(cl_ulong);
+    int (*i_f)(long double);
+    long double (*f_ff)(long double, long double);
+    int (*i_ff)(long double, long double);
+    long double (*f_fi)(long double, int);
+    long double (*f_fpf)(long double, long double *);
+    long double (*f_fpI)(long double, int *);
+    long double (*f_ffpI)(long double, long double, int *);
+    long double (*f_fff)(long double, long double, long double);
+} dptr;
 
 struct Func;
 
 typedef struct vtbl
 {
-    const char  *type_name;
+    const char *type_name;
     int (*TestFunc)(const struct Func *, MTdata, bool);
     int (*DoubleTestFunc)(
         const struct Func *, MTdata,
         bool); // may be NULL if function is single precision only
-}vtbl;
+} vtbl;
 
 typedef struct Func
 {
-  const char      *name;              // common name, to be used as an argument in the shell
-  const char      *nameInCode;        // name as it appears in the __kernel, usually the same as name, but different for multiplication
-  fptr            func;
-  dptr            dfunc;
-  fptr            rfunc;
-  float           float_ulps;
-  float           double_ulps;
-  float           float_embedded_ulps;
-  float           relaxed_error;
-  float relaxed_embedded_error;
-  int             ftz;
-  int             relaxed;
-  const vtbl      *vtbl_ptr;
-}Func;
+    const char *name; // common name, to be used as an argument in the shell
+    const char *nameInCode; // name as it appears in the __kernel, usually the
+                            // same as name, but different for multiplication
+    fptr func;
+    dptr dfunc;
+    fptr rfunc;
+    float float_ulps;
+    float double_ulps;
+    float float_embedded_ulps;
+    float relaxed_error;
+    float relaxed_embedded_error;
+    int ftz;
+    int relaxed;
+    const vtbl *vtbl_ptr;
+} Func;
 
 
-extern const Func  functionList[];
+extern const Func functionList[];
 
 extern const size_t functionListCount;
 
 #endif
-
-
diff --git a/test_conformance/math_brute_force/Sleep.cpp b/test_conformance/math_brute_force/Sleep.cpp
index 4d3b2c64..7103779e 100644
--- a/test_conformance/math_brute_force/Sleep.cpp
+++ b/test_conformance/math_brute_force/Sleep.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -16,103 +16,94 @@
 #include "Sleep.h"
 #include "Utility.h"
 
-#if defined( __APPLE__ )
-    #include <IOKit/pwr_mgt/IOPMLib.h>
-    #include <IOKit/IOMessage.h>
+#if defined(__APPLE__)
+#include <IOKit/pwr_mgt/IOPMLib.h>
+#include <IOKit/IOMessage.h>
 
-    struct
-    {
-        io_connect_t            connection;
-        IONotificationPortRef    port;
-        io_object_t                iterator;
-    }sleepInfo;
+struct
+{
+    io_connect_t connection;
+    IONotificationPortRef port;
+    io_object_t iterator;
+} sleepInfo;
 
-    void sleepCallback(    void *            refcon,
-                        io_service_t        service,
-                        natural_t        messageType,
-                        void *            messageArgument );
+void sleepCallback(void* refcon, io_service_t service, natural_t messageType,
+                   void* messageArgument);
 
-    void sleepCallback(    void *            refcon UNUSED,
-                        io_service_t        service UNUSED,
-                        natural_t        messageType,
-                        void *            messageArgument )
-    {
+void sleepCallback(void* refcon UNUSED, io_service_t service UNUSED,
+                   natural_t messageType, void* messageArgument)
+{
 
-        IOReturn result;
+    IOReturn result;
     /*
     service -- The IOService whose state has changed.
-    messageType -- A messageType enum, defined by IOKit/IOMessage.h or by the IOService's family.
-    messageArgument -- An argument for the message, dependent on the messageType.
+    messageType -- A messageType enum, defined by IOKit/IOMessage.h or by the
+    IOService's family.
+    messageArgument -- An argument for the message, dependent on messageType.
     */
-        switch ( messageType )
-        {
-            case kIOMessageSystemWillSleep:
-                // Handle demand sleep (such as sleep caused by running out of
-                // batteries, closing the lid of a laptop, or selecting
-                // sleep from the Apple menu.
-                IOAllowPowerChange(sleepInfo.connection,(long)messageArgument);
-                vlog( "Hard sleep occurred.\n" );
-                break;
-            case kIOMessageCanSystemSleep:
-                // In this case, the computer has been idle for several minutes
-                // and will sleep soon so you must either allow or cancel
-                // this notification. Important: if you don’t respond, there will
-                // be a 30-second timeout before the computer sleeps.
-                // IOCancelPowerChange(root_port,(long)messageArgument);
-                result = IOCancelPowerChange(sleepInfo.connection,(long)messageArgument);
-                if( kIOReturnSuccess != result )
-                    vlog( "sleep prevention failed. (%d)\n", result);
+    switch (messageType)
+    {
+        case kIOMessageSystemWillSleep:
+            // Handle demand sleep (such as sleep caused by running out of
+            // batteries, closing the lid of a laptop, or selecting
+            // sleep from the Apple menu).
+            IOAllowPowerChange(sleepInfo.connection, (long)messageArgument);
+            vlog("Hard sleep occurred.\n");
+            break;
+        case kIOMessageCanSystemSleep:
+            // In this case, the computer has been idle for several minutes
+            // and will sleep soon so you must either allow or cancel
+            // this notification. Important: if you don’t respond, there will
+            // be a 30-second timeout before the computer sleeps.
+            // IOCancelPowerChange(root_port,(long)messageArgument);
+            result = IOCancelPowerChange(sleepInfo.connection,
+                                         (long)messageArgument);
+            if (kIOReturnSuccess != result)
+                vlog("sleep prevention failed. (%d)\n", result);
+            break;
+        case kIOMessageSystemHasPoweredOn:
+            // Handle wakeup.
             break;
-            case kIOMessageSystemHasPoweredOn:
-                // Handle wakeup.
-                break;
-        }
     }
+}
 #endif
 
 
-
-
-
-void PreventSleep( void )
+void PreventSleep(void)
 {
-#if defined( __APPLE__ )
-    vlog( "Disabling sleep... " );
-    sleepInfo.iterator = (io_object_t) 0;
+#if defined(__APPLE__)
+    vlog("Disabling sleep... ");
+    sleepInfo.iterator = (io_object_t)0;
     sleepInfo.port = NULL;
-    sleepInfo.connection = IORegisterForSystemPower
-                            (
-                                &sleepInfo,                    //void * refcon,
-                                &sleepInfo.port,            //IONotificationPortRef * thePortRef,
-                                sleepCallback,                //IOServiceInterestCallback callback,
-                                &sleepInfo.iterator            //io_object_t * notifier
-                            );
+    sleepInfo.connection = IORegisterForSystemPower(
+        &sleepInfo, // void * refcon,
+        &sleepInfo.port, // IONotificationPortRef * thePortRef,
+        sleepCallback, // IOServiceInterestCallback callback,
+        &sleepInfo.iterator // io_object_t * notifier
+    );
 
-    if( (io_connect_t) 0 == sleepInfo.connection )
-        vlog( "failed.\n" );
+    if ((io_connect_t)0 == sleepInfo.connection)
+        vlog("failed.\n");
     else
-        vlog( "done.\n" );
+        vlog("done.\n");
 
     CFRunLoopAddSource(CFRunLoopGetCurrent(),
-                        IONotificationPortGetRunLoopSource(sleepInfo.port),
-                        kCFRunLoopDefaultMode);
+                       IONotificationPortGetRunLoopSource(sleepInfo.port),
+                       kCFRunLoopDefaultMode);
 #else
-    vlog( "*** PreventSleep() is not implemented on this platform.\n" );
+    vlog("*** PreventSleep() is not implemented on this platform.\n");
 #endif
 }
 
-void ResumeSleep( void )
+void ResumeSleep(void)
 {
-#if defined( __APPLE__ )
-    IOReturn result = IODeregisterForSystemPower ( &sleepInfo.iterator );
-    if( 0 != result )
-        vlog( "Got error %d restoring sleep \n", result );
+#if defined(__APPLE__)
+    IOReturn result = IODeregisterForSystemPower(&sleepInfo.iterator);
+    if (0 != result)
+        vlog("Got error %d restoring sleep \n", result);
     else
-        vlog( "Sleep restored.\n" );
+        vlog("Sleep restored.\n");
 #else
-    vlog( "*** ResumeSleep() is not implemented on this platform.\n" );
+    vlog("*** ResumeSleep() is not implemented on this platform.\n");
 #endif
 }
-
-
-
diff --git a/test_conformance/math_brute_force/Sleep.h b/test_conformance/math_brute_force/Sleep.h
index f983a32f..ca643954 100644
--- a/test_conformance/math_brute_force/Sleep.h
+++ b/test_conformance/math_brute_force/Sleep.h
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -16,9 +16,7 @@
 #ifndef SLEEP_H
 #define SLEEP_H
 
-void PreventSleep( void );
-void ResumeSleep( void );
+void PreventSleep(void);
+void ResumeSleep(void);
 
 #endif /* SLEEP_H */
-
-
diff --git a/test_conformance/math_brute_force/Utility.cpp b/test_conformance/math_brute_force/Utility.cpp
index 9ab7c7fa..3d8d9baa 100644
--- a/test_conformance/math_brute_force/Utility.cpp
+++ b/test_conformance/math_brute_force/Utility.cpp
@@ -17,9 +17,9 @@
 #include "FunctionList.h"
 
 #if defined(__PPC__)
-// Global varaiable used to hold the FPU control register state. The FPSCR register can not
-// be used because not all Power implementations retain or observed the NI (non-IEEE
-// mode) bit.
+// Global variable used to hold the FPU control register state. The FPSCR
+// register can not be used because not all Power implementations retain or
+// observe the NI (non-IEEE mode) bit.
 __thread fpu_control_t fpu_control = 0;
 #endif
 
@@ -28,16 +28,16 @@ void MulD(double *rhi, double *rlo, double u, double v)
     const double c = 134217729.0; // 1+2^27
     double up, u1, u2, vp, v1, v2;
 
-    up = u*c;
+    up = u * c;
     u1 = (u - up) + up;
     u2 = u - u1;
 
-    vp = v*c;
+    vp = v * c;
     v1 = (v - vp) + vp;
     v2 = v - v1;
 
-    double rh = u*v;
-    double rl = (((u1*v1 - rh) + (u1*v2)) + (u2*v1)) + (u2*v2);
+    double rh = u * v;
+    double rl = (((u1 * v1 - rh) + (u1 * v2)) + (u2 * v1)) + (u2 * v2);
 
     *rhi = rh;
     *rlo = rl;
@@ -47,11 +47,13 @@ void AddD(double *rhi, double *rlo, double a, double b)
 {
     double zhi, zlo;
     zhi = a + b;
-    if(fabs(a) > fabs(b)) {
+    if (fabs(a) > fabs(b))
+    {
         zlo = zhi - a;
         zlo = b - zlo;
     }
-    else {
+    else
+    {
         zlo = zhi - b;
         zlo = a - zlo;
     }
@@ -66,17 +68,17 @@ void MulDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl)
     double c = 134217729.0;
     double up, u1, u2, vp, v1, v2;
 
-    up = xh*c;
+    up = xh * c;
     u1 = (xh - up) + up;
     u2 = xh - u1;
 
-    vp = yh*c;
+    vp = yh * c;
     v1 = (yh - vp) + vp;
     v2 = yh - v1;
 
-    mh = xh*yh;
-    ml = (((u1*v1 - mh) + (u1*v2)) + (u2*v1)) + (u2*v2);
-    ml += xh*yl + xl*yh;
+    mh = xh * yh;
+    ml = (((u1 * v1 - mh) + (u1 * v2)) + (u2 * v1)) + (u2 * v2);
+    ml += xh * yl + xl * yh;
 
     *rhi = mh + ml;
     *rlo = (mh - (*rhi)) + ml;
@@ -86,7 +88,8 @@ void AddDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl)
 {
     double r, s;
     r = xh + yh;
-    s = (fabs(xh) > fabs(yh)) ? (xh - r + yh + yl + xl) : (yh - r + xh + xl + yl);
+    s = (fabs(xh) > fabs(yh)) ? (xh - r + yh + yl + xl)
+                              : (yh - r + xh + xl + yl);
     *rhi = r + s;
     *rlo = (r - (*rhi)) + s;
 }
@@ -100,72 +103,61 @@ void DivideDD(double *chi, double *clo, double a, double b)
     *clo = rhi / b;
 }
 
-// These functions comapre two floats/doubles. Since some platforms may choose to
-// flush denormals to zeros before comparison, comparison like a < b may give wrong
-// result in "certain cases" where we do need correct compasion result when operands
-// are denormals .... these functions comapre floats/doubles using signed integer/long int
-// rep. In other cases, when flushing to zeros is fine, these should not be used.
-// Also these doesn't check for nans and assume nans are handled separately as special edge case
-// by the caller which calls these functions
-// return 0 if both are equal, 1 if x > y and -1 if x < y.
+// These functions compare two floats/doubles. Since some platforms may choose
+// to flush denormals to zeros before comparison, comparison like a < b may give
+// wrong result in "certain cases" where we do need correct comparison result
+// when operands are denormals .... these functions compare floats/doubles using
+// signed integer/long int rep. In other cases, when flushing to zeros is fine,
+// these should not be used. Also these don't check for nans and assume nans
+// are handled separately as special edge case by the caller which calls these
+// functions. Return 0 if both are equal, 1 if x > y and -1 if x < y.
 
-inline
-int compareFloats(float x, float y)
+inline int compareFloats(float x, float y)
 {
     int32f_t a, b;
 
     a.f = x;
     b.f = y;
 
-    if( a.i & 0x80000000 )
-        a.i = 0x80000000 - a.i;
-    if( b.i & 0x80000000 )
-        b.i = 0x80000000 - b.i;
+    if (a.i & 0x80000000) a.i = 0x80000000 - a.i;
+    if (b.i & 0x80000000) b.i = 0x80000000 - b.i;
 
-    if( a.i == b.i )
-        return 0;
+    if (a.i == b.i) return 0;
 
     return a.i < b.i ? -1 : 1;
 }
 
-inline
-int compareDoubles(double x, double y)
+inline int compareDoubles(double x, double y)
 {
     int64d_t a, b;
 
     a.d = x;
     b.d = y;
 
-    if( a.l & 0x8000000000000000LL )
-        a.l = 0x8000000000000000LL - a.l;
-    if( b.l & 0x8000000000000000LL )
-        b.l = 0x8000000000000000LL - b.l;
+    if (a.l & 0x8000000000000000LL) a.l = 0x8000000000000000LL - a.l;
+    if (b.l & 0x8000000000000000LL) b.l = 0x8000000000000000LL - b.l;
 
-    if( a.l == b.l )
-        return 0;
+    if (a.l == b.l) return 0;
 
     return a.l < b.l ? -1 : 1;
 }
 
-void logFunctionInfo(const char *fname, unsigned int float_size, unsigned int isFastRelaxed)
+void logFunctionInfo(const char *fname, unsigned int float_size,
+                     unsigned int isFastRelaxed)
 {
     char const *fpSizeStr = NULL;
     char const *fpFastRelaxedStr = "";
-    switch (float_size) {
-    case sizeof(cl_double):
-        fpSizeStr = "fp64";
-        break;
-    case sizeof(cl_float):
-        fpSizeStr = "fp32";
-        break;
-    case sizeof(cl_half):
-        fpSizeStr = "fp16";
-        break;
+    switch (float_size)
+    {
+        case sizeof(cl_double): fpSizeStr = "fp64"; break;
+        case sizeof(cl_float): fpSizeStr = "fp32"; break;
+        case sizeof(cl_half): fpSizeStr = "fp16"; break;
     }
-    if (isFastRelaxed) {
+    if (isFastRelaxed)
+    {
         fpFastRelaxedStr = "rlx";
     }
-    vlog("%15s %4s %4s",fname, fpSizeStr, fpFastRelaxedStr);
+    vlog("%15s %4s %4s", fname, fpSizeStr, fpFastRelaxedStr);
 }
 
 float getAllowedUlpError(const Func *f, const bool relaxed)
diff --git a/test_conformance/math_brute_force/Utility.h b/test_conformance/math_brute_force/Utility.h
index 92f8f3dc..dd3c5e56 100644
--- a/test_conformance/math_brute_force/Utility.h
+++ b/test_conformance/math_brute_force/Utility.h
@@ -30,13 +30,13 @@
 #include "harness/ThreadPool.h"
 #include "harness/conversions.h"
 
-#define BUFFER_SIZE         (1024*1024*2)
+#define BUFFER_SIZE (1024 * 1024 * 2)
 #define EMBEDDED_REDUCTION_FACTOR (64)
 
-#if defined( __GNUC__ )
-    #define UNUSED  __attribute__ ((unused))
+#if defined(__GNUC__)
+#define UNUSED __attribute__((unused))
 #else
-    #define UNUSED
+#define UNUSED
 #endif
 
 struct Func;
@@ -44,62 +44,62 @@ struct Func;
 extern int gWimpyBufferSize;
 extern int gWimpyReductionFactor;
 
-#define VECTOR_SIZE_COUNT   6
+#define VECTOR_SIZE_COUNT 6
 extern const char *sizeNames[VECTOR_SIZE_COUNT];
-extern const int   sizeValues[VECTOR_SIZE_COUNT];
+extern const int sizeValues[VECTOR_SIZE_COUNT];
 
-extern cl_device_id     gDevice;
-extern cl_context       gContext;
+extern cl_device_id gDevice;
+extern cl_context gContext;
 extern cl_command_queue gQueue;
-extern void             *gIn;
-extern void             *gIn2;
-extern void             *gIn3;
-extern void             *gOut_Ref;
-extern void             *gOut_Ref2;
-extern void             *gOut[VECTOR_SIZE_COUNT];
-extern void             *gOut2[VECTOR_SIZE_COUNT];
-extern cl_mem           gInBuffer;
-extern cl_mem           gInBuffer2;
-extern cl_mem           gInBuffer3;
-extern cl_mem           gOutBuffer[VECTOR_SIZE_COUNT];
-extern cl_mem           gOutBuffer2[VECTOR_SIZE_COUNT];
-extern uint32_t         gComputeDevices;
-extern uint32_t         gSimdSize;
-extern int              gSkipCorrectnessTesting;
-extern int              gMeasureTimes;
-extern int              gReportAverageTimes;
-extern int              gForceFTZ;
-extern int              gFastRelaxedDerived;
-extern int              gWimpyMode;
-extern int              gHasDouble;
-extern int              gIsInRTZMode;
-extern int              gInfNanSupport;
-extern int              gIsEmbedded;
-extern int              gVerboseBruteForce;
-extern uint32_t         gMaxVectorSizeIndex;
-extern uint32_t         gMinVectorSizeIndex;
-extern uint32_t         gDeviceFrequency;
+extern void *gIn;
+extern void *gIn2;
+extern void *gIn3;
+extern void *gOut_Ref;
+extern void *gOut_Ref2;
+extern void *gOut[VECTOR_SIZE_COUNT];
+extern void *gOut2[VECTOR_SIZE_COUNT];
+extern cl_mem gInBuffer;
+extern cl_mem gInBuffer2;
+extern cl_mem gInBuffer3;
+extern cl_mem gOutBuffer[VECTOR_SIZE_COUNT];
+extern cl_mem gOutBuffer2[VECTOR_SIZE_COUNT];
+extern uint32_t gComputeDevices;
+extern uint32_t gSimdSize;
+extern int gSkipCorrectnessTesting;
+extern int gMeasureTimes;
+extern int gReportAverageTimes;
+extern int gForceFTZ;
+extern int gFastRelaxedDerived;
+extern int gWimpyMode;
+extern int gHasDouble;
+extern int gIsInRTZMode;
+extern int gInfNanSupport;
+extern int gIsEmbedded;
+extern int gVerboseBruteForce;
+extern uint32_t gMaxVectorSizeIndex;
+extern uint32_t gMinVectorSizeIndex;
+extern uint32_t gDeviceFrequency;
 extern cl_device_fp_config gFloatCapabilities;
 extern cl_device_fp_config gDoubleCapabilities;
 
-#define LOWER_IS_BETTER     0
-#define HIGHER_IS_BETTER    1
+#define LOWER_IS_BETTER 0
+#define HIGHER_IS_BETTER 1
 
 #include "harness/errorHelpers.h"
 
-#if defined (_MSC_VER )
-    //Deal with missing scalbn on windows
-    #define scalbnf( _a, _i )       ldexpf( _a, _i )
-    #define scalbn( _a, _i )        ldexp( _a, _i )
-    #define scalbnl( _a, _i )       ldexpl( _a, _i )
+#if defined(_MSC_VER)
+// Deal with missing scalbn on windows
+#define scalbnf(_a, _i) ldexpf(_a, _i)
+#define scalbn(_a, _i) ldexp(_a, _i)
+#define scalbnl(_a, _i) ldexpl(_a, _i)
 #endif
 
-float Abs_Error( float test, double reference );
-float Ulp_Error( float test, double reference );
-float Bruteforce_Ulp_Error_Double( double test, long double reference );
+float Abs_Error(float test, double reference);
+float Ulp_Error(float test, double reference);
+float Bruteforce_Ulp_Error_Double(double test, long double reference);
 
-uint64_t GetTime( void );
-double SubtractTime( uint64_t endTime, uint64_t startTime );
+uint64_t GetTime(void);
+double SubtractTime(uint64_t endTime, uint64_t startTime);
 int MakeKernel(const char **c, cl_uint count, const char *name, cl_kernel *k,
                cl_program *p, bool relaxedMode);
 int MakeKernels(const char **c, cl_uint count, const char *name,
@@ -107,69 +107,84 @@ int MakeKernels(const char **c, cl_uint count, const char *name,
                 bool relaxedMode);
 
 // used to convert a bucket of bits into a search pattern through double
-static inline double DoubleFromUInt32( uint32_t bits );
-static inline double DoubleFromUInt32( uint32_t bits )
+static inline double DoubleFromUInt32(uint32_t bits);
+static inline double DoubleFromUInt32(uint32_t bits)
 {
-    union{ uint64_t u; double d;} u;
+    union {
+        uint64_t u;
+        double d;
+    } u;
 
     // split 0x89abcdef to 0x89abc00000000def
     u.u = bits & 0xfffU;
-    u.u |= (uint64_t) (bits & ~0xfffU) << 32;
+    u.u |= (uint64_t)(bits & ~0xfffU) << 32;
 
-    // sign extend the leading bit of def segment as sign bit so that the middle region consists of either all 1s or 0s
+    // sign extend the leading bit of def segment as sign bit so that the middle
+    // region consists of either all 1s or 0s
     u.u -= (bits & 0x800U) << 1;
 
     // return result
     return u.d;
 }
 
-void _LogBuildError( cl_program p, int line, const char *file );
-#define LogBuildError( program )        _LogBuildError( program, __LINE__, __FILE__ )
+void _LogBuildError(cl_program p, int line, const char *file);
+#define LogBuildError(program) _LogBuildError(program, __LINE__, __FILE__)
 
 #define PERF_LOOP_COUNT 100
 
-//The spec is fairly clear that we may enforce a hard cutoff to prevent premature flushing to zero.
-// However, to avoid conflict for 1.0, we are letting results at TYPE_MIN + ulp_limit to be flushed to zero.
-static inline int IsFloatResultSubnormal( double x, float ulps )
+// The spec is fairly clear that we may enforce a hard cutoff to prevent
+// premature flushing to zero.
+// However, to avoid conflict for 1.0, we are letting results at TYPE_MIN +
+// ulp_limit to be flushed to zero.
+static inline int IsFloatResultSubnormal(double x, float ulps)
 {
-    x = fabs(x) - MAKE_HEX_DOUBLE( 0x1.0p-149, 0x1, -149) * (double) ulps;
-    return x < MAKE_HEX_DOUBLE( 0x1.0p-126, 0x1, -126 );
+    x = fabs(x) - MAKE_HEX_DOUBLE(0x1.0p-149, 0x1, -149) * (double)ulps;
+    return x < MAKE_HEX_DOUBLE(0x1.0p-126, 0x1, -126);
 }
 
-static inline int IsFloatResultSubnormalAbsError( double x , float abs_err)
+static inline int IsFloatResultSubnormalAbsError(double x, float abs_err)
 {
-  x = x - abs_err;
-  return x < MAKE_HEX_DOUBLE( 0x1.0p-126, 0x1, -126 );
+    x = x - abs_err;
+    return x < MAKE_HEX_DOUBLE(0x1.0p-126, 0x1, -126);
 }
 
-static inline int IsDoubleResultSubnormal( long double x, float ulps )
+static inline int IsDoubleResultSubnormal(long double x, float ulps)
 {
-    x = fabsl(x) - MAKE_HEX_LONG( 0x1.0p-1074, 0x1, -1074) * (long double) ulps;
-    return x < MAKE_HEX_LONG( 0x1.0p-1022, 0x1, -1022 );
+    x = fabsl(x) - MAKE_HEX_LONG(0x1.0p-1074, 0x1, -1074) * (long double)ulps;
+    return x < MAKE_HEX_LONG(0x1.0p-1022, 0x1, -1022);
 }
 
 static inline int IsFloatInfinity(double x)
 {
-  union { cl_float d; cl_uint u; } u;
-  u.d = (cl_float) x;
-  return ((u.u & 0x7fffffffU) == 0x7F800000U);
+    union {
+        cl_float d;
+        cl_uint u;
+    } u;
+    u.d = (cl_float)x;
+    return ((u.u & 0x7fffffffU) == 0x7F800000U);
 }
 
 static inline int IsFloatMaxFloat(double x)
 {
-  union { cl_float d; cl_uint u; } u;
-  u.d = (cl_float) x;
-  return ((u.u & 0x7fffffffU) == 0x7F7FFFFFU);
+    union {
+        cl_float d;
+        cl_uint u;
+    } u;
+    u.d = (cl_float)x;
+    return ((u.u & 0x7fffffffU) == 0x7F7FFFFFU);
 }
 
 static inline int IsFloatNaN(double x)
 {
-  union { cl_float d; cl_uint u; } u;
-  u.d = (cl_float) x;
-  return ((u.u & 0x7fffffffU) > 0x7F800000U);
+    union {
+        cl_float d;
+        cl_uint u;
+    } u;
+    u.d = (cl_float)x;
+    return ((u.u & 0x7fffffffU) > 0x7F800000U);
 }
 
-extern cl_uint RoundUpToNextPowerOfTwo( cl_uint x );
+extern cl_uint RoundUpToNextPowerOfTwo(cl_uint x);
 
 // Windows (since long double got deprecated) sets the x87 to 53-bit precision
 // (that's x87 default state).  This causes problems with the tests that
@@ -186,46 +201,50 @@ static inline void Force64BitFPUPrecision(void)
     // divergent code just use inline assembly which works for both.
     unsigned short int orig_cw = 0;
     unsigned short int new_cw = 0;
-    __asm__ __volatile__ ("fstcw %0":"=m" (orig_cw));
-    new_cw = orig_cw | 0x0300;   // set precision to 64-bit
-    __asm__ __volatile__ ("fldcw  %0"::"m" (new_cw));
-#elif defined( _WIN32 ) && defined( __INTEL_COMPILER )
-    // Unfortunately, usual method (`_controlfp( _PC_64, _MCW_PC );') does *not* work on win.x64:
-    // > On the x64 architecture, changing the floating point precision is not supported.
-    // (Taken from http://msdn.microsoft.com/en-us/library/e9b52ceh%28v=vs.100%29.aspx)
+    __asm__ __volatile__("fstcw %0" : "=m"(orig_cw));
+    new_cw = orig_cw | 0x0300; // set precision to 64-bit
+    __asm__ __volatile__("fldcw  %0" ::"m"(new_cw));
+#elif defined(_WIN32) && defined(__INTEL_COMPILER)
+    // Unfortunately, the usual method (`_controlfp( _PC_64, _MCW_PC );') does
+    // *not* work on win.x64. Quoting MSDN: "On the x64 architecture, changing
+    // the floating point precision is not supported." (Taken from
+    // http://msdn.microsoft.com/en-us/library/e9b52ceh%28v=vs.100%29.aspx)
     int cw;
-    __asm { fnstcw cw };    // Get current value of FPU control word.
-    cw = cw & 0xfffffcff | ( 3 << 8 ); // Set Precision Control to Double Extended Precision.
-    __asm { fldcw cw };     // Set new value of FPU control word.
+    __asm { fnstcw cw }
+    ; // Get current value of FPU control word.
+    cw = cw & 0xfffffcff
+        | (3 << 8); // Set Precision Control to Double Extended Precision.
+    __asm { fldcw cw }
+    ; // Set new value of FPU control word.
 #else
     /* Implement for other platforms if needed */
 #endif
 }
 
-extern
-void memset_pattern4(void *dest, const void *src_pattern, size_t bytes );
+extern void memset_pattern4(void *dest, const void *src_pattern, size_t bytes);
 
-typedef union
-{
+typedef union {
     int32_t i;
-    float   f;
-}int32f_t;
+    float f;
+} int32f_t;
 
-typedef union
-{
+typedef union {
     int64_t l;
-    double  d;
-}int64d_t;
+    double d;
+} int64d_t;
 
 void MulD(double *rhi, double *rlo, double u, double v);
 void AddD(double *rhi, double *rlo, double a, double b);
-void MulDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl);
-void AddDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl);
+void MulDD(double *rhi, double *rlo, double xh, double xl, double yh,
+           double yl);
+void AddDD(double *rhi, double *rlo, double xh, double xl, double yh,
+           double yl);
 void DivideDD(double *chi, double *clo, double a, double b);
 int compareFloats(float x, float y);
 int compareDoubles(double x, double y);
 
-void logFunctionInfo(const char *fname, unsigned int float_size, unsigned int isFastRelaxed);
+void logFunctionInfo(const char *fname, unsigned int float_size,
+                     unsigned int isFastRelaxed);
 
 float getAllowedUlpError(const Func *f, const bool relaxed);
 
diff --git a/test_conformance/math_brute_force/binary.cpp b/test_conformance/math_brute_force/binary.cpp
index 0b8be27b..db961c8d 100644
--- a/test_conformance/math_brute_force/binary.cpp
+++ b/test_conformance/math_brute_force/binary.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -46,63 +46,82 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
 {
-    const char *c[] = {     "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in1, __global float", sizeNames[vectorSize], "* in2 )\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in1[i], in2[i] );\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in1, __global float",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i] );\n"
+                        "}\n" };
 
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* in, __global float* in2)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       float3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
-                            "       f0 = ", name, "( f0, f1 );\n"
-                            "       vstore3( f0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       float3 f0, f1;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-                            "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       f0 = ", name, "( f0, f1 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = f0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = f0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global float* in, __global float* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
+        "       f0 = ",
+        name,
+        "( f0, f1 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0, f1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, f1 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -112,65 +131,84 @@ static int BuildKernelDouble(const char *name, int vectorSize,
                              cl_uint kernel_count, cl_kernel *k, cl_program *p,
                              bool relaxedMode)
 {
-    const char *c[] = {     "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in1, __global double", sizeNames[vectorSize], "* in2 )\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in1[i], in2[i] );\n"
-                            "}\n"
-                        };
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in1, __global double",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i] );\n"
+                        "}\n" };
 
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global double* in, __global double* in2)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       double3 d0 = vload3( 0, in + 3 * i );\n"
-                            "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
-                            "       d0 = ", name, "( d0, d1 );\n"
-                            "       vstore3( d0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       double3 d0, d1;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-                            "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       d0 = ", name, "( d0, d1 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = d0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = d0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* in, __global double* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
+        "       d0 = ",
+        name,
+        "( d0, d1 );\n"
+        "       vstore3( d0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0, d1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       d0 = ",
+        name,
+        "( d0, d1 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = d0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = d0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -178,115 +216,215 @@ static int BuildKernelDouble(const char *name, int vectorSize,
 
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
-    -NAN, -INFINITY, -FLT_MAX, MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),  MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), -1000.f, -100.f,  -4.0f, -3.5f,
-    -3.0f, MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), -2.5f, MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), -2.0f, MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), -1.5f, MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), -1.0f, MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), -0.5f, MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),  MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), -0.25f, MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), -FLT_MIN, MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
-    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), -0.0f,
+    -NAN,
+    -INFINITY,
+    -FLT_MAX,
+    MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
+    MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
+    MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
+    MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
+    MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
+    MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
+    MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
+    MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
+    MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
+    -1000.f,
+    -100.f,
+    -4.0f,
+    -3.5f,
+    -3.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
+    -2.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
+    -2.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
+    -1.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
+    -1.0f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
+    -0.5f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
+    -0.25f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
+    -FLT_MIN,
+    MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
+    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
+    MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
+    MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
+    MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
+    -0.0f,
 
-    +NAN, +INFINITY, +FLT_MAX, MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), +1000.f, +100.f, +4.0f, +3.5f,
-    +3.0f, MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), 2.5f, MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),+2.0f, MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), 1.5f, MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), +1.0f, MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), +0.5f, MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), +0.25f, MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), +FLT_MIN, MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
-    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), +0.0f
+    +NAN,
+    +INFINITY,
+    +FLT_MAX,
+    MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
+    MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
+    MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
+    MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
+    MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
+    MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
+    MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
+    MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
+    MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
+    +1000.f,
+    +100.f,
+    +4.0f,
+    +3.5f,
+    +3.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
+    2.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
+    +2.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
+    1.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
+    +1.0f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
+    +0.5f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
+    +0.25f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
+    +FLT_MIN,
+    MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
+    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
+    MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
+    MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
+    MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
+    +0.0f
 };
 
-static size_t specialValuesFloatCount = sizeof( specialValuesFloat ) / sizeof( specialValuesFloat[0] );
+static size_t specialValuesFloatCount =
+    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_uint     kernel_count;
-    cl_kernel   **kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count;
+    cl_kernel **kernels;
+    cl_program *programs;
+    const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernel_count,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
                              info->kernels[i], info->programs + i,
                              info->relaxedMode);
 }
 
-//Thread specific data for a worker thread
+// Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
-    cl_mem      inBuf;                              // input buffer for the thread
-    cl_mem      inBuf2;                             // input buffer for the thread
-    cl_mem      outBuf[ VECTOR_SIZE_COUNT ];        // output buffers for the thread
-    float       maxError;                           // max error value. Init to 0.
-    double      maxErrorValue;                      // position of the max error value (param 1).  Init to 0.
-    double      maxErrorValue2;                     // position of the max error value (param 2).  Init to 0.
-    MTdata      d;
-    cl_command_queue tQueue;                        // per thread command queue to improve performance
-}ThreadInfo;
+    cl_mem inBuf; // input buffer for the thread
+    cl_mem inBuf2; // input buffer for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    float maxError; // max error value. Init to 0.
+    double
+        maxErrorValue; // position of the max error value (param 1).  Init to 0.
+    double maxErrorValue2; // position of the max error value (param 2).  Init
+                           // to 0.
+    MTdata d;
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
 
 typedef struct TestInfo
 {
-    size_t      subBufferSize;                      // Size of the sub-buffer in elements
-    const Func  *f;                                 // A pointer to the function info
-    cl_program  programs[ VECTOR_SIZE_COUNT ];      // programs for various vector sizes
-    cl_kernel   *k[VECTOR_SIZE_COUNT ];             // arrays of thread-specific kernels for each worker thread:  k[vector_size][thread_id]
-    ThreadInfo  *tinfo;                             // An array of thread specific information for each worker thread
-    cl_uint     threadCount;                        // Number of worker threads
-    cl_uint     jobCount;                           // Number of jobs
-    cl_uint     step;                               // step between each chunk and the next.
-    cl_uint     scale;                              // stride between individual test values
-    float       ulps;                               // max_allowed ulps
-    int         ftz;                                // non-zero if running in flush to zero mode
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    float ulps; // max_allowed ulps
+    int ftz; // non-zero if running in flush to zero mode
 
-    int         isFDim;
-    int         skipNanInf;
-    int         isNextafter;
+    int isFDim;
+    int skipNanInf;
+    int isNextafter;
     bool relaxedMode; // True if test is running in relaxed mode, false
                       // otherwise.
 } TestInfo;
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p );
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestFunc_Float_Float_Float_common(const Func *f, MTdata d, int isNextafter,
                                       bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
-    float       maxError = 0.0f;
-    double      maxErrorVal = 0.0;
-    double      maxErrorVal2 = 0.0;
-    int         skipTestingRelaxed = 0;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
+    int skipTestingRelaxed = 0;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
 
-    if (gWimpyMode){
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    if (gWimpyMode)
+    {
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_float)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -296,62 +434,83 @@ int TestFunc_Float_Float_Float_common(const Func *f, MTdata d, int isNextafter,
 
     test_info.f = f;
     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
-    test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
 
-    test_info.isFDim = 0 == strcmp( "fdim", f->nameInCode );
-    test_info.skipNanInf = test_info.isFDim  && ! gInfNanSupport;
+    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
+    test_info.skipNanInf = test_info.isFDim && !gInfNanSupport;
     test_info.isNextafter = isNextafter;
     test_info.relaxedMode = relaxedMode;
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
-        test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf2 )
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer2 for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gOutBuffer[%d] for region {%zd, %zd}\n", (int) j, region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zd, %zd}\n",
+                           (int)j, region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
 
@@ -364,19 +523,21 @@ int TestFunc_Float_Float_Float_common(const Func *f, MTdata d, int isNextafter,
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
     // Run the kernels
-    if( !gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info );
+        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
-            if( test_info.tinfo[i].maxError > maxError )
+            if (test_info.tinfo[i].maxError > maxError)
             {
                 maxError = test_info.tinfo[i].maxError;
                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
@@ -384,176 +545,200 @@ int TestFunc_Float_Float_Float_common(const Func *f, MTdata d, int isNextafter,
             }
         }
 
-        if( error )
-            goto exit;
+        if (error) goto exit;
 
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input arrays
+        // Init input arrays
         uint32_t *p = (uint32_t *)gIn;
         uint32_t *p2 = (uint32_t *)gIn2;
-        for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ )
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             p[j] = (genrand_int32(d) & ~0x40000000) | 0x20000000;
             p2[j] = 0x3fc00000;
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;    // BUFFER_SIZE / vectorSize  rounded up
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    vlog("\n");
 
 
 exit:
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
-            free_mtdata( test_info.tinfo[i].d );
+            free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data  )
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t      buffer_elements = job->subBufferSize;
-    size_t      buffer_size = buffer_elements * sizeof( cl_float );
-    cl_uint     base = job_id * (cl_uint) job->step;
-    ThreadInfo  *tinfo = job->tinfo + thread_id;
-    fptr        func = job->f->func;
-    int         ftz = job->ftz;
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_float);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    fptr func = job->f->func;
+    int ftz = job->ftz;
     bool relaxedMode = job->relaxedMode;
     float ulps = getAllowedUlpError(job->f, relaxedMode);
-    MTdata      d = tinfo->d;
-    cl_uint     j, k;
-    cl_int      error;
-    cl_uchar    *overflow = (cl_uchar*)malloc(buffer_size);
-    const char  *name = job->f->name;
-    int         isFDim = job->isFDim;
-    int         skipNanInf = job->skipNanInf;
-    int         isNextafter = job->isNextafter;
-    cl_uint     *t = 0;
-    float       *r=0,*s=0,*s2=0;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    cl_uchar *overflow = (cl_uchar *)malloc(buffer_size);
+    const char *name = job->f->name;
+    int isFDim = job->isFDim;
+    int skipNanInf = job->skipNanInf;
+    int isNextafter = job->isNextafter;
+    cl_uint *t = 0;
+    float *r = 0, *s = 0, *s2 = 0;
     cl_int copysign_test = 0;
     RoundingMode oldRoundMode;
     int skipVerification = 0;
 
     if (relaxedMode)
     {
-      if (strcmp(name,"pow")==0 && gFastRelaxedDerived)
-      {
-        func = job->f->rfunc;
-        ulps = INFINITY;
-        skipVerification = 1;
-      }else
-      {
-        func = job->f->rfunc;
-      }
+        if (strcmp(name, "pow") == 0 && gFastRelaxedDerived)
+        {
+            func = job->f->rfunc;
+            ulps = INFINITY;
+            skipVerification = 1;
+        }
+        else
+        {
+            func = job->f->rfunc;
+        }
     }
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_uint  *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_uint *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_uint*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (cl_uint *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
-    //Init input array
+    // Init input array
     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
     j = 0;
 
-    int totalSpecialValueCount = specialValuesFloatCount * specialValuesFloatCount;
+    int totalSpecialValueCount =
+        specialValuesFloatCount * specialValuesFloatCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
     if (job_id <= (cl_uint)indx)
@@ -562,91 +747,111 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data  )
         float *fp2 = (float *)p2;
         uint32_t x, y;
 
-    x = (job_id * buffer_elements) % specialValuesFloatCount;
-    y = (job_id * buffer_elements) / specialValuesFloatCount;
+        x = (job_id * buffer_elements) % specialValuesFloatCount;
+        y = (job_id * buffer_elements) / specialValuesFloatCount;
 
-        for( ; j < buffer_elements; j++ )
+        for (; j < buffer_elements; j++)
         {
             fp[j] = specialValuesFloat[x];
             fp2[j] = specialValuesFloat[y];
-            if( ++x >= specialValuesFloatCount )
+            if (++x >= specialValuesFloatCount)
             {
                 x = 0;
                 y++;
-                if( y >= specialValuesFloatCount )
-                    break;
+                if (y >= specialValuesFloatCount) break;
             }
         }
     }
 
-    //Init any remaining values.
-    for( ; j < buffer_elements; j++ )
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
     {
         p[j] = genrand_int32(d);
         p2[j] = genrand_int32(d);
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             goto exit;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             goto exit;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             goto exit;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; }
-
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
             goto exit;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
+    if (gSkipCorrectnessTesting)
     {
-        if( (error = clFinish(tinfo->tQueue)) )
+        if ((error = clFinish(tinfo->tQueue)))
         {
-          vlog_error( "Error: clFinish failed! err: %d\n", error );
-          goto exit;
+            vlog_error("Error: clFinish failed! err: %d\n", error);
+            goto exit;
         }
         free(overflow);
         return CL_SUCCESS;
@@ -654,105 +859,111 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data  )
 
     FPU_mode_type oldMode;
     oldRoundMode = kRoundToNearestEven;
-    if( isFDim )
+    if (isFDim)
     {
-        //Calculate the correctly rounded reference result
-        memset( &oldMode, 0, sizeof( oldMode ) );
-        if( ftz )
-            ForceFTZ( &oldMode );
+        // Calculate the correctly rounded reference result
+        memset(&oldMode, 0, sizeof(oldMode));
+        if (ftz) ForceFTZ(&oldMode);
 
         // Set the rounding mode to match the device
-        if (gIsInRTZMode)
-            oldRoundMode = set_round(kRoundTowardZero, kfloat);
+        if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
     }
 
-    if(!strcmp(name, "copysign"))
-        copysign_test = 1;
+    if (!strcmp(name, "copysign")) copysign_test = 1;
 
-#define ref_func(s, s2) (copysign_test ? func.f_ff_f( s, s2 ) : func.f_ff( s, s2 ))
+#define ref_func(s, s2) (copysign_test ? func.f_ff_f(s, s2) : func.f_ff(s, s2))
 
-    //Calculate the correctly rounded reference result
-    r = (float *)gOut_Ref  + thread_id * buffer_elements;
-    s = (float *)gIn  + thread_id * buffer_elements;
-    s2 = (float *)gIn2  + thread_id * buffer_elements;
-    if( skipNanInf )
+    // Calculate the correctly rounded reference result
+    r = (float *)gOut_Ref + thread_id * buffer_elements;
+    s = (float *)gIn + thread_id * buffer_elements;
+    s2 = (float *)gIn2 + thread_id * buffer_elements;
+    if (skipNanInf)
     {
-        for( j = 0; j < buffer_elements; j++ )
+        for (j = 0; j < buffer_elements; j++)
         {
             feclearexcept(FE_OVERFLOW);
-            r[j] = (float) ref_func( s[j], s2[j] );
-            overflow[j] = FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
+            r[j] = (float)ref_func(s[j], s2[j]);
+            overflow[j] =
+                FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
         }
     }
     else
     {
-        for( j = 0; j < buffer_elements; j++ )
-            r[j] = (float) ref_func( s[j], s2[j] );
+        for (j = 0; j < buffer_elements; j++)
+            r[j] = (float)ref_func(s[j], s2[j]);
     }
 
-    if( isFDim && ftz )
-        RestoreFPState( &oldMode );
+    if (isFDim && ftz) RestoreFPState(&oldMode);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_uint*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (cl_uint *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             goto exit;
         }
     }
 
     // Wait for the last buffer
-    out[j] = (cl_uint*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (cl_uint *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
+                                           0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         goto exit;
     }
 
-    if (!skipVerification) {
-        //Verify data
+    if (!skipVerification)
+    {
+        // Verify data
         t = (cl_uint *)r;
-        for( j = 0; j < buffer_elements; j++ )
+        for (j = 0; j < buffer_elements; j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 cl_uint *q = out[k];
 
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] )
+                if (t[j] != q[j])
                 {
-                    float test = ((float*) q)[j];
-                    double correct = ref_func( s[j], s2[j] );
+                    float test = ((float *)q)[j];
+                    double correct = ref_func(s[j], s2[j]);
 
-                    // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                    // As per OpenCL 2.0 spec, section 5.8.4.3, enabling fast-relaxed-math mode also enables
-                    // -cl-finite-math-only optimization. This optimization allows to assume that arguments and
-                    // results are not NaNs or +/-INFs. Hence, accept any result if inputs or results are NaNs or INFs.
+                    // Per section 10 paragraph 6, accept any result if an input
+                    // or output is an infinity or NaN or overflow. As per
+                    // OpenCL 2.0 spec, section 5.8.4.3, enabling
+                    // fast-relaxed-math mode also enables -cl-finite-math-only
+                    // optimization. This optimization allows to assume that
+                    // arguments and results are not NaNs or +/-INFs. Hence,
+                    // accept any result if inputs or results are NaNs or INFs.
                     if (relaxedMode || skipNanInf)
                     {
-                        if( skipNanInf && overflow[j])
-                            continue;
-                        // Note: no double rounding here.  Reference functions calculate in single precision.
-                        if( IsFloatInfinity(correct) || IsFloatNaN(correct)     ||
-                            IsFloatInfinity(s2[j])   || IsFloatNaN(s2[j])       ||
-                            IsFloatInfinity(s[j])    || IsFloatNaN(s[j])        )
+                        if (skipNanInf && overflow[j]) continue;
+                        // Note: no double rounding here.  Reference functions
+                        // calculate in single precision.
+                        if (IsFloatInfinity(correct) || IsFloatNaN(correct)
+                            || IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j])
+                            || IsFloatInfinity(s[j]) || IsFloatNaN(s[j]))
                             continue;
                     }
 
-                    float err = Ulp_Error( test, correct );
-                    int fail = ! (fabsf(err) <= ulps);
+                    float err = Ulp_Error(test, correct);
+                    int fail = !(fabsf(err) <= ulps);
 
-                    if( fail && ftz )
+                    if (fail && ftz)
                     {
                         // retry per section 6.5.3.2
-                        if( IsFloatResultSubnormal(correct, ulps ) )
+                        if (IsFloatResultSubnormal(correct, ulps))
                         {
-                            fail = fail && ( test != 0.0f );
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
                         }
 
                         // nextafter on FTZ platforms may return the smallest
@@ -765,171 +976,203 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data  )
                         // normal number is the next representable number.
                         // In which case, it should have the same sign as the
                         // second argument.
-                        if (isNextafter )
+                        if (isNextafter)
                         {
-                            if(IsFloatSubnormal(s[j]) || s[j] == 0.0f)
+                            if (IsFloatSubnormal(s[j]) || s[j] == 0.0f)
                             {
                                 float value = copysignf(twoToMinus126, s2[j]);
                                 fail = fail && (test != value);
-                                if (!fail)
-                                    err = 0.0f;
+                                if (!fail) err = 0.0f;
                             }
                         }
                         else
                         {
                             // retry per section 6.5.3.3
-                            if( IsFloatSubnormal( s[j] ) )
+                            if (IsFloatSubnormal(s[j]))
                             {
                                 double correct2, correct3;
                                 float err2, err3;
 
-                                if( skipNanInf )
-                                    feclearexcept(FE_OVERFLOW);
+                                if (skipNanInf) feclearexcept(FE_OVERFLOW);
 
-                                correct2 = ref_func( 0.0, s2[j] );
-                                correct3 = ref_func( -0.0, s2[j] );
+                                correct2 = ref_func(0.0, s2[j]);
+                                correct3 = ref_func(-0.0, s2[j]);
 
-                                // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                                // As per OpenCL 2.0 spec, section 5.8.4.3, enabling fast-relaxed-math mode also enables
-                                // -cl-finite-math-only optimization. This optimization allows to assume that arguments and
-                                // results are not NaNs or +/-INFs. Hence, accept any result if inputs or results are NaNs or INFs.
+                                // Per section 10 paragraph 6, accept any result
+                                // if an input or output is an infinity or NaN
+                                // or overflow. As per OpenCL 2.0 spec,
+                                // section 5.8.4.3, enabling fast-relaxed-math
+                                // mode also enables -cl-finite-math-only
+                                // optimization. This optimization may
+                                // assume that arguments and results are not
+                                // NaNs or +/-INFs. Hence, accept any result if
+                                // inputs or results are NaNs or INFs.
                                 if (relaxedMode || skipNanInf)
                                 {
-                                    if( fetestexcept(FE_OVERFLOW) && skipNanInf )
+                                    if (fetestexcept(FE_OVERFLOW) && skipNanInf)
                                         continue;
 
-                                    // Note: no double rounding here.  Reference functions calculate in single precision.
-                                    if( IsFloatInfinity(correct2) || IsFloatNaN(correct2)   ||
-                                        IsFloatInfinity(correct3) || IsFloatNaN(correct3)    )
+                                    // Note: no double rounding here.  Reference
+                                    // functions calculate in single precision.
+                                    if (IsFloatInfinity(correct2)
+                                        || IsFloatNaN(correct2)
+                                        || IsFloatInfinity(correct3)
+                                        || IsFloatNaN(correct3))
                                         continue;
                                 }
 
-                                err2 = Ulp_Error( test, correct2  );
-                                err3 = Ulp_Error( test, correct3  );
-                                fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
+                                err2 = Ulp_Error(test, correct2);
+                                err3 = Ulp_Error(test, correct3);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= ulps))
+                                        && (!(fabsf(err3) <= ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
 
                                 // retry per section 6.5.3.4
-                                if( IsFloatResultSubnormal( correct2, ulps ) || IsFloatResultSubnormal( correct3, ulps ) )
+                                if (IsFloatResultSubnormal(correct2, ulps)
+                                    || IsFloatResultSubnormal(correct3, ulps))
                                 {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
+                                    fail = fail && (test != 0.0f);
+                                    if (!fail) err = 0.0f;
                                 }
 
-                                //try with both args as zero
-                                if( IsFloatSubnormal( s2[j] )  )
+                                // try with both args as zero
+                                if (IsFloatSubnormal(s2[j]))
                                 {
                                     double correct4, correct5;
                                     float err4, err5;
 
-                                    if( skipNanInf )
-                                        feclearexcept(FE_OVERFLOW);
+                                    if (skipNanInf) feclearexcept(FE_OVERFLOW);
 
-                                    correct2 = ref_func( 0.0, 0.0 );
-                                    correct3 = ref_func( -0.0, 0.0 );
-                                    correct4 = ref_func( 0.0, -0.0 );
-                                    correct5 = ref_func( -0.0, -0.0 );
+                                    correct2 = ref_func(0.0, 0.0);
+                                    correct3 = ref_func(-0.0, 0.0);
+                                    correct4 = ref_func(0.0, -0.0);
+                                    correct5 = ref_func(-0.0, -0.0);
 
-                                    // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                                    // As per OpenCL 2.0 spec, section 5.8.4.3, enabling fast-relaxed-math mode also enables
-                                    // -cl-finite-math-only optimization. This optimization allows to assume that arguments and
-                                    // results are not NaNs or +/-INFs. Hence, accept any result if inputs or results are NaNs or INFs.
+                                    // Per section 10 paragraph 6, accept any
+                                    // result if an input or output is an
+                                    // infinity or NaN or overflow. As per
+                                    // OpenCL 2.0 spec, section 5.8.4.3,
+                                    // enabling fast-relaxed-math mode also
+                                    // enables -cl-finite-math-only
+                                    // optimization. This optimization may
+                                    // assume that arguments and results are not
+                                    // NaNs or +/-INFs. Hence, accept any result
+                                    // if inputs or results are NaNs or INFs.
                                     if (relaxedMode || skipNanInf)
                                     {
-                                        if( fetestexcept(FE_OVERFLOW) && skipNanInf )
+                                        if (fetestexcept(FE_OVERFLOW)
+                                            && skipNanInf)
                                             continue;
 
-                                        // Note: no double rounding here.  Reference functions calculate in single precision.
-                                        if( IsFloatInfinity(correct2) || IsFloatNaN(correct2)   ||
-                                            IsFloatInfinity(correct3) || IsFloatNaN(correct3)   ||
-                                            IsFloatInfinity(correct4) || IsFloatNaN(correct4)   ||
-                                            IsFloatInfinity(correct5) || IsFloatNaN(correct5)    )
+                                        // Note: no double rounding here.
+                                        // Reference functions calculate in
+                                        // single precision.
+                                        if (IsFloatInfinity(correct2)
+                                            || IsFloatNaN(correct2)
+                                            || IsFloatInfinity(correct3)
+                                            || IsFloatNaN(correct3)
+                                            || IsFloatInfinity(correct4)
+                                            || IsFloatNaN(correct4)
+                                            || IsFloatInfinity(correct5)
+                                            || IsFloatNaN(correct5))
                                             continue;
                                     }
 
-                                    err2 = Ulp_Error( test, correct2  );
-                                    err3 = Ulp_Error( test, correct3  );
-                                    err4 = Ulp_Error( test, correct4  );
-                                    err5 = Ulp_Error( test, correct5  );
-                                    fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)) &&
-                                                     (!(fabsf(err4) <= ulps)) && (!(fabsf(err5) <= ulps)));
-                                    if( fabsf( err2 ) < fabsf(err ) )
-                                        err = err2;
-                                    if( fabsf( err3 ) < fabsf(err ) )
-                                        err = err3;
-                                    if( fabsf( err4 ) < fabsf(err ) )
-                                        err = err4;
-                                    if( fabsf( err5 ) < fabsf(err ) )
-                                        err = err5;
+                                    err2 = Ulp_Error(test, correct2);
+                                    err3 = Ulp_Error(test, correct3);
+                                    err4 = Ulp_Error(test, correct4);
+                                    err5 = Ulp_Error(test, correct5);
+                                    fail = fail
+                                        && ((!(fabsf(err2) <= ulps))
+                                            && (!(fabsf(err3) <= ulps))
+                                            && (!(fabsf(err4) <= ulps))
+                                            && (!(fabsf(err5) <= ulps)));
+                                    if (fabsf(err2) < fabsf(err)) err = err2;
+                                    if (fabsf(err3) < fabsf(err)) err = err3;
+                                    if (fabsf(err4) < fabsf(err)) err = err4;
+                                    if (fabsf(err5) < fabsf(err)) err = err5;
 
                                     // retry per section 6.5.3.4
-                                    if( IsFloatResultSubnormal( correct2, ulps ) || IsFloatResultSubnormal( correct3, ulps ) ||
-                                        IsFloatResultSubnormal( correct4, ulps ) || IsFloatResultSubnormal( correct5, ulps ) )
+                                    if (IsFloatResultSubnormal(correct2, ulps)
+                                        || IsFloatResultSubnormal(correct3,
+                                                                  ulps)
+                                        || IsFloatResultSubnormal(correct4,
+                                                                  ulps)
+                                        || IsFloatResultSubnormal(correct5,
+                                                                  ulps))
                                     {
-                                        fail = fail && ( test != 0.0f);
-                                        if( ! fail )
-                                            err = 0.0f;
+                                        fail = fail && (test != 0.0f);
+                                        if (!fail) err = 0.0f;
                                     }
                                 }
                             }
-                            else if(IsFloatSubnormal(s2[j]) )
+                            else if (IsFloatSubnormal(s2[j]))
                             {
                                 double correct2, correct3;
                                 float err2, err3;
 
-                                if( skipNanInf )
-                                    feclearexcept(FE_OVERFLOW);
+                                if (skipNanInf) feclearexcept(FE_OVERFLOW);
 
-                                correct2 = ref_func( s[j], 0.0 );
-                                correct3 = ref_func( s[j], -0.0 );
+                                correct2 = ref_func(s[j], 0.0);
+                                correct3 = ref_func(s[j], -0.0);
 
-                                // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                                // As per OpenCL 2.0 spec, section 5.8.4.3, enabling fast-relaxed-math mode also enables
-                                // -cl-finite-math-only optimization. This optimization allows to assume that arguments and
-                                // results are not NaNs or +/-INFs. Hence, accept any result if inputs or results are NaNs or INFs.
+                                // Per section 10 paragraph 6, accept any result
+                                // if an input or output is an infinity or NaN
+                                // or overflow. As per OpenCL 2.0 spec,
+                                // section 5.8.4.3, enabling fast-relaxed-math
+                                // mode also enables -cl-finite-math-only
+                                // optimization. This optimization may
+                                // assume that arguments and results are not
+                                // NaNs or +/-INFs. Hence, accept any result if
+                                // inputs or results are NaNs or INFs.
                                 if (relaxedMode || skipNanInf)
                                 {
-                                    // Note: no double rounding here.  Reference functions calculate in single precision.
-                                    if( overflow[j] && skipNanInf)
-                                        continue;
+                                    // Note: no double rounding here.  Reference
+                                    // functions calculate in single precision.
+                                    if (overflow[j] && skipNanInf) continue;
 
-                                    if( IsFloatInfinity(correct2) || IsFloatNaN(correct2)   ||
-                                        IsFloatInfinity(correct3) || IsFloatNaN(correct3)    )
+                                    if (IsFloatInfinity(correct2)
+                                        || IsFloatNaN(correct2)
+                                        || IsFloatInfinity(correct3)
+                                        || IsFloatNaN(correct3))
                                         continue;
                                 }
 
-                                err2 = Ulp_Error( test, correct2  );
-                                err3 = Ulp_Error( test, correct3  );
-                                fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
+                                err2 = Ulp_Error(test, correct2);
+                                err3 = Ulp_Error(test, correct3);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= ulps))
+                                        && (!(fabsf(err3) <= ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
 
                                 // retry per section 6.5.3.4
-                                if( IsFloatResultSubnormal( correct2, ulps ) || IsFloatResultSubnormal( correct3, ulps ) )
+                                if (IsFloatResultSubnormal(correct2, ulps)
+                                    || IsFloatResultSubnormal(correct3, ulps))
                                 {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
+                                    fail = fail && (test != 0.0f);
+                                    if (!fail) err = 0.0f;
                                 }
                             }
                         }
                     }
 
-                    if( fabsf(err ) > tinfo->maxError )
+                    if (fabsf(err) > tinfo->maxError)
                     {
                         tinfo->maxError = fabsf(err);
                         tinfo->maxErrorValue = s[j];
                         tinfo->maxErrorValue2 = s2[j];
                     }
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\nERROR: %s%s: %f ulp error at {%a (0x%x), %a (0x%x)}: *%a vs. %a (0x%8.8x) at index: %d\n", name, sizeNames[k], err, s[j], ((cl_uint*)s)[j], s2[j], ((cl_uint*)s2)[j], r[j], test, ((cl_uint*)&test)[0], j );
+                        vlog_error(
+                            "\nERROR: %s%s: %f ulp error at {%a (0x%x), %a "
+                            "(0x%x)}: *%a vs. %a (0x%8.8x) at index: %d\n",
+                            name, sizeNames[k], err, s[j], ((cl_uint *)s)[j],
+                            s2[j], ((cl_uint *)s2)[j], r[j], test,
+                            ((cl_uint *)&test)[0], j);
                         error = -1;
                         goto exit;
                     }
@@ -938,93 +1181,192 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data  )
         }
     }
 
-    if (isFDim && gIsInRTZMode)
-        (void)set_round(oldRoundMode, kfloat);
+    if (isFDim && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
         if (gVerboseBruteForce)
         {
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements,  job->ulps, job->threadCount);
-        } else
+            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
         {
-            vlog("." );
+            vlog(".");
         }
         fflush(stdout);
     }
 
 
 exit:
-    if( overflow )
-        free( overflow );
+    if (overflow) free(overflow);
     return error;
-
 }
 
 
 // A table of more difficult cases to get right
 static const double specialValuesDouble[] = {
-    -NAN, -INFINITY, -DBL_MAX, MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),  MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), -1000., -100.,  -4.0, -3.5,
-    -3.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), -2.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), -2.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), -1.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), -1.0, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), -0.5, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),  MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), -0.25, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), -DBL_MIN, MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), -0.0,
+    -NAN,
+    -INFINITY,
+    -DBL_MAX,
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
+    -1000.,
+    -100.,
+    -4.0,
+    -3.5,
+    -3.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
+    -2.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
+    -2.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
+    -1.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
+    -1.0,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
+    -0.5,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
+    -0.25,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
+    -DBL_MIN,
+    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
+    -0.0,
 
-    +NAN, +INFINITY, +DBL_MAX, MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),  MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8), MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21), MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22), +1000., +100.,  +4.0, +3.5,
-    +3.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), +2.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), +2.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), +1.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), +1.0, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53), +0.5, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),  MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), +0.25, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), +DBL_MIN, MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), +0.0,
+    +NAN,
+    +INFINITY,
+    +DBL_MAX,
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
+    +1000.,
+    +100.,
+    +4.0,
+    +3.5,
+    +3.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
+    +2.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
+    +2.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
+    +1.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p0, +0x10000000000001LL, -52),
+    +1.0,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
+    +0.5,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
+    +0.25,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
+    +DBL_MIN,
+    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
+    +0.0,
 };
 
-static size_t specialValuesDoubleCount = sizeof( specialValuesDouble ) / sizeof( specialValuesDouble[0] );
+static size_t specialValuesDoubleCount =
+    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
 
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *p );
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
                                          int isNextafter, bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
-    float       maxError = 0.0f;
-    double      maxErrorVal = 0.0;
-    double      maxErrorVal2 = 0.0;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
 
 
-    if (gWimpyMode){
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    if (gWimpyMode)
+    {
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_double)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -1036,59 +1378,79 @@ int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
     test_info.ulps = f->double_ulps;
     test_info.ftz = f->ftz || gForceFTZ;
 
-    test_info.isFDim = 0 == strcmp( "fdim", f->nameInCode );
+    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
     test_info.skipNanInf = 0;
     test_info.isNextafter = isNextafter;
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
-        test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
         test_info.tinfo[i].d = init_genrand(genrand_int32(d));
@@ -1101,18 +1463,20 @@ int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
-    if( !gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info );
+        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
-            if( test_info.tinfo[i].maxError > maxError )
+            if (test_info.tinfo[i].maxError > maxError)
             {
                 maxError = test_info.tinfo[i].maxError;
                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
@@ -1120,300 +1484,346 @@ int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
             }
         }
 
-        if( error )
-            goto exit;
+        if (error) goto exit;
 
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input arrays
+        // Init input arrays
         double *p = (double *)gIn;
         double *p2 = (double *)gIn2;
-        for( j = 0; j < BUFFER_SIZE / sizeof( cl_double ); j++ )
+        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
         {
             p[j] = DoubleFromUInt32(genrand_int32(d));
             p2[j] = DoubleFromUInt32(genrand_int32(d));
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
+            goto exit;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
+            goto exit;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_double ) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;    // BUFFER_SIZE / vectorSize  rounded up
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    if( ! gSkipCorrectnessTesting )
-       vlog( "\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    vlog("\n");
 
 
 exit:
     // Release
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
-            free_mtdata( test_info.tinfo[i].d );
+            free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
 
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t      buffer_elements = job->subBufferSize;
-    size_t      buffer_size = buffer_elements * sizeof( cl_double );
-    cl_uint     base = job_id * (cl_uint) job->step;
-    ThreadInfo  *tinfo = job->tinfo + thread_id;
-    float       ulps = job->ulps;
-    dptr        func = job->f->dfunc;
-    int         ftz = job->ftz;
-    MTdata      d = tinfo->d;
-    cl_uint     j, k;
-    cl_int      error;
-    const char  *name = job->f->name;
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_double);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    float ulps = job->ulps;
+    dptr func = job->f->dfunc;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
 
-    int         isNextafter = job->isNextafter;
-    cl_ulong    *t;
-    cl_double   *r,*s,*s2;
+    int isNextafter = job->isNextafter;
+    cl_ulong *t;
+    cl_double *r, *s, *s2;
 
     Force64BitFPUPrecision();
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_ulong  *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_ulong *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
-    //Init input array
+    // Init input array
     cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
     cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
     j = 0;
-    int totalSpecialValueCount = specialValuesDoubleCount * specialValuesDoubleCount;
+    int totalSpecialValueCount =
+        specialValuesDoubleCount * specialValuesDoubleCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
-    if( job_id <= (cl_uint)indx )
+    if (job_id <= (cl_uint)indx)
     { // test edge cases
         cl_double *fp = (cl_double *)p;
         cl_double *fp2 = (cl_double *)p2;
         uint32_t x, y;
 
-    x = (job_id * buffer_elements) % specialValuesDoubleCount;
-    y = (job_id * buffer_elements) / specialValuesDoubleCount;
+        x = (job_id * buffer_elements) % specialValuesDoubleCount;
+        y = (job_id * buffer_elements) / specialValuesDoubleCount;
 
-        for( ; j < buffer_elements; j++ )
+        for (; j < buffer_elements; j++)
         {
             fp[j] = specialValuesDouble[x];
             fp2[j] = specialValuesDouble[y];
-            if( ++x >= specialValuesDoubleCount )
+            if (++x >= specialValuesDoubleCount)
             {
                 x = 0;
                 y++;
-                if( y >= specialValuesDoubleCount )
-                    break;
+                if (y >= specialValuesDoubleCount) break;
             }
         }
     }
 
-    //Init any remaining values.
-    for( ; j < buffer_elements; j++ )
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
     {
         p[j] = genrand_int64(d);
         p2[j] = genrand_int64(d);
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             goto exit;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             goto exit;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             goto exit;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; }
-
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
             goto exit;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
-        return CL_SUCCESS;
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
 
-    //Calculate the correctly rounded reference result
-    r = (cl_double *)gOut_Ref  + thread_id * buffer_elements;
-    s = (cl_double *)gIn  + thread_id * buffer_elements;
-    s2 = (cl_double *)gIn2  + thread_id * buffer_elements;
-    for( j = 0; j < buffer_elements; j++ )
-        r[j] = (cl_double) func.f_ff( s[j], s2[j] );
+    // Calculate the correctly rounded reference result
+    r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
+    s = (cl_double *)gIn + thread_id * buffer_elements;
+    s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+        r[j] = (cl_double)func.f_ff(s[j], s2[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             goto exit;
         }
     }
 
     // Wait for the last buffer
-    out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         goto exit;
     }
 
-    //Verify data
+    // Verify data
     t = (cl_ulong *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
-        for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_ulong *q = out[k];
 
             // If we aren't getting the correctly rounded result
-            if( t[j] != q[j] )
+            if (t[j] != q[j])
             {
-                cl_double test = ((cl_double*) q)[j];
-                long double correct = func.f_ff( s[j], s2[j] );
-                float err = Bruteforce_Ulp_Error_Double( test, correct );
-                int fail = ! (fabsf(err) <= ulps);
+                cl_double test = ((cl_double *)q)[j];
+                long double correct = func.f_ff(s[j], s2[j]);
+                float err = Bruteforce_Ulp_Error_Double(test, correct);
+                int fail = !(fabsf(err) <= ulps);
 
-                if( fail && ftz )
+                if (fail && ftz)
                 {
                     // retry per section 6.5.3.2
-                    if( IsDoubleResultSubnormal(correct, ulps ) )
+                    if (IsDoubleResultSubnormal(correct, ulps))
                     {
-                        fail = fail && ( test != 0.0f );
-                        if( ! fail )
-                            err = 0.0f;
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
                     }
 
                     // nextafter on FTZ platforms may return the smallest
@@ -1426,103 +1836,113 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
                     // normal number is the next representable number.
                     // In which case, it should have the same sign as the
                     // second argument.
-                    if (isNextafter )
+                    if (isNextafter)
                     {
-                        if(IsDoubleSubnormal(s[j]) || s[j] == 0.0f)
+                        if (IsDoubleSubnormal(s[j]) || s[j] == 0.0f)
                         {
                             cl_double value = copysign(twoToMinus1022, s2[j]);
                             fail = fail && (test != value);
-                            if (!fail)
-                                err = 0.0f;
+                            if (!fail) err = 0.0f;
                         }
                     }
                     else
                     {
                         // retry per section 6.5.3.3
-                        if( IsDoubleSubnormal( s[j] ) )
+                        if (IsDoubleSubnormal(s[j]))
                         {
-                            long double correct2 = func.f_ff( 0.0, s2[j] );
-                            long double correct3 = func.f_ff( -0.0, s2[j] );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                            long double correct2 = func.f_ff(0.0, s2[j]);
+                            long double correct3 = func.f_ff(-0.0, s2[j]);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, ulps ) || IsDoubleResultSubnormal( correct3, ulps ) )
+                            if (IsDoubleResultSubnormal(correct2, ulps)
+                                || IsDoubleResultSubnormal(correct3, ulps))
                             {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
                             }
 
-                            //try with both args as zero
-                            if( IsDoubleSubnormal( s2[j] )  )
+                            // try with both args as zero
+                            if (IsDoubleSubnormal(s2[j]))
                             {
-                                correct2 = func.f_ff( 0.0, 0.0 );
-                                correct3 = func.f_ff( -0.0, 0.0 );
-                                long double correct4 = func.f_ff( 0.0, -0.0 );
-                                long double correct5 = func.f_ff( -0.0, -0.0 );
-                                err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                                err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                                float err4 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                                float err5 = Bruteforce_Ulp_Error_Double( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)) &&
-                                                 (!(fabsf(err4) <= ulps)) && (!(fabsf(err5) <= ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
+                                correct2 = func.f_ff(0.0, 0.0);
+                                correct3 = func.f_ff(-0.0, 0.0);
+                                long double correct4 = func.f_ff(0.0, -0.0);
+                                long double correct5 = func.f_ff(-0.0, -0.0);
+                                err2 =
+                                    Bruteforce_Ulp_Error_Double(test, correct2);
+                                err3 =
+                                    Bruteforce_Ulp_Error_Double(test, correct3);
+                                float err4 =
+                                    Bruteforce_Ulp_Error_Double(test, correct4);
+                                float err5 =
+                                    Bruteforce_Ulp_Error_Double(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= ulps))
+                                        && (!(fabsf(err3) <= ulps))
+                                        && (!(fabsf(err4) <= ulps))
+                                        && (!(fabsf(err5) <= ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
 
                                 // retry per section 6.5.3.4
-                                if( IsDoubleResultSubnormal( correct2, ulps ) || IsDoubleResultSubnormal( correct3, ulps ) ||
-                                    IsDoubleResultSubnormal( correct4, ulps ) || IsDoubleResultSubnormal( correct5, ulps ) )
+                                if (IsDoubleResultSubnormal(correct2, ulps)
+                                    || IsDoubleResultSubnormal(correct3, ulps)
+                                    || IsDoubleResultSubnormal(correct4, ulps)
+                                    || IsDoubleResultSubnormal(correct5, ulps))
                                 {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
+                                    fail = fail && (test != 0.0f);
+                                    if (!fail) err = 0.0f;
                                 }
                             }
                         }
-                        else if(IsDoubleSubnormal(s2[j]) )
+                        else if (IsDoubleSubnormal(s2[j]))
                         {
-                            long double correct2 = func.f_ff( s[j], 0.0 );
-                            long double correct3 = func.f_ff( s[j], -0.0 );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                            long double correct2 = func.f_ff(s[j], 0.0);
+                            long double correct3 = func.f_ff(s[j], -0.0);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, ulps ) || IsDoubleResultSubnormal( correct3, ulps ) )
+                            if (IsDoubleResultSubnormal(correct2, ulps)
+                                || IsDoubleResultSubnormal(correct3, ulps))
                             {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
                             }
                         }
                     }
                 }
 
-                if( fabsf(err ) > tinfo->maxError )
+                if (fabsf(err) > tinfo->maxError)
                 {
                     tinfo->maxError = fabsf(err);
                     tinfo->maxErrorValue = s[j];
                     tinfo->maxErrorValue2 = s2[j];
                 }
-                if( fail )
+                if (fail)
                 {
-                    vlog_error( "\nERROR: %s%s: %f ulp error at {%.13la, %.13la}: *%.13la vs. %.13la\n", name, sizeNames[k], err, s[j], s2[j], r[j], test );
+                    vlog_error("\nERROR: %s%s: %f ulp error at {%.13la, "
+                               "%.13la}: *%.13la vs. %.13la\n",
+                               name, sizeNames[k], err, s[j], s2[j], r[j],
+                               test);
                     error = -1;
                     goto exit;
                 }
@@ -1530,33 +1950,37 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
         }
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
         if (gVerboseBruteForce)
         {
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements,  job->ulps, job->threadCount);
-        } else
+            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
         {
-            vlog("." );
+            vlog(".");
         }
         fflush(stdout);
     }
 exit:
     return error;
-
 }
 
 int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
@@ -1580,4 +2004,3 @@ int TestFunc_Double_Double_Double_nextafter(const Func *f, MTdata d,
 {
     return TestFunc_Double_Double_Double_common(f, d, 1, relaxedMode);
 }
-
diff --git a/test_conformance/math_brute_force/binaryOperator.cpp b/test_conformance/math_brute_force/binaryOperator.cpp
index abcb1b00..f6ba838a 100644
--- a/test_conformance/math_brute_force/binaryOperator.cpp
+++ b/test_conformance/math_brute_force/binaryOperator.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -38,63 +38,85 @@ static int BuildKernel(const char *name, const char *operator_symbol,
                        int vectorSize, cl_uint kernel_count, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
-    const char *c[] = {
-                            "__kernel void ", name, "_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in1, __global float", sizeNames[vectorSize], "* in2 )\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   out[i] =  in1[i] ", operator_symbol, " in2[i];\n"
-                            "}\n"
-                        };
-    const char *c3[] = {    "__kernel void ", name, "_kernel", sizeNames[vectorSize], "( __global float* out, __global float* in, __global float* in2)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       float3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
-                            "       f0 = f0 ", operator_symbol, " f1;\n"
-                            "       vstore3( f0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       float3 f0, f1;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-                            "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       f0 = f0 ", operator_symbol, " f1;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = f0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = f0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void ",
+                        name,
+                        "_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in1, __global float",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] =  in1[i] ",
+                        operator_symbol,
+                        " in2[i];\n"
+                        "}\n" };
+    const char *c3[] = {
+        "__kernel void ",
+        name,
+        "_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global float* in, __global float* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
+        "       f0 = f0 ",
+        operator_symbol,
+        " f1;\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0, f1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = f0 ",
+        operator_symbol,
+        " f1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "%s_kernel%s", name, sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "%s_kernel%s", name,
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -104,65 +126,87 @@ static int BuildKernelDouble(const char *name, const char *operator_symbol,
                              int vectorSize, cl_uint kernel_count, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
-    const char *c[] = {
-                            "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                            "__kernel void ", name, "_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in1, __global double", sizeNames[vectorSize], "* in2 )\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   out[i] =  in1[i] ", operator_symbol, " in2[i];\n"
-                            "}\n"
-                        };
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
-                            "__kernel void ", name, "_kernel", sizeNames[vectorSize], "( __global double* out, __global double* in, __global double* in2)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       double3 d0 = vload3( 0, in + 3 * i );\n"
-                            "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
-                            "       d0 = d0 ", operator_symbol, " d1;\n"
-                            "       vstore3( d0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       double3 d0, d1;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-                            "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       d0 = d0 ", operator_symbol, " d1;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = d0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = d0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void ",
+                        name,
+                        "_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in1, __global double",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] =  in1[i] ",
+                        operator_symbol,
+                        " in2[i];\n"
+                        "}\n" };
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+        "__kernel void ",
+        name,
+        "_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* in, __global double* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
+        "       d0 = d0 ",
+        operator_symbol,
+        " d1;\n"
+        "       vstore3( d0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0, d1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       d0 = d0 ",
+        operator_symbol,
+        " d1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = d0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = d0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "%s_kernel%s", name, sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "%s_kernel%s", name,
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -170,114 +214,214 @@ static int BuildKernelDouble(const char *name, const char *operator_symbol,
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_uint     kernel_count;
-    cl_kernel   **kernels;
-    cl_program  *programs;
-    const char  *name;
-    const char  *operator_symbol;
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count;
+    cl_kernel **kernels;
+    cl_program *programs;
+    const char *name;
+    const char *operator_symbol;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->name, info->operator_symbol, i, info->kernel_count,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->name, info->operator_symbol, i,
                              info->kernel_count, info->kernels[i],
                              info->programs + i, info->relaxedMode);
 }
 
-//Thread specific data for a worker thread
+// Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
-    cl_mem      inBuf;                              // input buffer for the thread
-    cl_mem      inBuf2;                             // input buffer for the thread
-    cl_mem      outBuf[ VECTOR_SIZE_COUNT ];        // output buffers for the thread
-    float       maxError;                           // max error value. Init to 0.
-    double      maxErrorValue;                      // position of the max error value (param 1).  Init to 0.
-    double      maxErrorValue2;                     // position of the max error value (param 2).  Init to 0.
-    MTdata      d;
-    cl_command_queue tQueue;                        // per thread command queue to improve performance
-}ThreadInfo;
+    cl_mem inBuf; // sub-buffer of gInBuffer for the thread
+    cl_mem inBuf2; // sub-buffer of gInBuffer2 for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // one sub-buffer per gOutBuffer[] entry
+    float maxError; // max error value. Init to 0.
+    double
+        maxErrorValue; // param 1 input where maxError was seen. Init to 0.
+    double maxErrorValue2; // param 2 input where maxError was seen. Init
+                           // to 0.
+    MTdata d; // released with free_mtdata() in the test's cleanup
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
 
 typedef struct TestInfo
 {
-    size_t      subBufferSize;                      // Size of the sub-buffer in elements
-    const Func  *f;                                 // A pointer to the function info
-    cl_program  programs[ VECTOR_SIZE_COUNT ];      // programs for various vector sizes
-    cl_kernel   *k[VECTOR_SIZE_COUNT ];             // arrays of thread-specific kernels for each worker thread:  k[vector_size][thread_id]
-    ThreadInfo  *tinfo;                             // An array of thread specific information for each worker thread
-    cl_uint     threadCount;                        // Number of worker threads
-    cl_uint     jobCount;                           // Number of jobs
-    cl_uint     step;                               // step between each chunk and the next.
-    cl_uint     scale;                              // stride between individual test values
-    float       ulps;                               // max_allowed ulps
-    int         ftz;                                // non-zero if running in flush to zero mode
+    size_t subBufferSize; // Size of each thread's sub-buffer, in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // per vector size, an array of threadCount
+                               // kernels, indexed k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // array of threadCount entries, one per worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // subBufferSize * scale; job n starts at base n * step
+    cl_uint scale; // stride between individual test values
+    float ulps; // max_allowed ulps
+    int ftz; // non-zero if running in flush to zero mode
     bool relaxedMode; // True if the test is being run in relaxed mode, false
                       // otherwise.
 
     // no special fields
-}TestInfo;
+} TestInfo;
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p );
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
-    -NAN, -INFINITY, -FLT_MAX, MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),  MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), -1000.f, -100.f,  -4.0f, -3.5f,
-    -3.0f, MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), -2.5f, MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), -2.0f, MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), -1.5f, MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), -1.0f, MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), -0.5f, MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),  MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), -0.25f, MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), -FLT_MIN, MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
-    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), -0.0f,
+    -NAN,
+    -INFINITY,
+    -FLT_MAX,
+    MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
+    MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
+    MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
+    MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
+    MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
+    MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
+    MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
+    MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
+    MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
+    -1000.f,
+    -100.f,
+    -4.0f,
+    -3.5f,
+    -3.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
+    -2.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
+    -2.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
+    -1.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
+    -1.0f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
+    -0.5f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
+    -0.25f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
+    -FLT_MIN,
+    MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
+    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
+    MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
+    MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
+    MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
+    -0.0f,
 
-    +NAN, +INFINITY, +FLT_MAX, MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), +1000.f, +100.f, +4.0f, +3.5f,
-    +3.0f, MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), 2.5f, MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),+2.0f, MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), 1.5f, MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), +1.0f, MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), +0.5f, MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), +0.25f, MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), +FLT_MIN, MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
-    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), +0.0f
+    +NAN,
+    +INFINITY,
+    +FLT_MAX,
+    MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
+    MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
+    MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
+    MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
+    MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
+    MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
+    MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
+    MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
+    MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
+    +1000.f,
+    +100.f,
+    +4.0f,
+    +3.5f,
+    +3.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
+    2.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
+    +2.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
+    1.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
+    +1.0f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
+    +0.5f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
+    +0.25f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
+    +FLT_MIN,
+    MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
+    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
+    MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
+    MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
+    MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
+    +0.0f
 };
 
-static size_t specialValuesFloatCount = sizeof( specialValuesFloat ) / sizeof( specialValuesFloat[0] );
+static size_t specialValuesFloatCount =
+    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p );
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
                                         bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
-    float       maxError = 0.0f;
-    double      maxErrorVal = 0.0;
-    double      maxErrorVal2 = 0.0;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
-    if (gWimpyMode) {
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    if (gWimpyMode)
+    {
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_float)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
     test_info.step = test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -287,59 +431,80 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
 
     test_info.f = f;
     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
-    test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     test_info.relaxedMode = relaxedMode;
 
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
-        test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2) // validate inBuf2
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size); // gOutBuffer[j]
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
 
@@ -355,18 +520,20 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
                                        f->name,
                                        f->nameInCode,
                                        relaxedMode };
-        if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
-    if( !gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info );
+        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
-            if( test_info.tinfo[i].maxError > maxError )
+            if (test_info.tinfo[i].maxError > maxError)
             {
                 maxError = test_info.tinfo[i].maxError;
                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
@@ -374,110 +541,130 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
             }
         }
 
-        if( error )
-            goto exit;
+        if (error) goto exit;
 
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input arrays
+        // Init input arrays
         uint32_t *p = (uint32_t *)gIn;
         uint32_t *p2 = (uint32_t *)gIn2;
-        for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ )
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             p[j] = (genrand_int32(d) & ~0x40000000) | 0x20000000;
             p2[j] = 0x3fc00000;
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;    // BUFFER_SIZE / vectorSize  rounded up
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    vlog("\n");
 
 
 exit:
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
             free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
@@ -485,12 +672,12 @@ exit:
 
 static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t      buffer_elements = job->subBufferSize;
-    size_t      buffer_size = buffer_elements * sizeof( cl_float );
-    cl_uint     base = job_id * (cl_uint) job->step;
-    ThreadInfo  *tinfo = job->tinfo + thread_id;
-    fptr        func = job->f->func;
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_float);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    fptr func = job->f->func;
     bool relaxedMode = job->relaxedMode;
     float ulps = getAllowedUlpError(job->f, relaxedMode);
     if (relaxedMode)
@@ -499,74 +686,77 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     }
 
 
-    int         ftz = job->ftz;
-    MTdata      d = tinfo->d;
-    cl_uint     j, k;
-    cl_int      error;
-    cl_uchar    *overflow = (cl_uchar*)malloc(buffer_size);
-    const char  *name = job->f->name;
-    cl_uint     *t;
-    cl_float    *r,*s,*s2;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    cl_uchar *overflow = (cl_uchar *)malloc(buffer_size);
+    const char *name = job->f->name;
+    cl_uint *t;
+    cl_float *r, *s, *s2;
     RoundingMode oldRoundMode;
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_uint  *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_uint *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (uint32_t *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
-    //Init input array
+    // Init input array
     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
     j = 0;
 
-    int totalSpecialValueCount = specialValuesFloatCount * specialValuesFloatCount;
+    int totalSpecialValueCount =
+        specialValuesFloatCount * specialValuesFloatCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
 
-    if( job_id <= (cl_uint)indx ) {
+    if (job_id <= (cl_uint)indx)
+    {
         // Insert special values
         uint32_t x, y;
 
         x = (job_id * buffer_elements) % specialValuesFloatCount;
         y = (job_id * buffer_elements) / specialValuesFloatCount;
 
-        for( ; j < buffer_elements; j++ ) {
+        for (; j < buffer_elements; j++)
+        {
             p[j] = ((cl_uint *)specialValuesFloat)[x];
             p2[j] = ((cl_uint *)specialValuesFloat)[y];
             ++x;
-            if (x >= specialValuesFloatCount) {
+            if (x >= specialValuesFloatCount)
+            {
                 x = 0;
                 y++;
-                if (y >= specialValuesFloatCount)
-                    break;
+                if (y >= specialValuesFloatCount) break;
             }
             if (relaxedMode && strcmp(name, "divide") == 0)
             {
                 cl_uint pj = p[j] & 0x7fffffff;
                 cl_uint p2j = p2[j] & 0x7fffffff;
                 // Replace values outside [2^-62, 2^62] with QNaN
-                if (pj < 0x20800000 || pj > 0x5e800000)
-                    p[j] = 0x7fc00000;
-                if (p2j < 0x20800000 || p2j > 0x5e800000)
-                    p2[j] = 0x7fc00000;
+                if (pj < 0x20800000 || pj > 0x5e800000) p[j] = 0x7fc00000;
+                if (p2j < 0x20800000 || p2j > 0x5e800000) p2[j] = 0x7fc00000;
             }
         }
     }
 
     // Init any remaining values.
-    for( ; j < buffer_elements; j++ )
+    for (; j < buffer_elements; j++)
     {
         p[j] = genrand_int32(d);
         p2[j] = genrand_int32(d);
@@ -576,316 +766,353 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
             cl_uint pj = p[j] & 0x7fffffff;
             cl_uint p2j = p2[j] & 0x7fffffff;
             // Replace values outside [2^-62, 2^62] with QNaN
-            if (pj < 0x20800000 || pj > 0x5e800000)
-                p[j] = 0x7fc00000;
-            if (p2j < 0x20800000 || p2j > 0x5e800000)
-                p2[j] = 0x7fc00000;
+            if (pj < 0x20800000 || pj > 0x5e800000) p[j] = 0x7fc00000;
+            if (p2j < 0x20800000 || p2j > 0x5e800000) p2[j] = 0x7fc00000;
         }
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             goto exit;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             goto exit;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             goto exit;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; }
-
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
             goto exit;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
+    if (gSkipCorrectnessTesting)
     {
-        free( overflow );
+        free(overflow);
         return CL_SUCCESS;
     }
 
-    //Calculate the correctly rounded reference result
+    // Calculate the correctly rounded reference result
     FPU_mode_type oldMode;
-    memset( &oldMode, 0, sizeof( oldMode ) );
-    if( ftz )
-        ForceFTZ( &oldMode );
+    memset(&oldMode, 0, sizeof(oldMode));
+    if (ftz) ForceFTZ(&oldMode);
 
     // Set the rounding mode to match the device
     oldRoundMode = kRoundToNearestEven;
-    if (gIsInRTZMode)
-        oldRoundMode = set_round(kRoundTowardZero, kfloat);
+    if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
 
-    //Calculate the correctly rounded reference result
-    r = (float *)gOut_Ref  + thread_id * buffer_elements;
-    s = (float *)gIn  + thread_id * buffer_elements;
-    s2 = (float *)gIn2  + thread_id * buffer_elements;
-    if( gInfNanSupport )
+    // Calculate the correctly rounded reference result
+    r = (float *)gOut_Ref + thread_id * buffer_elements;
+    s = (float *)gIn + thread_id * buffer_elements;
+    s2 = (float *)gIn2 + thread_id * buffer_elements;
+    if (gInfNanSupport)
     {
-        for( j = 0; j < buffer_elements; j++ )
-            r[j] = (float) func.f_ff( s[j], s2[j] );
+        for (j = 0; j < buffer_elements; j++)
+            r[j] = (float)func.f_ff(s[j], s2[j]);
     }
     else
     {
-        for( j = 0; j < buffer_elements; j++ )
+        for (j = 0; j < buffer_elements; j++)
         {
             feclearexcept(FE_OVERFLOW);
-            r[j] = (float) func.f_ff( s[j], s2[j] );
-            overflow[j] = FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
+            r[j] = (float)func.f_ff(s[j], s2[j]);
+            overflow[j] =
+                FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
         }
     }
 
-    if (gIsInRTZMode)
-      (void)set_round(oldRoundMode, kfloat);
+    if (gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
 
-    if( ftz )
-        RestoreFPState( &oldMode );
+    if (ftz) RestoreFPState(&oldMode);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (uint32_t *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             goto exit;
         }
     }
 
     // Wait for the last buffer
-    out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (uint32_t *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         goto exit;
     }
 
-    //Verify data
+    // Verify data
     t = (cl_uint *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
-        for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_uint *q = out[k];
 
             // If we aren't getting the correctly rounded result
-            if( t[j] != q[j] )
+            if (t[j] != q[j])
             {
-                float test = ((float*) q)[j];
-                double correct = func.f_ff( s[j], s2[j] );
+                float test = ((float *)q)[j];
+                double correct = func.f_ff(s[j], s2[j]);
 
-                // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                if ( !gInfNanSupport)
+                // Per section 10 paragraph 6, accept any result if an input or
+                // output is a infinity or NaN or overflow
+                if (!gInfNanSupport)
                 {
-                    // Note: no double rounding here.  Reference functions calculate in single precision.
-                    if( overflow[j]                                         ||
-                        IsFloatInfinity(correct) || IsFloatNaN(correct)     ||
-                        IsFloatInfinity(s2[j])   || IsFloatNaN(s2[j])       ||
-                        IsFloatInfinity(s[j])    || IsFloatNaN(s[j])        )
+                    // Note: no double rounding here.  Reference functions
+                    // calculate in single precision.
+                    if (overflow[j] || IsFloatInfinity(correct)
+                        || IsFloatNaN(correct) || IsFloatInfinity(s2[j])
+                        || IsFloatNaN(s2[j]) || IsFloatInfinity(s[j])
+                        || IsFloatNaN(s[j]))
                         continue;
                 }
 
-        // Per section 10 paragraph 6, accept embedded devices always returning positive 0.0.
-        if (gIsEmbedded && (t[j] == 0x80000000) && (q[j] == 0x00000000)) continue;
+                // Per section 10 paragraph 6, accept embedded devices always
+                // returning positive 0.0.
+                if (gIsEmbedded && (t[j] == 0x80000000) && (q[j] == 0x00000000))
+                    continue;
 
-                float err = Ulp_Error( test, correct );
-                float errB = Ulp_Error( test, (float) correct  );
+                float err = Ulp_Error(test, correct);
+                float errB = Ulp_Error(test, (float)correct);
 
-                int fail = ((!(fabsf(err) <= ulps)) && (!(fabsf(errB) <= ulps)));
-                if( fabsf( errB ) < fabsf(err ) )
-                  err = errB;
+                int fail =
+                    ((!(fabsf(err) <= ulps)) && (!(fabsf(errB) <= ulps)));
+                if (fabsf(errB) < fabsf(err)) err = errB;
 
-                if( fail && ftz )
+                if (fail && ftz)
                 {
                     // retry per section 6.5.3.2
-                    if( IsFloatResultSubnormal(correct, ulps ) )
+                    if (IsFloatResultSubnormal(correct, ulps))
                     {
-                        fail = fail && ( test != 0.0f );
-                        if( ! fail )
-                            err = 0.0f;
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
                     }
 
                     // retry per section 6.5.3.3
-                    if( IsFloatSubnormal( s[j] ) )
+                    if (IsFloatSubnormal(s[j]))
                     {
                         double correct2, correct3;
                         float err2, err3;
 
-                        if( !gInfNanSupport )
-                            feclearexcept(FE_OVERFLOW);
+                        if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
 
-                        correct2 = func.f_ff( 0.0, s2[j] );
-                        correct3 = func.f_ff( -0.0, s2[j] );
+                        correct2 = func.f_ff(0.0, s2[j]);
+                        correct3 = func.f_ff(-0.0, s2[j]);
 
-                        // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                        if( !gInfNanSupport )
+                        // Per section 10 paragraph 6, accept any result if an
+                        // input or output is a infinity or NaN or overflow
+                        if (!gInfNanSupport)
                         {
-                            if( fetestexcept(FE_OVERFLOW) )
-                                continue;
+                            if (fetestexcept(FE_OVERFLOW)) continue;
 
-                            // Note: no double rounding here.  Reference functions calculate in single precision.
-                            if( IsFloatInfinity(correct2) || IsFloatNaN(correct2)   ||
-                                IsFloatInfinity(correct3) || IsFloatNaN(correct3)    )
+                            // Note: no double rounding here.  Reference
+                            // functions calculate in single precision.
+                            if (IsFloatInfinity(correct2)
+                                || IsFloatNaN(correct2)
+                                || IsFloatInfinity(correct3)
+                                || IsFloatNaN(correct3))
                                 continue;
                         }
 
-                        err2 = Ulp_Error( test, correct2  );
-                        err3 = Ulp_Error( test, correct3  );
-                        fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                        if( fabsf( err2 ) < fabsf(err ) )
-                            err = err2;
-                        if( fabsf( err3 ) < fabsf(err ) )
-                            err = err3;
+                        err2 = Ulp_Error(test, correct2);
+                        err3 = Ulp_Error(test, correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
 
                         // retry per section 6.5.3.4
-                        if( IsFloatResultSubnormal( correct2, ulps ) || IsFloatResultSubnormal( correct3, ulps ) )
+                        if (IsFloatResultSubnormal(correct2, ulps)
+                            || IsFloatResultSubnormal(correct3, ulps))
                         {
-                            fail = fail && ( test != 0.0f);
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
                         }
 
-                        //try with both args as zero
-                        if( IsFloatSubnormal( s2[j] )  )
+                        // try with both args as zero
+                        if (IsFloatSubnormal(s2[j]))
                         {
                             double correct4, correct5;
                             float err4, err5;
 
-                            if( !gInfNanSupport )
-                                feclearexcept(FE_OVERFLOW);
+                            if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
 
-                            correct2 = func.f_ff( 0.0, 0.0 );
-                            correct3 = func.f_ff( -0.0, 0.0 );
-                            correct4 = func.f_ff( 0.0, -0.0 );
-                            correct5 = func.f_ff( -0.0, -0.0 );
+                            correct2 = func.f_ff(0.0, 0.0);
+                            correct3 = func.f_ff(-0.0, 0.0);
+                            correct4 = func.f_ff(0.0, -0.0);
+                            correct5 = func.f_ff(-0.0, -0.0);
 
-                            // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                            if( !gInfNanSupport )
+                            // Per section 10 paragraph 6, accept any result if
+                            // an input or output is a infinity or NaN or
+                            // overflow
+                            if (!gInfNanSupport)
                             {
-                                if( fetestexcept(FE_OVERFLOW) )
-                                    continue;
+                                if (fetestexcept(FE_OVERFLOW)) continue;
 
-                                // Note: no double rounding here.  Reference functions calculate in single precision.
-                                if( IsFloatInfinity(correct2) || IsFloatNaN(correct2)   ||
-                                    IsFloatInfinity(correct3) || IsFloatNaN(correct3)   ||
-                                    IsFloatInfinity(correct4) || IsFloatNaN(correct4)   ||
-                                    IsFloatInfinity(correct5) || IsFloatNaN(correct5)    )
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsFloatInfinity(correct2)
+                                    || IsFloatNaN(correct2)
+                                    || IsFloatInfinity(correct3)
+                                    || IsFloatNaN(correct3)
+                                    || IsFloatInfinity(correct4)
+                                    || IsFloatNaN(correct4)
+                                    || IsFloatInfinity(correct5)
+                                    || IsFloatNaN(correct5))
                                     continue;
                             }
 
-                            err2 = Ulp_Error( test, correct2  );
-                            err3 = Ulp_Error( test, correct3  );
-                            err4 = Ulp_Error( test, correct4  );
-                            err5 = Ulp_Error( test, correct5  );
-                            fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)) &&
-                                             (!(fabsf(err4) <= ulps)) && (!(fabsf(err5) <= ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-                            if( fabsf( err4 ) < fabsf(err ) )
-                                err = err4;
-                            if( fabsf( err5 ) < fabsf(err ) )
-                                err = err5;
+                            err2 = Ulp_Error(test, correct2);
+                            err3 = Ulp_Error(test, correct3);
+                            err4 = Ulp_Error(test, correct4);
+                            err5 = Ulp_Error(test, correct5);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps))
+                                    && (!(fabsf(err4) <= ulps))
+                                    && (!(fabsf(err5) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (fabsf(err4) < fabsf(err)) err = err4;
+                            if (fabsf(err5) < fabsf(err)) err = err5;
 
                             // retry per section 6.5.3.4
-                            if( IsFloatResultSubnormal( correct2, ulps ) || IsFloatResultSubnormal( correct3, ulps ) ||
-                                IsFloatResultSubnormal( correct4, ulps ) || IsFloatResultSubnormal( correct5, ulps ) )
+                            if (IsFloatResultSubnormal(correct2, ulps)
+                                || IsFloatResultSubnormal(correct3, ulps)
+                                || IsFloatResultSubnormal(correct4, ulps)
+                                || IsFloatResultSubnormal(correct5, ulps))
                             {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
                             }
                         }
                     }
-                    else if(IsFloatSubnormal(s2[j]) )
+                    else if (IsFloatSubnormal(s2[j]))
                     {
                         double correct2, correct3;
                         float err2, err3;
 
-                        if( !gInfNanSupport )
-                            feclearexcept(FE_OVERFLOW);
+                        if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
 
-                        correct2 = func.f_ff( s[j], 0.0 );
-                        correct3 = func.f_ff( s[j], -0.0 );
+                        correct2 = func.f_ff(s[j], 0.0);
+                        correct3 = func.f_ff(s[j], -0.0);
 
-                        // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                        if ( !gInfNanSupport)
+                        // Per section 10 paragraph 6, accept any result if an
+                        // input or output is a infinity or NaN or overflow
+                        if (!gInfNanSupport)
                         {
-                            // Note: no double rounding here.  Reference functions calculate in single precision.
-                            if( overflow[j]                                         ||
-                                IsFloatInfinity(correct) || IsFloatNaN(correct)     ||
-                                IsFloatInfinity(correct2)|| IsFloatNaN(correct2)    )
+                            // Note: no double rounding here.  Reference
+                            // functions calculate in single precision.
+                            if (overflow[j] || IsFloatInfinity(correct)
+                                || IsFloatNaN(correct)
+                                || IsFloatInfinity(correct2)
+                                || IsFloatNaN(correct2))
                                 continue;
                         }
 
-                        err2 = Ulp_Error( test, correct2  );
-                        err3 = Ulp_Error( test, correct3  );
-                        fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                        if( fabsf( err2 ) < fabsf(err ) )
-                            err = err2;
-                        if( fabsf( err3 ) < fabsf(err ) )
-                            err = err3;
+                        err2 = Ulp_Error(test, correct2);
+                        err3 = Ulp_Error(test, correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
 
                         // retry per section 6.5.3.4
-                        if( IsFloatResultSubnormal( correct2, ulps ) || IsFloatResultSubnormal( correct3, ulps ) )
+                        if (IsFloatResultSubnormal(correct2, ulps)
+                            || IsFloatResultSubnormal(correct3, ulps))
                         {
-                            fail = fail && ( test != 0.0f);
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
                         }
                     }
                 }
 
 
-                if( fabsf(err ) > tinfo->maxError )
+                if (fabsf(err) > tinfo->maxError)
                 {
                     tinfo->maxError = fabsf(err);
                     tinfo->maxErrorValue = s[j];
                     tinfo->maxErrorValue2 = s2[j];
                 }
-                if( fail )
+                if (fail)
                 {
-                    vlog_error( "\nERROR: %s%s: %f ulp error at {%a, %a}: *%a vs. %a (0x%8.8x) at index: %d\n", name, sizeNames[k], err, s[j], s2[j], r[j], test, ((cl_uint*)&test)[0], j );
+                    vlog_error("\nERROR: %s%s: %f ulp error at {%a, %a}: *%a "
+                               "vs. %a (0x%8.8x) at index: %d\n",
+                               name, sizeNames[k], err, s[j], s2[j], r[j], test,
+                               ((cl_uint *)&test)[0], j);
                     error = -1;
                     goto exit;
                 }
@@ -893,85 +1120,185 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
         if (gVerboseBruteForce)
         {
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f ThreadCount:%2u\n", base, job->step,  job->scale, buffer_elements, job->ulps, job->threadCount);
-        } else
+            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
         {
-            vlog("." );
+            vlog(".");
         }
         fflush(stdout);
     }
 exit:
-    if( overflow )
-        free( overflow );
+    if (overflow) free(overflow);
     return error;
 }
 
 
 // A table of more difficult cases to get right
 static const double specialValuesDouble[] = {
-    -NAN, -INFINITY, -DBL_MAX, MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),  MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), -1000., -100.,  -4.0, -3.5,
-    -3.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), -2.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), -2.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), -1.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), -1.0, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), -0.5, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),  MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), -0.25, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), -DBL_MIN, MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), -0.0,
+    -NAN,
+    -INFINITY,
+    -DBL_MAX,
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
+    -1000.,
+    -100.,
+    -4.0,
+    -3.5,
+    -3.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
+    -2.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
+    -2.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
+    -1.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
+    -1.0,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
+    -0.5,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
+    -0.25,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
+    -DBL_MIN,
+    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
+    -0.0,
 
-    +NAN, +INFINITY, +DBL_MAX, MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),  MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8), MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21), MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22), +1000., +100.,  +4.0, +3.5,
-    +3.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), +2.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), +2.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), +1.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), +1.0, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53), +0.5, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),  MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), +0.25, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), +DBL_MIN, MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), +0.0,
+    +NAN,
+    +INFINITY,
+    +DBL_MAX,
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
+    +1000.,
+    +100.,
+    +4.0,
+    +3.5,
+    +3.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
+    +2.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
+    +2.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
+    +1.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p0, +0x10000000000001LL, -52), // 1 + 2^-52, mirrors the negative half
+    +1.0,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
+    +0.5,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
+    +0.25,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
+    +DBL_MIN,
+    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
+    +0.0,
 };
 
-static size_t specialValuesDoubleCount = sizeof( specialValuesDouble ) / sizeof( specialValuesDouble[0] );
+static size_t specialValuesDoubleCount =
+    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
 
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *p );
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
                                            bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
-    float       maxError = 0.0f;
-    double      maxErrorVal = 0.0;
-    double      maxErrorVal2 = 0.0;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
     if (gWimpyMode)
     {
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_double)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -983,56 +1310,76 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
     test_info.ulps = f->double_ulps;
     test_info.ftz = f->ftz || gForceFTZ;
 
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
-        test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2) // check the buffer just created
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
 
@@ -1049,18 +1396,20 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
                                        f->name,
                                        f->nameInCode,
                                        relaxedMode };
-        if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
-    if( !gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info );
+        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
-            if( test_info.tinfo[i].maxError > maxError )
+            if (test_info.tinfo[i].maxError > maxError)
             {
                 maxError = test_info.tinfo[i].maxError;
                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
@@ -1068,387 +1417,441 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
             }
         }
 
-        if( error )
-            goto exit;
+        if (error) goto exit;
 
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input arrays
+        // Init input arrays
         double *p = (double *)gIn;
         double *p2 = (double *)gIn2;
-        for( j = 0; j < BUFFER_SIZE / sizeof( cl_double ); j++ )
+        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
         {
             p[j] = DoubleFromUInt32(genrand_int32(d));
             p2[j] = DoubleFromUInt32(genrand_int32(d));
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_double ) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;    // BUFFER_SIZE / vectorSize  rounded up
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    vlog("\n");
 
 
 exit:
     // Release
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
             free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
 
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t      buffer_elements = job->subBufferSize;
-    size_t      buffer_size = buffer_elements * sizeof( cl_double );
-    cl_uint     base = job_id * (cl_uint) job->step;
-    ThreadInfo  *tinfo = job->tinfo + thread_id;
-    float       ulps = job->ulps;
-    dptr        func = job->f->dfunc;
-    int         ftz = job->ftz;
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_double);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    float ulps = job->ulps;
+    dptr func = job->f->dfunc;
+    int ftz = job->ftz;
     bool relaxedMode = job->relaxedMode;
-    MTdata      d = tinfo->d;
-    cl_uint     j, k;
-    cl_int      error;
-    const char  *name = job->f->name;
-    cl_ulong    *t;
-    cl_double   *r,*s,*s2;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+    cl_ulong *t;
+    cl_double *r, *s, *s2;
 
     Force64BitFPUPrecision();
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_ulong  *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_ulong *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
-    //Init input array
+    // Init input array
     cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
     cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
     j = 0;
-    int totalSpecialValueCount = specialValuesDoubleCount * specialValuesDoubleCount;
+    int totalSpecialValueCount =
+        specialValuesDoubleCount * specialValuesDoubleCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
-    if( job_id <= (cl_uint)indx )
+    if (job_id <= (cl_uint)indx)
     { // test edge cases
         cl_double *fp = (cl_double *)p;
         cl_double *fp2 = (cl_double *)p2;
         uint32_t x, y;
 
-    x = (job_id * buffer_elements) % specialValuesDoubleCount;
-    y = (job_id * buffer_elements) / specialValuesDoubleCount;
+        x = (job_id * buffer_elements) % specialValuesDoubleCount;
+        y = (job_id * buffer_elements) / specialValuesDoubleCount;
 
-        for( ; j < buffer_elements; j++ )
+        for (; j < buffer_elements; j++)
         {
             fp[j] = specialValuesDouble[x];
             fp2[j] = specialValuesDouble[y];
-            if( ++x >= specialValuesDoubleCount )
+            if (++x >= specialValuesDoubleCount)
             {
                 x = 0;
                 y++;
-                if( y >= specialValuesDoubleCount )
-                    break;
+                if (y >= specialValuesDoubleCount) break;
             }
         }
     }
 
-    //Init any remaining values.
-    for( ; j < buffer_elements; j++ )
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
     {
         p[j] = genrand_int64(d);
         p2[j] = genrand_int64(d);
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             goto exit;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             goto exit;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             goto exit;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; }
-
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
             goto exit;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
-        return CL_SUCCESS;
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
 
-    //Calculate the correctly rounded reference result
-    r = (cl_double *)gOut_Ref  + thread_id * buffer_elements;
-    s = (cl_double *)gIn  + thread_id * buffer_elements;
-    s2 = (cl_double *)gIn2  + thread_id * buffer_elements;
-    for( j = 0; j < buffer_elements; j++ )
-        r[j] = (cl_double) func.f_ff( s[j], s2[j] );
+    // Calculate the correctly rounded reference result
+    r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
+    s = (cl_double *)gIn + thread_id * buffer_elements;
+    s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+        r[j] = (cl_double)func.f_ff(s[j], s2[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             goto exit;
         }
     }
 
     // Wait for the last buffer
-    out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         goto exit;
     }
 
-    //Verify data
+    // Verify data
     t = (cl_ulong *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
-        for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_ulong *q = out[k];
 
             // If we aren't getting the correctly rounded result
-            if( t[j] != q[j] )
+            if (t[j] != q[j])
             {
-                cl_double test = ((cl_double*) q)[j];
-                long double correct = func.f_ff( s[j], s2[j] );
-                float err = Bruteforce_Ulp_Error_Double( test, correct );
-                int fail = ! (fabsf(err) <= ulps);
+                cl_double test = ((cl_double *)q)[j];
+                long double correct = func.f_ff(s[j], s2[j]);
+                float err = Bruteforce_Ulp_Error_Double(test, correct);
+                int fail = !(fabsf(err) <= ulps);
 
-                if( fail && ftz )
+                if (fail && ftz)
                 {
                     // retry per section 6.5.3.2
-                    if( IsDoubleResultSubnormal(correct, ulps ) )
+                    if (IsDoubleResultSubnormal(correct, ulps))
                     {
-                        fail = fail && ( test != 0.0f );
-                        if( ! fail )
-                            err = 0.0f;
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
                     }
 
 
                     // retry per section 6.5.3.3
-                    if( IsDoubleSubnormal( s[j] ) )
+                    if (IsDoubleSubnormal(s[j]))
                     {
-                        long double correct2 = func.f_ff( 0.0, s2[j] );
-                        long double correct3 = func.f_ff( -0.0, s2[j] );
-                        float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                        float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                        fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                        if( fabsf( err2 ) < fabsf(err ) )
-                            err = err2;
-                        if( fabsf( err3 ) < fabsf(err ) )
-                            err = err3;
+                        long double correct2 = func.f_ff(0.0, s2[j]);
+                        long double correct3 = func.f_ff(-0.0, s2[j]);
+                        float err2 =
+                            Bruteforce_Ulp_Error_Double(test, correct2);
+                        float err3 =
+                            Bruteforce_Ulp_Error_Double(test, correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
 
                         // retry per section 6.5.3.4
-                        if( IsDoubleResultSubnormal( correct2, ulps ) || IsDoubleResultSubnormal( correct3, ulps ) )
+                        if (IsDoubleResultSubnormal(correct2, ulps)
+                            || IsDoubleResultSubnormal(correct3, ulps))
                         {
-                            fail = fail && ( test != 0.0f);
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
                         }
 
-                        //try with both args as zero
-                        if( IsDoubleSubnormal( s2[j] )  )
+                        // try with both args as zero
+                        if (IsDoubleSubnormal(s2[j]))
                         {
-                            correct2 = func.f_ff( 0.0, 0.0 );
-                            correct3 = func.f_ff( -0.0, 0.0 );
-                            long double correct4 = func.f_ff( 0.0, -0.0 );
-                            long double correct5 = func.f_ff( -0.0, -0.0 );
-                            err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                            err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            float err4 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                            float err5 = Bruteforce_Ulp_Error_Double( test, correct5  );
-                            fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)) &&
-                                             (!(fabsf(err4) <= ulps)) && (!(fabsf(err5) <= ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-                            if( fabsf( err4 ) < fabsf(err ) )
-                                err = err4;
-                            if( fabsf( err5 ) < fabsf(err ) )
-                                err = err5;
+                            correct2 = func.f_ff(0.0, 0.0);
+                            correct3 = func.f_ff(-0.0, 0.0);
+                            long double correct4 = func.f_ff(0.0, -0.0);
+                            long double correct5 = func.f_ff(-0.0, -0.0);
+                            err2 = Bruteforce_Ulp_Error_Double(test, correct2);
+                            err3 = Bruteforce_Ulp_Error_Double(test, correct3);
+                            float err4 =
+                                Bruteforce_Ulp_Error_Double(test, correct4);
+                            float err5 =
+                                Bruteforce_Ulp_Error_Double(test, correct5);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps))
+                                    && (!(fabsf(err4) <= ulps))
+                                    && (!(fabsf(err5) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (fabsf(err4) < fabsf(err)) err = err4;
+                            if (fabsf(err5) < fabsf(err)) err = err5;
 
                             // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, ulps ) || IsDoubleResultSubnormal( correct3, ulps ) ||
-                                IsDoubleResultSubnormal( correct4, ulps ) || IsDoubleResultSubnormal( correct5, ulps ) )
+                            if (IsDoubleResultSubnormal(correct2, ulps)
+                                || IsDoubleResultSubnormal(correct3, ulps)
+                                || IsDoubleResultSubnormal(correct4, ulps)
+                                || IsDoubleResultSubnormal(correct5, ulps))
                             {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
                             }
                         }
                     }
-                    else if(IsDoubleSubnormal(s2[j]) )
+                    else if (IsDoubleSubnormal(s2[j]))
                     {
-                        long double correct2 = func.f_ff( s[j], 0.0 );
-                        long double correct3 = func.f_ff( s[j], -0.0 );
-                        float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                        float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                        fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                        if( fabsf( err2 ) < fabsf(err ) )
-                            err = err2;
-                        if( fabsf( err3 ) < fabsf(err ) )
-                            err = err3;
+                        long double correct2 = func.f_ff(s[j], 0.0);
+                        long double correct3 = func.f_ff(s[j], -0.0);
+                        float err2 =
+                            Bruteforce_Ulp_Error_Double(test, correct2);
+                        float err3 =
+                            Bruteforce_Ulp_Error_Double(test, correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
 
                         // retry per section 6.5.3.4
-                        if( IsDoubleResultSubnormal( correct2, ulps ) || IsDoubleResultSubnormal( correct3, ulps ) )
+                        if (IsDoubleResultSubnormal(correct2, ulps)
+                            || IsDoubleResultSubnormal(correct3, ulps))
                         {
-                            fail = fail && ( test != 0.0f);
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
                         }
                     }
                 }
 
-                if( fabsf(err ) > tinfo->maxError )
+                if (fabsf(err) > tinfo->maxError)
                 {
                     tinfo->maxError = fabsf(err);
                     tinfo->maxErrorValue = s[j];
                     tinfo->maxErrorValue2 = s2[j];
                 }
-                if( fail )
+                if (fail)
                 {
-                    vlog_error( "\nERROR: %s%s: %f ulp error at {%a, %a}: *%a vs. %a\n", name, sizeNames[k], err, s[j], s2[j], r[j], test );
+                    vlog_error(
+                        "\nERROR: %s%s: %f ulp error at {%a, %a}: *%a vs. %a\n",
+                        name, sizeNames[k], err, s[j], s2[j], r[j], test);
                     error = -1;
                     goto exit;
                 }
@@ -1456,36 +1859,36 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
         }
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
         if (gVerboseBruteForce)
         {
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements,  job->ulps, job->threadCount);
-        } else
+            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
         {
-            vlog("." );
+            vlog(".");
         }
         fflush(stdout);
     }
 
 exit:
     return error;
-
 }
-
-
-
-
diff --git a/test_conformance/math_brute_force/binary_i.cpp b/test_conformance/math_brute_force/binary_i.cpp
index 01f45242..dc6feb8c 100644
--- a/test_conformance/math_brute_force/binary_i.cpp
+++ b/test_conformance/math_brute_force/binary_i.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -34,64 +34,83 @@ static int BuildKernelDouble(const char *name, int vectorSize,
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
 {
-    const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in1, __global int", sizeNames[vectorSize], "* in2 )\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in1[i], in2[i] );\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in1, __global int",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i] );\n"
+                        "}\n" };
 
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* in, __global int* in2)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       float3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       int3 i0 = vload3( 0, in2 + 3 * i );\n"
-                            "       f0 = ", name, "( f0, i0 );\n"
-                            "       vstore3( f0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       float3 f0;\n"
-                            "       int3 i0;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-                            "               i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       f0 = ", name, "( f0, i0 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = f0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = f0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global float* in, __global int* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       int3 i0 = vload3( 0, in2 + 3 * i );\n"
+        "       f0 = ",
+        name,
+        "( f0, i0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0;\n"
+        "       int3 i0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, i0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -101,66 +120,85 @@ static int BuildKernelDouble(const char *name, int vectorSize,
                              cl_uint kernel_count, cl_kernel *k, cl_program *p,
                              bool relaxedMode)
 {
-    const char *c[] = {     "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in1, __global int", sizeNames[vectorSize], "* in2 )\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in1[i], in2[i] );\n"
-                            "}\n"
-                        };
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in1, __global int",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i] );\n"
+                        "}\n" };
 
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global double* in, __global int* in2)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       double3 d0 = vload3( 0, in + 3 * i );\n"
-                            "       int3 i0 = vload3( 0, in2 + 3 * i );\n"
-                            "       d0 = ", name, "( d0, i0 );\n"
-                            "       vstore3( d0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       double3 d0;\n"
-                            "       int3 i0;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-                            "               i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       d0 = ", name, "( d0, i0 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = d0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = d0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* in, __global int* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       int3 i0 = vload3( 0, in2 + 3 * i );\n"
+        "       d0 = ",
+        name,
+        "( d0, i0 );\n"
+        "       vstore3( d0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0;\n"
+        "       int3 i0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n"
+        "               break;\n"
+        "       }\n"
+        "       d0 = ",
+        name,
+        "( d0, i0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = d0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = d0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -168,27 +206,31 @@ static int BuildKernelDouble(const char *name, int vectorSize,
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_uint     kernel_count;
-    cl_kernel   **kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // index of the first vector size to build
+    cl_uint kernel_count; // kernel count forwarded to the build helper
+    cl_kernel **kernels; // kernels[vector size index] -> kernel array
+    cl_program *programs; // one program slot per vector size index
+    const char *nameInCode; // function name spliced into the kernel source
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernel_count,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
                              info->kernels[i], info->programs + i,
@@ -198,85 +240,185 @@ static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, vo
 
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
-    -NAN, -INFINITY, -FLT_MAX, MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),  MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), -1000.f, -100.f,  -4.0f, -3.5f,
-    -3.0f, MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), -2.5f, MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), -2.0f, MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), -1.5f, MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), -1.0f, MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), -0.5f, MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),  MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), -0.25f, MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), -FLT_MIN, MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
-    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), -0.0f,
+    -NAN,
+    -INFINITY,
+    -FLT_MAX,
+    MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
+    MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
+    MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
+    MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
+    MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
+    MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
+    MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
+    MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
+    MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
+    -1000.f,
+    -100.f,
+    -4.0f,
+    -3.5f,
+    -3.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
+    -2.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
+    -2.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
+    -1.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
+    -1.0f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
+    -0.5f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
+    -0.25f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
+    -FLT_MIN,
+    MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
+    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
+    MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
+    MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
+    MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
+    -0.0f,
 
-    +NAN, +INFINITY, +FLT_MAX, MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), +1000.f, +100.f, +4.0f, +3.5f,
-    +3.0f, MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), 2.5f, MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),+2.0f, MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), 1.5f, MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), +1.0f, MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), +0.5f, MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), +0.25f, MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), +FLT_MIN, MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
-    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), +0.0f
+    +NAN,
+    +INFINITY,
+    +FLT_MAX,
+    MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
+    MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
+    MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
+    MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
+    MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
+    MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
+    MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
+    MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
+    MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
+    +1000.f,
+    +100.f,
+    +4.0f,
+    +3.5f,
+    +3.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
+    2.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
+    +2.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
+    1.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
+    +1.0f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
+    +0.5f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
+    +0.25f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
+    +FLT_MIN,
+    MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
+    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
+    MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
+    MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
+    MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
+    +0.0f
 };
-static size_t specialValuesFloatCount = sizeof( specialValuesFloat ) / sizeof( specialValuesFloat[0] );
+static size_t specialValuesFloatCount =
+    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
 
-static const int specialValuesInt[] = { 0, 1, 2, 3, 126, 127, 128, 0x02000001, 0x04000001, 1465264071, 1488522147,
-                                            -1, -2, -3, -126, -127, -128, -0x02000001, -0x04000001, -1465264071, -1488522147 };
-static size_t specialValuesIntCount = sizeof( specialValuesInt ) / sizeof( specialValuesInt[0] );
+static const int specialValuesInt[] = {
+    0,           1,           2,          3,          126,        127,
+    128,         0x02000001,  0x04000001, 1465264071, 1488522147, -1,
+    -2,          -3,          -126,       -127,       -128,       -0x02000001,
+    -0x04000001, -1465264071, -1488522147
+};
+static size_t specialValuesIntCount =
+    sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
 
-//Thread specific data for a worker thread
+// Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
-    cl_mem      inBuf;                              // input buffer for the thread
-    cl_mem      inBuf2;                             // input buffer for the thread
-    cl_mem      outBuf[ VECTOR_SIZE_COUNT ];        // output buffers for the thread
-    float       maxError;                           // max error value. Init to 0.
-    double      maxErrorValue;                      // position of the max error value (param 1).  Init to 0.
-    cl_int      maxErrorValue2;                     // position of the max error value (param 2).  Init to 0.
-    MTdata      d;
-    cl_command_queue tQueue;                        // per thread command queue to improve performance
-}ThreadInfo;
+    cl_mem inBuf; // sub-buffer of gInBuffer holding the float operand
+    cl_mem inBuf2; // sub-buffer of gInBuffer2 holding the int operand
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // one output sub-buffer per vector size
+    float maxError; // max error recorded for this thread. Init to 0.
+    double
+        maxErrorValue; // param 1 value reported with maxError. Init to 0.
+    cl_int maxErrorValue2; // param 2 value reported with maxError. Init
+                           // to 0.
+    MTdata d; // random state seeded per thread from the caller's MTdata
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
 
 typedef struct TestInfo
 {
-    size_t      subBufferSize;                      // Size of the sub-buffer in elements
-    const Func  *f;                                 // A pointer to the function info
-    cl_program  programs[ VECTOR_SIZE_COUNT ];      // programs for various vector sizes
-    cl_kernel   *k[VECTOR_SIZE_COUNT ];             // arrays of thread-specific kernels for each worker thread:  k[vector_size][thread_id]
-    ThreadInfo  *tinfo;                             // An array of thread specific information for each worker thread
-    cl_uint     threadCount;                        // Number of worker threads
-    cl_uint     jobCount;                           // Number of jobs
-    cl_uint     step;                               // step between each chunk and the next.
-    cl_uint     scale;                              // stride between individual test values
-    float       ulps;                               // max_allowed ulps
-    int         ftz;                                // non-zero if running in flush to zero mode
+    size_t subBufferSize; // Elements in each per-thread sub-buffer
+    const Func *f; // The function under test (name, ulps, ftz flag, ...)
+    cl_program programs[VECTOR_SIZE_COUNT]; // one program per vector size
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // per vector size, an array of threadCount
+                               // kernels:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // array of threadCount entries, one per worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs passed to ThreadPool_Do
+    cl_uint step; // subBufferSize * scale: spacing between consecutive jobs
+    cl_uint scale; // stride between individual test values
+    float ulps; // max allowed ulps (embedded or full profile value)
+    int ftz; // non-zero if running in flush to zero mode
 
     // no special values
-}TestInfo;
+} TestInfo;
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p );
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
-    float       maxError = 0.0f;
-    double      maxErrorVal = 0.0;
-    cl_int      maxErrorVal2 = 0;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    cl_int maxErrorVal2 = 0;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
 
     if (gWimpyMode)
     {
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_float)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -286,59 +428,82 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
 
     test_info.f = f;
     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
-    test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
 
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
-        cl_buffer_region region2 = { i * test_info.subBufferSize * sizeof( cl_int), test_info.subBufferSize * sizeof( cl_int) };
-        test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region2 = { i * test_info.subBufferSize
+                                         * sizeof(cl_int),
+                                     test_info.subBufferSize * sizeof(cl_int) };
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            // inBuf2 creation failed: report the region2 request, not region.
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zu, %zu}\n", region2.origin, region2.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zu, %zu}\n",
+                           region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
         test_info.tinfo[i].d = init_genrand(genrand_int32(d));
@@ -350,18 +515,20 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
     // Run the kernels
-    error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info );
+    error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
 
 
     // Accumulate the arithmetic errors
-    for( i = 0; i < test_info.threadCount; i++ )
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        if( test_info.tinfo[i].maxError > maxError )
+        if (test_info.tinfo[i].maxError > maxError)
         {
             maxError = test_info.tinfo[i].maxError;
             maxErrorVal = test_info.tinfo[i].maxErrorValue;
@@ -369,331 +536,377 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
         }
     }
 
-    if( error )
-        goto exit;
+    if (error) goto exit;
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input arrays
+        // Init input arrays
         uint32_t *p = (uint32_t *)gIn;
         uint32_t *p2 = (uint32_t *)gIn2;
-        for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ )
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             p[j] = (genrand_int32(d) & ~0x40000000) | 0x38000000;
             p2[j] = 3;
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;    // BUFFER_SIZE / vectorSize  rounded up
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
+    vlog("\n");
 
 
 exit:
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
             free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
 
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data  )
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t      buffer_elements = job->subBufferSize;
-    size_t      buffer_size = buffer_elements * sizeof( cl_float );
-    cl_uint     base = job_id * (cl_uint) job->step;
-    ThreadInfo  *tinfo = job->tinfo + thread_id;
-    float       ulps = job->ulps;
-    fptr        func = job->f->func;
-    int         ftz = job->ftz;
-    MTdata      d = tinfo->d;
-    cl_uint     j, k;
-    cl_int      error;
-    const char  *name = job->f->name;
-    cl_uint     *t;
-    cl_float    *r,*s;
-    cl_int      *s2;
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_float);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    float ulps = job->ulps;
+    fptr func = job->f->func;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+    cl_uint *t;
+    cl_float *r, *s;
+    cl_int *s2;
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_uint  *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_uint *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (uint32_t *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
-    //Init input array
+    // Init input array
     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
     j = 0;
-    int totalSpecialValueCount = specialValuesFloatCount * specialValuesIntCount;
+    int totalSpecialValueCount =
+        specialValuesFloatCount * specialValuesIntCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
-    if( job_id <= (cl_uint)indx )
+    if (job_id <= (cl_uint)indx)
     { // test edge cases
         float *fp = (float *)p;
         cl_int *ip2 = (cl_int *)p2;
         uint32_t x, y;
 
-    x = (job_id * buffer_elements) % specialValuesFloatCount;
-    y = (job_id * buffer_elements) / specialValuesFloatCount;
+        x = (job_id * buffer_elements) % specialValuesFloatCount;
+        y = (job_id * buffer_elements) / specialValuesFloatCount;
 
-        for( ; j < buffer_elements; j++ )
+        for (; j < buffer_elements; j++)
         {
-          fp[j] = specialValuesFloat[x];
-          ip2[j] = specialValuesInt[y];
-            if( ++x >= specialValuesFloatCount )
+            fp[j] = specialValuesFloat[x];
+            ip2[j] = specialValuesInt[y];
+            if (++x >= specialValuesFloatCount)
             {
                 x = 0;
                 y++;
-                if( y >= specialValuesIntCount )
-                    break;
+                if (y >= specialValuesIntCount) break;
             }
         }
     }
 
-    //Init any remaining values.
-    for( ; j < buffer_elements; j++ )
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
     {
-      p[j] = genrand_int32(d);
-      p2[j] = genrand_int32(d);
+        p[j] = genrand_int32(d);
+        p2[j] = genrand_int32(d);
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             goto exit;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             goto exit;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             goto exit;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; }
-
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
             goto exit;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
-        return CL_SUCCESS;
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
 
-    //Calculate the correctly rounded reference result
-    r = (float *)gOut_Ref  + thread_id * buffer_elements;
-    s = (float *)gIn  + thread_id * buffer_elements;
-    s2 = (cl_int *)gIn2  + thread_id * buffer_elements;
-    for( j = 0; j < buffer_elements; j++ )
-        r[j] = (float) func.f_fi( s[j], s2[j] );
+    // Calculate the correctly rounded reference result
+    r = (float *)gOut_Ref + thread_id * buffer_elements;
+    s = (float *)gIn + thread_id * buffer_elements;
+    s2 = (cl_int *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++) r[j] = (float)func.f_fi(s[j], s2[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (uint32_t *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             goto exit;
         }
     }
 
     // Wait for the last buffer
-    out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (uint32_t *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         goto exit;
     }
 
-    //Verify data
+    // Verify data
     t = (cl_uint *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
-        for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_uint *q = out[k];
 
             // If we aren't getting the correctly rounded result
-            if( t[j] != q[j] )
+            if (t[j] != q[j])
             {
-                float test = ((float*) q)[j];
-                double correct = func.f_fi( s[j], s2[j] );
-                float err = Ulp_Error( test, correct );
-                int fail = ! (fabsf(err) <= ulps);
+                float test = ((float *)q)[j];
+                double correct = func.f_fi(s[j], s2[j]);
+                float err = Ulp_Error(test, correct);
+                int fail = !(fabsf(err) <= ulps);
 
-                if( fail && ftz )
+                if (fail && ftz)
                 {
                     // retry per section 6.5.3.2
-                    if( IsFloatResultSubnormal(correct, ulps ) )
+                    if (IsFloatResultSubnormal(correct, ulps))
                     {
-                        fail = fail && ( test != 0.0f );
-                        if( ! fail )
-                            err = 0.0f;
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
                     }
 
                     // retry per section 6.5.3.3
-                    if( IsFloatSubnormal( s[j] ) )
+                    if (IsFloatSubnormal(s[j]))
                     {
                         double correct2, correct3;
                         float err2, err3;
-                        correct2 = func.f_fi( 0.0, s2[j] );
-                        correct3 = func.f_fi( -0.0, s2[j] );
-                        err2 = Ulp_Error( test, correct2  );
-                        err3 = Ulp_Error( test, correct3  );
-                        fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                        if( fabsf( err2 ) < fabsf(err ) )
-                            err = err2;
-                        if( fabsf( err3 ) < fabsf(err ) )
-                            err = err3;
+                        correct2 = func.f_fi(0.0, s2[j]);
+                        correct3 = func.f_fi(-0.0, s2[j]);
+                        err2 = Ulp_Error(test, correct2);
+                        err3 = Ulp_Error(test, correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
 
                         // retry per section 6.5.3.4
-                        if( IsFloatResultSubnormal( correct2, ulps ) || IsFloatResultSubnormal( correct3, ulps ) )
+                        if (IsFloatResultSubnormal(correct2, ulps)
+                            || IsFloatResultSubnormal(correct3, ulps))
                         {
-                            fail = fail && ( test != 0.0f);
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
                         }
                     }
                 }
 
-                if( fabsf(err ) > tinfo->maxError )
+                if (fabsf(err) > tinfo->maxError)
                 {
                     tinfo->maxError = fabsf(err);
                     tinfo->maxErrorValue = s[j];
                     tinfo->maxErrorValue2 = s2[j];
                 }
-                if( fail )
+                if (fail)
                 {
                     vlog_error(
                         "\nERROR: %s%s: %f ulp error at {%a (0x%8.8x), %d}: "
@@ -708,89 +921,191 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data  )
         }
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
         if (gVerboseBruteForce)
         {
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f ThreadCount:%2u\n", base, job->step,  job->scale, buffer_elements, job->ulps, job->threadCount);
-        } else
+            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10zu "
+                 "ulps:%5.3f ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
         {
-            vlog("." );
+            vlog(".");
         }
         fflush(stdout);
     }
 
 exit:
     return error;
-
 }
 
 
-
 // A table of more difficult cases to get right
 static const double specialValuesDouble[] = {
-    -NAN, -INFINITY, -DBL_MAX, MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),  MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), -1000., -100.,  -4.0, -3.5,
-    -3.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), -2.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), -2.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), -1.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), -1.0, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), -0.5, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),  MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), -0.25, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), -DBL_MIN, MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), -0.0,
+    -NAN,
+    -INFINITY,
+    -DBL_MAX,
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
+    -1000.,
+    -100.,
+    -4.0,
+    -3.5,
+    -3.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
+    -2.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
+    -2.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
+    -1.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
+    -1.0,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
+    -0.5,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
+    -0.25,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
+    -DBL_MIN,
+    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
+    -0.0,
 
-    +NAN, +INFINITY, +DBL_MAX, MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),  MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8), MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21), MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22), +1000., +100.,  +4.0, +3.5,
-    +3.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), +2.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), +2.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), +1.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), +1.0, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53), +0.5, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),  MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), +0.25, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), +DBL_MIN, MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), +0.0,
+    +NAN,
+    +INFINITY,
+    +DBL_MAX,
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
+    +1000.,
+    +100.,
+    +4.0,
+    +3.5,
+    +3.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
+    +2.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
+    +2.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
+    +1.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p0, +0x10000000000001LL, -52),
+    +1.0,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
+    +0.5,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
+    +0.25,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
+    +DBL_MIN,
+    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
+    +0.0,
 };
-static size_t specialValuesDoubleCount = sizeof( specialValuesDouble ) / sizeof( specialValuesDouble[0] );
+static size_t specialValuesDoubleCount =
+    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
 
-static const int specialValuesInt2[] = { 0, 1, 2, 3, 1022, 1023, 1024, INT_MIN, INT_MAX,
-                                            -1, -2, -3, -1022, -1023, -11024, -INT_MAX };
-static size_t specialValuesInt2Count = sizeof( specialValuesInt ) / sizeof( specialValuesInt[0] );
+static const int specialValuesInt2[] = { 0,       1,     2,      3,
+                                         1022,    1023,  1024,   INT_MIN,
+                                         INT_MAX, -1,    -2,     -3,
+                                         -1022,   -1023, -11024, -INT_MAX };
+static size_t specialValuesInt2Count =
+    sizeof(specialValuesInt2) / sizeof(specialValuesInt2[0]);
 
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *p );
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
-    float       maxError = 0.0f;
-    double      maxErrorVal = 0.0;
-    cl_int      maxErrorVal2 = 0;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    cl_int maxErrorVal2 = 0;
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
     if (gWimpyMode)
     {
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_double)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -802,59 +1117,82 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
     test_info.ulps = f->double_ulps;
     test_info.ftz = f->ftz || gForceFTZ;
 
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
-        cl_buffer_region region2 = { i * test_info.subBufferSize * sizeof( cl_int), test_info.subBufferSize * sizeof( cl_int) };
-        test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region2 = { i * test_info.subBufferSize
+                                         * sizeof(cl_int),
+                                     test_info.subBufferSize * sizeof(cl_int) };
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region2.origin, region2.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            /* Qualcomm fix: 9461 read-write flags must be compatible with parent buffer */
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+            /* Qualcomm fix: 9461 read-write flags must be compatible with
+             * parent buffer */
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
             /* Qualcomm fix: end */
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
 
@@ -868,19 +1206,21 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
     // Run the kernels
-    if( !gSkipCorrectnessTesting )
-        error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info );
+    if (!gSkipCorrectnessTesting)
+        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
 
 
     // Accumulate the arithmetic errors
-    for( i = 0; i < test_info.threadCount; i++ )
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        if( test_info.tinfo[i].maxError > maxError )
+        if (test_info.tinfo[i].maxError > maxError)
         {
             maxError = test_info.tinfo[i].maxError;
             maxErrorVal = test_info.tinfo[i].maxErrorValue;
@@ -888,334 +1228,386 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
         }
     }
 
-    if( error )
-        goto exit;
+    if (error) goto exit;
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input arrays
+        // Init input arrays
         double *p = (double *)gIn;
         cl_int *p2 = (cl_int *)gIn2;
-        for( j = 0; j < BUFFER_SIZE / sizeof( cl_double ); j++ )
+        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
         {
             p[j] = DoubleFromUInt32(genrand_int32(d));
             p2[j] = 3;
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE/2, gIn2, 0, NULL, NULL) ))
+        if ((error =
+                 clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                      BUFFER_SIZE / 2, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_double ) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;    // BUFFER_SIZE / vectorSize  rounded up
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
+    vlog("\n");
 
 
 exit:
     // Release
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
             free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
 
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t      buffer_elements = job->subBufferSize;
-    size_t      buffer_size = buffer_elements * sizeof( cl_double );
-    cl_uint     base = job_id * (cl_uint) job->step;
-    ThreadInfo  *tinfo = job->tinfo + thread_id;
-    float       ulps = job->ulps;
-    dptr        func = job->f->dfunc;
-    int         ftz = job->ftz;
-    MTdata      d = tinfo->d;
-    cl_uint     j, k;
-    cl_int      error;
-    const char  *name = job->f->name;
-    cl_ulong    *t;
-    cl_double   *r,*s;
-    cl_int      *s2;
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_double);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    float ulps = job->ulps;
+    dptr func = job->f->dfunc;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+    cl_ulong *t;
+    cl_double *r, *s;
+    cl_int *s2;
 
     Force64BitFPUPrecision();
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_ulong  *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_ulong *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
-    //Init input array
+    // Init input array
     cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
     cl_int *p2 = (cl_int *)gIn2 + thread_id * buffer_elements;
     j = 0;
-    int totalSpecialValueCount = specialValuesDoubleCount * specialValuesInt2Count;
+    int totalSpecialValueCount =
+        specialValuesDoubleCount * specialValuesInt2Count;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
-    if( job_id <= (cl_uint)indx )
+    if (job_id <= (cl_uint)indx)
     { // test edge cases
         cl_double *fp = (cl_double *)p;
         cl_int *ip2 = (cl_int *)p2;
         uint32_t x, y;
 
-    x = (job_id * buffer_elements) % specialValuesDoubleCount;
-    y = (job_id * buffer_elements) / specialValuesDoubleCount;
+        x = (job_id * buffer_elements) % specialValuesDoubleCount;
+        y = (job_id * buffer_elements) / specialValuesDoubleCount;
 
-        for( ; j < buffer_elements; j++ )
+        for (; j < buffer_elements; j++)
         {
             fp[j] = specialValuesDouble[x];
             ip2[j] = specialValuesInt2[y];
-            if( ++x >= specialValuesDoubleCount )
+            if (++x >= specialValuesDoubleCount)
             {
                 x = 0;
                 y++;
-                if( y >= specialValuesInt2Count )
-                    break;
+                if (y >= specialValuesInt2Count) break;
             }
         }
     }
 
-    //Init any remaining values.
-    for( ; j < buffer_elements; j++ )
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
     {
         p[j] = DoubleFromUInt32(genrand_int32(d));
         p2[j] = genrand_int32(d);
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size/2, p2, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size / 2, p2, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             goto exit;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             goto exit;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueUnmapMemObject failed! (%d)\n", error);
             goto exit;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; }
-
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
             goto exit;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
-        return CL_SUCCESS;
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
 
-    //Calculate the correctly rounded reference result
-    r = (cl_double *)gOut_Ref  + thread_id * buffer_elements;
-    s = (cl_double *)gIn  + thread_id * buffer_elements;
-    s2 = (cl_int *)gIn2  + thread_id * buffer_elements;
-    for( j = 0; j < buffer_elements; j++ )
-        r[j] = (cl_double) func.f_fi( s[j], s2[j] );
+    // Calculate the correctly rounded reference result
+    r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
+    s = (cl_double *)gIn + thread_id * buffer_elements;
+    s2 = (cl_int *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+        r[j] = (cl_double)func.f_fi(s[j], s2[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in-order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             goto exit;
         }
     }
 
     // Wait for the last buffer
-    out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         goto exit;
     }
 
-    //Verify data
+    // Verify data
     t = (cl_ulong *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
-        for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_ulong *q = out[k];
 
             // If we aren't getting the correctly rounded result
-            if( t[j] != q[j] )
+            if (t[j] != q[j])
             {
-                cl_double test = ((cl_double*) q)[j];
-                long double correct = func.f_fi( s[j], s2[j] );
-                float err = Bruteforce_Ulp_Error_Double( test, correct );
-                int fail = ! (fabsf(err) <= ulps);
+                cl_double test = ((cl_double *)q)[j];
+                long double correct = func.f_fi(s[j], s2[j]);
+                float err = Bruteforce_Ulp_Error_Double(test, correct);
+                int fail = !(fabsf(err) <= ulps);
 
-                if( fail && ftz )
+                if (fail && ftz)
                 {
                     // retry per section 6.5.3.2
-                    if( IsDoubleResultSubnormal(correct, ulps ) )
+                    if (IsDoubleResultSubnormal(correct, ulps))
                     {
-                        fail = fail && ( test != 0.0f );
-                        if( ! fail )
-                            err = 0.0f;
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
                     }
 
                     // retry per section 6.5.3.3
-                    if( IsDoubleSubnormal( s[j] ) )
+                    if (IsDoubleSubnormal(s[j]))
                     {
-                        long double correct2 = func.f_fi( 0.0, s2[j] );
-                        long double correct3 = func.f_fi( -0.0, s2[j] );
-                        float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                        float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                        fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                        if( fabsf( err2 ) < fabsf(err ) )
-                            err = err2;
-                        if( fabsf( err3 ) < fabsf(err ) )
-                            err = err3;
+                        long double correct2 = func.f_fi(0.0, s2[j]);
+                        long double correct3 = func.f_fi(-0.0, s2[j]);
+                        float err2 =
+                            Bruteforce_Ulp_Error_Double(test, correct2);
+                        float err3 =
+                            Bruteforce_Ulp_Error_Double(test, correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
 
                         // retry per section 6.5.3.4
-                        if( IsDoubleResultSubnormal( correct2, ulps ) || IsDoubleResultSubnormal( correct3, ulps ) )
+                        if (IsDoubleResultSubnormal(correct2, ulps)
+                            || IsDoubleResultSubnormal(correct3, ulps))
                         {
-                            fail = fail && ( test != 0.0f);
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
                         }
                     }
                 }
 
-                if( fabsf(err ) > tinfo->maxError )
+                if (fabsf(err) > tinfo->maxError)
                 {
                     tinfo->maxError = fabsf(err);
                     tinfo->maxErrorValue = s[j];
                     tinfo->maxErrorValue2 = s2[j];
                 }
-                if( fail )
+                if (fail)
                 {
-                    vlog_error( "\nERROR: %s%s: %f ulp error at {%.13la, %d}: *%.13la vs. %.13la\n", name, sizeNames[k], err, s[j], s2[j], r[j], test );
+                    vlog_error("\nERROR: %s%s: %f ulp error at {%.13la, %d}: "
+                               "*%.13la vs. %.13la\n",
+                               name, sizeNames[k], err, s[j], s2[j], r[j],
+                               test);
                     error = -1;
                     goto exit;
                 }
@@ -1223,35 +1615,36 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
         }
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
         if (gVerboseBruteForce)
         {
-           vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->ulps, job->threadCount);
-        } else
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
         {
-          vlog("." );
-       }
-       fflush(stdout);
+            vlog(".");
+        }
+        fflush(stdout);
     }
 
 exit:
     return error;
-
 }
-
-
-
diff --git a/test_conformance/math_brute_force/binary_two_results_i.cpp b/test_conformance/math_brute_force/binary_two_results_i.cpp
index af1b04d1..5065b280 100644
--- a/test_conformance/math_brute_force/binary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -36,68 +36,90 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
-    const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global int", sizeNames[vectorSize], "* out2, __global float", sizeNames[vectorSize], "* in1, __global float", sizeNames[vectorSize], "* in2)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in1[i], in2[i], out2 + i );\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global int",
+                        sizeNames[vectorSize],
+                        "* out2, __global float",
+                        sizeNames[vectorSize],
+                        "* in1, __global float",
+                        sizeNames[vectorSize],
+                        "* in2)\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i], out2 + i );\n"
+                        "}\n" };
 
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global int* out2, __global float* in, __global float* in2)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       float3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
-                            "       int3 i0 = 0xdeaddead;\n"
-                            "       f0 = ", name, "( f0, f1, &i0 );\n"
-                            "       vstore3( f0, 0, out + 3*i );\n"
-                            "       vstore3( i0, 0, out2 + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       float3 f0, f1;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-                            "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       int3 i0 = 0xdeaddead;\n"
-                            "       f0 = ", name, "( f0, f1, &i0 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = f0.y; \n"
-                            "               out2[3*i+1] = i0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = f0.x; \n"
-                            "               out2[3*i] = i0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global int* out2, __global float* in, "
+        "__global float* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
+        "       int3 i0 = 0xdeaddead;\n"
+        "       f0 = ",
+        name,
+        "( f0, f1, &i0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "       vstore3( i0, 0, out2 + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0, f1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       int3 i0 = 0xdeaddead;\n"
+        "       f0 = ",
+        name,
+        "( f0, f1, &i0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               out2[3*i+1] = i0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               out2[3*i] = i0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
@@ -106,95 +128,121 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global int", sizeNames[vectorSize], "* out2, __global double", sizeNames[vectorSize], "* in1, __global double", sizeNames[vectorSize], "* in2)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in1[i], in2[i], out2 + i );\n"
-                            "}\n"
-                        };
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global int",
+                        sizeNames[vectorSize],
+                        "* out2, __global double",
+                        sizeNames[vectorSize],
+                        "* in1, __global double",
+                        sizeNames[vectorSize],
+                        "* in2)\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i], out2 + i );\n"
+                        "}\n" };
 
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global int* out2, __global double* in, __global double* in2)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       double3 d0 = vload3( 0, in + 3 * i );\n"
-                            "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
-                            "       int3 i0 = 0xdeaddead;\n"
-                            "       d0 = ", name, "( d0, d1, &i0 );\n"
-                            "       vstore3( d0, 0, out + 3*i );\n"
-                            "       vstore3( i0, 0, out2 + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       double3 d0, d1;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-                            "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       int3 i0 = 0xdeaddead;\n"
-                            "       d0 = ", name, "( d0, d1, &i0 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = d0.y; \n"
-                            "               out2[3*i+1] = i0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = d0.x; \n"
-                            "               out2[3*i] = i0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global int* out2, __global double* in, "
+        "__global double* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
+        "       int3 i0 = 0xdeaddead;\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, &i0 );\n"
+        "       vstore3( d0, 0, out + 3*i );\n"
+        "       vstore3( i0, 0, out2 + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0, d1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       int3 i0 = 0xdeaddead;\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, &i0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = d0.y; \n"
+        "               out2[3*i+1] = i0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = d0.x; \n"
+        "               out2[3*i] = i0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_kernel   *kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels;
+    cl_program *programs;
+    const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernels + i,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
                              info->programs + i, info->relaxedMode);
@@ -207,7 +255,7 @@ typedef struct ComputeReferenceInfoF_
     const float *y;
     float *r;
     int *i;
-    double (*f_ffpI)(double, double, int*);
+    double (*f_ffpI)(double, double, int *);
     cl_uint lim;
     cl_uint count;
 } ComputeReferenceInfoF;
@@ -218,13 +266,12 @@ typedef struct ComputeReferenceInfoD_
     const double *y;
     double *r;
     int *i;
-    long double (*f_ffpI)(long double, long double, int*);
+    long double (*f_ffpI)(long double, long double, int *);
     cl_uint lim;
     cl_uint count;
 } ComputeReferenceInfoD;
 
-static cl_int
-ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
+static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
 {
     ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
     cl_uint lim = cri->lim;
@@ -237,17 +284,15 @@ ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
     double (*f)(double, double, int *) = cri->f_ffpI;
     cl_uint j;
 
-    if (off + count > lim)
-    count = lim - off;
+    if (off + count > lim) count = lim - off;
 
     for (j = 0; j < count; ++j)
-    r[j] = (float)f((double)x[j], (double)y[j], i + j);
+        r[j] = (float)f((double)x[j], (double)y[j], i + j);
 
     return CL_SUCCESS;
 }
 
-static cl_int
-ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
+static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
 {
     ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo;
     cl_uint lim = cri->lim;
@@ -260,13 +305,12 @@ ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
     long double (*f)(long double, long double, int *) = cri->f_ffpI;
     cl_uint j;
 
-    if (off + count > lim)
-    count = lim - off;
+    if (off + count > lim) count = lim - off;
 
     Force64BitFPUPrecision();
 
     for (j = 0; j < count; ++j)
-    r[j] = (double)f((long double)x[j], (long double)y[j], i + j);
+        r[j] = (double)f((long double)x[j], (long double)y[j], i + j);
 
     return CL_SUCCESS;
 }
@@ -278,15 +322,15 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     float float_ulps;
     int64_t maxError2 = 0;
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     float maxErrorVal = 0.0f;
     float maxErrorVal2 = 0.0f;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(float), bufferSize);
 
 #if defined PARALLEL_REFERENCE
@@ -294,7 +338,7 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 #endif
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
-    if( gIsEmbedded )
+    if (gIsEmbedded)
         float_ulps = f->float_embedded_ulps;
     else
         float_ulps = f->float_ulps;
@@ -305,392 +349,480 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     {
         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                        f->nameInCode, relaxedMode };
-        if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             return error;
     }
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i,
+       programs + i) ) ) return error;
+    */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         cl_uint *p = (cl_uint *)gIn;
         cl_uint *p2 = (cl_uint *)gIn2;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
             p2[j] = genrand_int32(d);
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
 
             memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
+                                          bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
-
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
         // Calculate the correctly rounded reference result
         float *s = (float *)gIn;
         float *s2 = (float *)gIn2;
 
 #if defined PARALLEL_REFERENCE
-    if (threadCount > 1) {
-        ComputeReferenceInfoF cri;
-        cri.x = s;
-        cri.y = s2;
-        cri.r = (float *)gOut_Ref;
-        cri.i = (int *)gOut_Ref2;
-        cri.f_ffpI = f->func.f_ffpI;
-        cri.lim = bufferSize / sizeof( float );
-        cri.count = (cri.lim + threadCount - 1) / threadCount;
-        ThreadPool_Do(ReferenceF, threadCount, &cri);
-    } else {
+        if (threadCount > 1)
+        {
+            ComputeReferenceInfoF cri;
+            cri.x = s;
+            cri.y = s2;
+            cri.r = (float *)gOut_Ref;
+            cri.i = (int *)gOut_Ref2;
+            cri.f_ffpI = f->func.f_ffpI;
+            cri.lim = bufferSize / sizeof(float);
+            cri.count = (cri.lim + threadCount - 1) / threadCount;
+            ThreadPool_Do(ReferenceF, threadCount, &cri);
+        }
+        else
+        {
 #endif
             float *r = (float *)gOut_Ref;
             int *r2 = (int *)gOut_Ref2;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
-                r[j] = (float) f->func.f_ffpI( s[j], s2[j], r2+j );
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                r[j] = (float)f->func.f_ffpI(s[j], s2[j], r2 + j);
 #if defined PARALLEL_REFERENCE
-    }
+        }
 #endif
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray2 failed %d\n", error );
+                vlog_error("ReadArray2 failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
-        //Verify data
+        // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
         int32_t *t2 = (int32_t *)gOut_Ref2;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint32_t *q = (uint32_t *)gOut[k];
                 int32_t *q2 = (int32_t *)gOut2[k];
 
                 // Check for exact match to correctly rounded result
-        if (t[j] == q[j] && t2[j] == q2[j])
-            continue;
+                if (t[j] == q[j] && t2[j] == q2[j]) continue;
 
-        // Check for paired NaNs
-        if ((t[j] & 0x7fffffff) > 0x7f800000 && (q[j] & 0x7fffffff) > 0x7f800000 && t2[j] == q2[j])
-            continue;
+                // Check for paired NaNs
+                if ((t[j] & 0x7fffffff) > 0x7f800000
+                    && (q[j] & 0x7fffffff) > 0x7f800000 && t2[j] == q2[j])
+                    continue;
 
                 // if( t[j] != q[j] || t2[j] != q2[j] )
                 {
-                    float test = ((float*) q)[j];
+                    float test = ((float *)q)[j];
                     int correct2 = INT_MIN;
-                    double correct = f->func.f_ffpI( s[j], s2[j], &correct2 );
-                    float err = Ulp_Error( test, correct );
+                    double correct = f->func.f_ffpI(s[j], s2[j], &correct2);
+                    float err = Ulp_Error(test, correct);
                     int64_t iErr;
 
-                    // in case of remquo, we only care about the sign and last seven bits of
-                    // integer as per the spec.
-                    if(testingRemquo)
-                        iErr = (long long) (q2[j] & 0x0000007f) - (long long) (correct2 & 0x0000007f);
+                    // in case of remquo, we only care about the sign and last
+                    // seven bits of integer as per the spec.
+                    if (testingRemquo)
+                        iErr = (long long)(q2[j] & 0x0000007f)
+                            - (long long)(correct2 & 0x0000007f);
                     else
-                        iErr = (long long) q2[j] - (long long) correct2;
+                        iErr = (long long)q2[j] - (long long)correct2;
 
-                    //For remquo, if y = 0, x is infinite, or either is NaN then the standard either neglects
-                    //to say what is returned in iptr or leaves it undefined or implementation defined.
-                    int iptrUndefined = fabs(((float*) gIn)[j]) == INFINITY ||
-                                        ((float*) gIn2)[j] == 0.0f          ||
-                                        isnan(((float*) gIn2)[j])           ||
-                                        isnan(((float*) gIn)[j]);
-                    if(iptrUndefined)
-                         iErr = 0;
+                    // For remquo, if y = 0, x is infinite, or either is NaN
+                    // then the standard either neglects to say what is returned
+                    // in iptr or leaves it undefined or implementation defined.
+                    int iptrUndefined = fabs(((float *)gIn)[j]) == INFINITY
+                        || ((float *)gIn2)[j] == 0.0f
+                        || isnan(((float *)gIn2)[j])
+                        || isnan(((float *)gIn)[j]);
+                    if (iptrUndefined) iErr = 0;
 
-                    int fail = ! (fabsf(err) <= float_ulps && iErr == 0 );
-                    if( ftz && fail )
+                    int fail = !(fabsf(err) <= float_ulps && iErr == 0);
+                    if (ftz && fail)
                     {
                         // retry per section 6.5.3.2
-                        if( IsFloatResultSubnormal(correct, float_ulps ) )
+                        if (IsFloatResultSubnormal(correct, float_ulps))
                         {
-                            fail = fail && ! ( test == 0.0f && iErr == 0 );
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && !(test == 0.0f && iErr == 0);
+                            if (!fail) err = 0.0f;
                         }
 
                         // retry per section 6.5.3.3
-                        if( IsFloatSubnormal( s[j] ) )
+                        if (IsFloatSubnormal(s[j]))
                         {
                             int correct3i, correct4i;
-                            double correct3 = f->func.f_ffpI( 0.0, s2[j], &correct3i );
-                            double correct4 = f->func.f_ffpI( -0.0, s2[j], &correct4i );
-                            float err2 = Ulp_Error( test, correct3  );
-                            float err3 = Ulp_Error( test, correct4  );
-                            int64_t iErr3 = (long long) q2[j] - (long long) correct3i;
-                            int64_t iErr4 = (long long) q2[j] - (long long) correct4i;
-                            fail =  fail && ((!(fabsf(err2) <= float_ulps && iErr3 == 0)) && (!(fabsf(err3) <= float_ulps && iErr4 == 0)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-                            if( llabs(iErr3) < llabs( iErr ) )
-                                iErr = iErr3;
-                            if( llabs(iErr4) < llabs( iErr ) )
-                                iErr = iErr4;
+                            double correct3 =
+                                f->func.f_ffpI(0.0, s2[j], &correct3i);
+                            double correct4 =
+                                f->func.f_ffpI(-0.0, s2[j], &correct4i);
+                            float err2 = Ulp_Error(test, correct3);
+                            float err3 = Ulp_Error(test, correct4);
+                            int64_t iErr3 =
+                                (long long)q2[j] - (long long)correct3i;
+                            int64_t iErr4 =
+                                (long long)q2[j] - (long long)correct4i;
+                            fail = fail
+                                && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
+                                    && (!(fabsf(err3) <= float_ulps
+                                          && iErr4 == 0)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                            if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
 
                             // retry per section 6.5.3.4
-                            if( IsFloatResultSubnormal(correct2, float_ulps ) || IsFloatResultSubnormal(correct3, float_ulps ) )
+                            if (IsFloatResultSubnormal(correct2, float_ulps)
+                                || IsFloatResultSubnormal(correct3, float_ulps))
                             {
-                                fail = fail && ! ( test == 0.0f && (iErr3 == 0 || iErr4 == 0) );
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && (iErr3 == 0 || iErr4 == 0));
+                                if (!fail) err = 0.0f;
                             }
 
-                            //try with both args as zero
-                            if( IsFloatSubnormal( s2[j] ) )
+                            // try with both args as zero
+                            if (IsFloatSubnormal(s2[j]))
                             {
                                 int correct7i, correct8i;
-                                correct3 = f->func.f_ffpI( 0.0, 0.0, &correct3i );
-                                correct4 = f->func.f_ffpI( -0.0, 0.0, &correct4i );
-                                double correct7 = f->func.f_ffpI( 0.0, -0.0, &correct7i );
-                                double correct8 = f->func.f_ffpI( -0.0, -0.0, &correct8i );
-                                err2 = Ulp_Error( test, correct3  );
-                                err3 = Ulp_Error( test, correct4  );
-                                float err4 = Ulp_Error( test, correct7  );
-                                float err5 = Ulp_Error( test, correct8  );
-                                iErr3 = (long long) q2[j] - (long long) correct3i;
-                                iErr4 = (long long) q2[j] - (long long) correct4i;
-                                int64_t iErr7 = (long long) q2[j] - (long long) correct7i;
-                                int64_t iErr8 = (long long) q2[j] - (long long) correct8i;
-                                fail =  fail && ((!(fabsf(err2) <= float_ulps && iErr3 == 0)) && (!(fabsf(err3) <= float_ulps  && iErr4 == 0)) &&
-                                                 (!(fabsf(err4) <= float_ulps  && iErr7 == 0)) && (!(fabsf(err5) <= float_ulps  && iErr8 == 0)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
-                                if( llabs(iErr3) < llabs( iErr ) )
-                                    iErr = iErr3;
-                                if( llabs(iErr4) < llabs( iErr ) )
-                                    iErr = iErr4;
-                                if( llabs(iErr7) < llabs( iErr ) )
-                                    iErr = iErr7;
-                                if( llabs(iErr8) < llabs( iErr ) )
-                                    iErr = iErr8;
+                                correct3 = f->func.f_ffpI(0.0, 0.0, &correct3i);
+                                correct4 =
+                                    f->func.f_ffpI(-0.0, 0.0, &correct4i);
+                                double correct7 =
+                                    f->func.f_ffpI(0.0, -0.0, &correct7i);
+                                double correct8 =
+                                    f->func.f_ffpI(-0.0, -0.0, &correct8i);
+                                err2 = Ulp_Error(test, correct3);
+                                err3 = Ulp_Error(test, correct4);
+                                float err4 = Ulp_Error(test, correct7);
+                                float err5 = Ulp_Error(test, correct8);
+                                iErr3 = (long long)q2[j] - (long long)correct3i;
+                                iErr4 = (long long)q2[j] - (long long)correct4i;
+                                int64_t iErr7 =
+                                    (long long)q2[j] - (long long)correct7i;
+                                int64_t iErr8 =
+                                    (long long)q2[j] - (long long)correct8i;
+                                fail = fail
+                                    && ((!(fabsf(err2) <= float_ulps
+                                           && iErr3 == 0))
+                                        && (!(fabsf(err3) <= float_ulps
+                                              && iErr4 == 0))
+                                        && (!(fabsf(err4) <= float_ulps
+                                              && iErr7 == 0))
+                                        && (!(fabsf(err5) <= float_ulps
+                                              && iErr8 == 0)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
+                                if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                                if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+                                if (llabs(iErr7) < llabs(iErr)) iErr = iErr7;
+                                if (llabs(iErr8) < llabs(iErr)) iErr = iErr8;
 
                                 // retry per section 6.5.3.4
-                                if( IsFloatResultSubnormal(correct3, float_ulps ) || IsFloatResultSubnormal(correct4, float_ulps )  ||
-                                    IsFloatResultSubnormal(correct7, float_ulps ) || IsFloatResultSubnormal(correct8, float_ulps ) )
+                                if (IsFloatResultSubnormal(correct3, float_ulps)
+                                    || IsFloatResultSubnormal(correct4,
+                                                              float_ulps)
+                                    || IsFloatResultSubnormal(correct7,
+                                                              float_ulps)
+                                    || IsFloatResultSubnormal(correct8,
+                                                              float_ulps))
                                 {
-                                    fail = fail && ! ( test == 0.0f && (iErr3 == 0 || iErr4 == 0 || iErr7 == 0 || iErr8 == 0));
-                                    if( ! fail )
-                                        err = 0.0f;
+                                    fail = fail
+                                        && !(test == 0.0f
+                                             && (iErr3 == 0 || iErr4 == 0
+                                                 || iErr7 == 0 || iErr8 == 0));
+                                    if (!fail) err = 0.0f;
                                 }
                             }
                         }
-                        else if( IsFloatSubnormal( s2[j] ) )
+                        else if (IsFloatSubnormal(s2[j]))
                         {
                             int correct3i, correct4i;
-                            double correct3 = f->func.f_ffpI( s[j], 0.0, &correct3i );
-                            double correct4 = f->func.f_ffpI( s[j], -0.0, &correct4i );
-                            float err2 = Ulp_Error( test, correct3  );
-                            float err3 = Ulp_Error( test, correct4  );
-                            int64_t iErr3 = (long long) q2[j] - (long long) correct3i;
-                            int64_t iErr4 = (long long) q2[j] - (long long) correct4i;
-                            fail =  fail && ((!(fabsf(err2) <= float_ulps && iErr3 == 0)) && (!(fabsf(err3) <= float_ulps && iErr4 == 0)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-                            if( llabs(iErr3) < llabs( iErr ) )
-                                iErr = iErr3;
-                            if( llabs(iErr4) < llabs( iErr ) )
-                                iErr = iErr4;
+                            double correct3 =
+                                f->func.f_ffpI(s[j], 0.0, &correct3i);
+                            double correct4 =
+                                f->func.f_ffpI(s[j], -0.0, &correct4i);
+                            float err2 = Ulp_Error(test, correct3);
+                            float err3 = Ulp_Error(test, correct4);
+                            int64_t iErr3 =
+                                (long long)q2[j] - (long long)correct3i;
+                            int64_t iErr4 =
+                                (long long)q2[j] - (long long)correct4i;
+                            fail = fail
+                                && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
+                                    && (!(fabsf(err3) <= float_ulps
+                                          && iErr4 == 0)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                            if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
 
                             // retry per section 6.5.3.4
-                            if( IsFloatResultSubnormal(correct2, float_ulps ) || IsFloatResultSubnormal(correct3, float_ulps ) )
+                            if (IsFloatResultSubnormal(correct2, float_ulps)
+                                || IsFloatResultSubnormal(correct3, float_ulps))
                             {
-                                fail = fail && ! ( test == 0.0f && (iErr3 == 0 || iErr4 == 0) );
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && (iErr3 == 0 || iErr4 == 0));
+                                if (!fail) err = 0.0f;
                             }
                         }
                     }
-                    if( fabsf(err ) > maxError )
+                    if (fabsf(err) > maxError)
                     {
                         maxError = fabsf(err);
                         maxErrorVal = s[j];
                     }
-                    if( llabs(iErr) > maxError2 )
+                    if (llabs(iErr) > maxError2)
                     {
-                        maxError2 = llabs(iErr );
+                        maxError2 = llabs(iErr);
                         maxErrorVal2 = s[j];
                     }
 
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\nERROR: %s%s: {%f, %lld} ulp error at {%a, %a} ({0x%8.8x, 0x%8.8x}): *{%a, %d} ({0x%8.8x, 0x%8.8x}) vs. {%a, %d} ({0x%8.8x, 0x%8.8x})\n",
-                                    f->name, sizeNames[k], err, iErr,
-                                   ((float*) gIn)[j], ((float*) gIn2)[j],
-                                   ((cl_uint*) gIn)[j], ((cl_uint*) gIn2)[j],
-                                   ((float*) gOut_Ref)[j], ((int*) gOut_Ref2)[j],
-                                   ((cl_uint*) gOut_Ref)[j], ((cl_uint*) gOut_Ref2)[j],
-                                   test, q2[j],
-                                   ((cl_uint*)&test)[0], ((cl_uint*) q2)[j] );
-                      error = -1;
-                      goto exit;
+                        vlog_error(
+                            "\nERROR: %s%s: {%f, %lld} ulp error at {%a, %a} "
+                            "({0x%8.8x, 0x%8.8x}): *{%a, %d} ({0x%8.8x, "
+                            "0x%8.8x}) vs. {%a, %d} ({0x%8.8x, 0x%8.8x})\n",
+                            f->name, sizeNames[k], err, iErr, ((float *)gIn)[j],
+                            ((float *)gIn2)[j], ((cl_uint *)gIn)[j],
+                            ((cl_uint *)gIn2)[j], ((float *)gOut_Ref)[j],
+                            ((int *)gOut_Ref2)[j], ((cl_uint *)gOut_Ref)[j],
+                            ((cl_uint *)gOut_Ref2)[j], test, q2[j],
+                            ((cl_uint *)&test)[0], ((cl_uint *)q2)[j]);
+                        error = -1;
+                        goto exit;
                     }
                 }
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-           } else
-           {
-              vlog("." );
-           }
-           fflush(stdout);
-
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
             p[j] = genrand_int32(d);
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -704,14 +836,14 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int64_t maxError2 = 0;
     int ftz = f->ftz || gForceFTZ;
     double maxErrorVal = 0.0f;
     double maxErrorVal2 = 0.0f;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(double), bufferSize);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
@@ -728,400 +860,504 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     {
         BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                        f->nameInCode, relaxedMode };
-        if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
-                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                    &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
         {
             return error;
         }
     }
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels +
+       i, programs + i) ) ) return error;
+    */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
         double *p2 = (double *)gIn2;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(double); j++)
         {
             p[j] = DoubleFromUInt32(genrand_int32(d));
             p2[j] = DoubleFromUInt32(genrand_int32(d));
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
 
             memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                          bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_double ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
-
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         double *s = (double *)gIn;
         double *s2 = (double *)gIn2;
 
 #if defined PARALLEL_REFERENCE
-    if (threadCount > 1) {
-        ComputeReferenceInfoD cri;
-        cri.x = s;
-        cri.y = s2;
-        cri.r = (double *)gOut_Ref;
-        cri.i = (int *)gOut_Ref2;
-        cri.f_ffpI = f->dfunc.f_ffpI;
-        cri.lim = bufferSize / sizeof( double );
-        cri.count = (cri.lim + threadCount - 1) / threadCount;
-        ThreadPool_Do(ReferenceD, threadCount, &cri);
-    } else {
+        if (threadCount > 1)
+        {
+            ComputeReferenceInfoD cri;
+            cri.x = s;
+            cri.y = s2;
+            cri.r = (double *)gOut_Ref;
+            cri.i = (int *)gOut_Ref2;
+            cri.f_ffpI = f->dfunc.f_ffpI;
+            cri.lim = bufferSize / sizeof(double);
+            cri.count = (cri.lim + threadCount - 1) / threadCount;
+            ThreadPool_Do(ReferenceD, threadCount, &cri);
+        }
+        else
+        {
 #endif
             double *r = (double *)gOut_Ref;
             int *r2 = (int *)gOut_Ref2;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
-                r[j] = (double) f->dfunc.f_ffpI( s[j], s2[j], r2+j );
+            for (j = 0; j < bufferSize / sizeof(double); j++)
+                r[j] = (double)f->dfunc.f_ffpI(s[j], s2[j], r2 + j);
 #if defined PARALLEL_REFERENCE
-    }
+        }
 #endif
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray2 failed %d\n", error );
+                vlog_error("ReadArray2 failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
-        //Verify data
+        // Verify data
         uint64_t *t = (uint64_t *)gOut_Ref;
         int32_t *t2 = (int32_t *)gOut_Ref2;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(double); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint64_t *q = (uint64_t *)gOut[k];
                 int32_t *q2 = (int32_t *)gOut2[k];
 
-        // Check for exact match to correctly rounded result
-        if (t[j] == q[j] && t2[j] == q2[j])
-            continue;
+                // Check for exact match to correctly rounded result
+                if (t[j] == q[j] && t2[j] == q2[j]) continue;
 
-        // Check for paired NaNs
-        if ((t[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL &&
-            (q[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL &&
-            t2[j] == q2[j])
-            continue;
+                // Check for paired NaNs
+                if ((t[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL
+                    && (q[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL
+                    && t2[j] == q2[j])
+                    continue;
 
                 // if( t[j] != q[j] || t2[j] != q2[j] )
                 {
-                    double test = ((double*) q)[j];
+                    double test = ((double *)q)[j];
                     int correct2 = INT_MIN;
-                    long double correct = f->dfunc.f_ffpI( s[j], s2[j], &correct2 );
-                    float err = Bruteforce_Ulp_Error_Double( test, correct );
+                    long double correct =
+                        f->dfunc.f_ffpI(s[j], s2[j], &correct2);
+                    float err = Bruteforce_Ulp_Error_Double(test, correct);
                     int64_t iErr;
 
-                    // in case of remquo, we only care about the sign and last seven bits of
-                    // integer as per the spec.
-                    if(testingRemquo)
-                        iErr = (long long) (q2[j] & 0x0000007f) - (long long) (correct2 & 0x0000007f);
+                    // in case of remquo, we only care about the sign and last
+                    // seven bits of integer as per the spec.
+                    if (testingRemquo)
+                        iErr = (long long)(q2[j] & 0x0000007f)
+                            - (long long)(correct2 & 0x0000007f);
                     else
-                        iErr = (long long) q2[j] - (long long) correct2;
+                        iErr = (long long)q2[j] - (long long)correct2;
 
-                    //For remquo, if y = 0, x is infinite, or either is NaN then the standard either neglects
-                    //to say what is returned in iptr or leaves it undefined or implementation defined.
-                    int iptrUndefined = fabs(((double*) gIn)[j]) == INFINITY ||
-                                        ((double*) gIn2)[j] == 0.0          ||
-                                        isnan(((double*) gIn2)[j])           ||
-                                        isnan(((double*) gIn)[j]);
-                    if(iptrUndefined)
-                         iErr = 0;
+                    // For remquo, if y = 0, x is infinite, or either is NaN
+                    // then the standard either neglects to say what is returned
+                    // in iptr or leaves it undefined or implementation defined.
+                    int iptrUndefined = fabs(((double *)gIn)[j]) == INFINITY
+                        || ((double *)gIn2)[j] == 0.0
+                        || isnan(((double *)gIn2)[j])
+                        || isnan(((double *)gIn)[j]);
+                    if (iptrUndefined) iErr = 0;
 
-                    int fail = ! (fabsf(err) <= f->double_ulps && iErr == 0 );
-                    if( ftz && fail )
+                    int fail = !(fabsf(err) <= f->double_ulps && iErr == 0);
+                    if (ftz && fail)
                     {
                         // retry per section 6.5.3.2
-                        if( IsDoubleResultSubnormal(correct, f->double_ulps ) )
+                        if (IsDoubleResultSubnormal(correct, f->double_ulps))
                         {
-                            fail = fail && ! ( test == 0.0f && iErr == 0 );
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && !(test == 0.0f && iErr == 0);
+                            if (!fail) err = 0.0f;
                         }
 
                         // retry per section 6.5.3.3
-                        if( IsDoubleSubnormal( s[j] ) )
+                        if (IsDoubleSubnormal(s[j]))
                         {
                             int correct3i, correct4i;
-                            long double correct3 = f->dfunc.f_ffpI( 0.0, s2[j], &correct3i );
-                            long double correct4 = f->dfunc.f_ffpI( -0.0, s2[j], &correct4i );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                            int64_t iErr3 = (long long) q2[j] - (long long) correct3i;
-                            int64_t iErr4 = (long long) q2[j] - (long long) correct4i;
-                            fail =  fail && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0)) && (!(fabsf(err3) <= f->double_ulps && iErr4 == 0)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-                            if( llabs(iErr3) < llabs( iErr ) )
-                                iErr = iErr3;
-                            if( llabs(iErr4) < llabs( iErr ) )
-                                iErr = iErr4;
+                            long double correct3 =
+                                f->dfunc.f_ffpI(0.0, s2[j], &correct3i);
+                            long double correct4 =
+                                f->dfunc.f_ffpI(-0.0, s2[j], &correct4i);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct4);
+                            int64_t iErr3 =
+                                (long long)q2[j] - (long long)correct3i;
+                            int64_t iErr4 =
+                                (long long)q2[j] - (long long)correct4i;
+                            fail = fail
+                                && ((!(fabsf(err2) <= f->double_ulps
+                                       && iErr3 == 0))
+                                    && (!(fabsf(err3) <= f->double_ulps
+                                          && iErr4 == 0)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                            if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
 
                             // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) )
+                            if (IsDoubleResultSubnormal(correct2,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correct3,
+                                                           f->double_ulps))
                             {
-                                fail = fail && ! ( test == 0.0f && (iErr3 == 0 || iErr4 == 0) );
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && (iErr3 == 0 || iErr4 == 0));
+                                if (!fail) err = 0.0f;
                             }
 
-                            //try with both args as zero
-                            if( IsDoubleSubnormal( s2[j] ) )
+                            // try with both args as zero
+                            if (IsDoubleSubnormal(s2[j]))
                             {
                                 int correct7i, correct8i;
-                                correct3 = f->dfunc.f_ffpI( 0.0, 0.0, &correct3i );
-                                correct4 = f->dfunc.f_ffpI( -0.0, 0.0, &correct4i );
-                                long double correct7 = f->dfunc.f_ffpI( 0.0, -0.0, &correct7i );
-                                long double correct8 = f->dfunc.f_ffpI( -0.0, -0.0, &correct8i );
-                                err2 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                                err3 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                                float err4 = Bruteforce_Ulp_Error_Double( test, correct7  );
-                                float err5 = Bruteforce_Ulp_Error_Double( test, correct8  );
-                                iErr3 = (long long) q2[j] - (long long) correct3i;
-                                iErr4 = (long long) q2[j] - (long long) correct4i;
-                                int64_t iErr7 = (long long) q2[j] - (long long) correct7i;
-                                int64_t iErr8 = (long long) q2[j] - (long long) correct8i;
-                                fail =  fail && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0)) && (!(fabsf(err3) <= f->double_ulps  && iErr4 == 0)) &&
-                                                 (!(fabsf(err4) <= f->double_ulps  && iErr7 == 0)) && (!(fabsf(err5) <= f->double_ulps  && iErr8 == 0)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
-                                if( llabs(iErr3) < llabs( iErr ) )
-                                    iErr = iErr3;
-                                if( llabs(iErr4) < llabs( iErr ) )
-                                    iErr = iErr4;
-                                if( llabs(iErr7) < llabs( iErr ) )
-                                    iErr = iErr7;
-                                if( llabs(iErr8) < llabs( iErr ) )
-                                    iErr = iErr8;
+                                correct3 =
+                                    f->dfunc.f_ffpI(0.0, 0.0, &correct3i);
+                                correct4 =
+                                    f->dfunc.f_ffpI(-0.0, 0.0, &correct4i);
+                                long double correct7 =
+                                    f->dfunc.f_ffpI(0.0, -0.0, &correct7i);
+                                long double correct8 =
+                                    f->dfunc.f_ffpI(-0.0, -0.0, &correct8i);
+                                err2 =
+                                    Bruteforce_Ulp_Error_Double(test, correct3);
+                                err3 =
+                                    Bruteforce_Ulp_Error_Double(test, correct4);
+                                float err4 =
+                                    Bruteforce_Ulp_Error_Double(test, correct7);
+                                float err5 =
+                                    Bruteforce_Ulp_Error_Double(test, correct8);
+                                iErr3 = (long long)q2[j] - (long long)correct3i;
+                                iErr4 = (long long)q2[j] - (long long)correct4i;
+                                int64_t iErr7 =
+                                    (long long)q2[j] - (long long)correct7i;
+                                int64_t iErr8 =
+                                    (long long)q2[j] - (long long)correct8i;
+                                fail = fail
+                                    && ((!(fabsf(err2) <= f->double_ulps
+                                           && iErr3 == 0))
+                                        && (!(fabsf(err3) <= f->double_ulps
+                                              && iErr4 == 0))
+                                        && (!(fabsf(err4) <= f->double_ulps
+                                              && iErr7 == 0))
+                                        && (!(fabsf(err5) <= f->double_ulps
+                                              && iErr8 == 0)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
+                                if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                                if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+                                if (llabs(iErr7) < llabs(iErr)) iErr = iErr7;
+                                if (llabs(iErr8) < llabs(iErr)) iErr = iErr8;
 
                                 // retry per section 6.5.3.4
-                                if( IsDoubleResultSubnormal( correct3, f->double_ulps ) || IsDoubleResultSubnormal( correct4, f->double_ulps )  ||
-                                    IsDoubleResultSubnormal( correct7, f->double_ulps ) || IsDoubleResultSubnormal( correct8, f->double_ulps ) )
+                                if (IsDoubleResultSubnormal(correct3,
+                                                            f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct4,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct7,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct8,
+                                                               f->double_ulps))
                                 {
-                                    fail = fail && ! ( test == 0.0f && (iErr3 == 0 || iErr4 == 0 || iErr7 == 0 || iErr8 == 0));
-                                    if( ! fail )
-                                        err = 0.0f;
+                                    fail = fail
+                                        && !(test == 0.0f
+                                             && (iErr3 == 0 || iErr4 == 0
+                                                 || iErr7 == 0 || iErr8 == 0));
+                                    if (!fail) err = 0.0f;
                                 }
                             }
                         }
-                        else if( IsDoubleSubnormal( s2[j] ) )
+                        else if (IsDoubleSubnormal(s2[j]))
                         {
                             int correct3i, correct4i;
-                            long double correct3 = f->dfunc.f_ffpI( s[j], 0.0, &correct3i );
-                            long double correct4 = f->dfunc.f_ffpI( s[j], -0.0, &correct4i );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                            int64_t iErr3 = (long long) q2[j] - (long long) correct3i;
-                            int64_t iErr4 = (long long) q2[j] - (long long) correct4i;
-                            fail =  fail && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0)) && (!(fabsf(err3) <= f->double_ulps && iErr4 == 0)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-                            if( llabs(iErr3) < llabs( iErr ) )
-                                iErr = iErr3;
-                            if( llabs(iErr4) < llabs( iErr ) )
-                                iErr = iErr4;
+                            long double correct3 =
+                                f->dfunc.f_ffpI(s[j], 0.0, &correct3i);
+                            long double correct4 =
+                                f->dfunc.f_ffpI(s[j], -0.0, &correct4i);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct4);
+                            int64_t iErr3 =
+                                (long long)q2[j] - (long long)correct3i;
+                            int64_t iErr4 =
+                                (long long)q2[j] - (long long)correct4i;
+                            fail = fail
+                                && ((!(fabsf(err2) <= f->double_ulps
+                                       && iErr3 == 0))
+                                    && (!(fabsf(err3) <= f->double_ulps
+                                          && iErr4 == 0)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                            if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
 
                             // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) )
+                            if (IsDoubleResultSubnormal(correct2,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correct3,
+                                                           f->double_ulps))
                             {
-                                fail = fail && ! ( test == 0.0f && (iErr3 == 0 || iErr4 == 0) );
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && (iErr3 == 0 || iErr4 == 0));
+                                if (!fail) err = 0.0f;
                             }
                         }
                     }
-                    if( fabsf(err ) > maxError )
+                    if (fabsf(err) > maxError)
                     {
                         maxError = fabsf(err);
                         maxErrorVal = s[j];
                     }
-                    if( llabs(iErr) > maxError2 )
+                    if (llabs(iErr) > maxError2)
                     {
-                        maxError2 = llabs(iErr );
+                        maxError2 = llabs(iErr);
                         maxErrorVal2 = s[j];
                     }
 
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\nERROR: %sD%s: {%f, %lld} ulp error at {%.13la, %.13la} ({ 0x%16.16llx, 0x%16.16llx}): *{%.13la, %d} ({ 0x%16.16llx, 0x%8.8x}) vs. {%.13la, %d} ({ 0x%16.16llx, 0x%8.8x})\n",
-                                    f->name, sizeNames[k], err, iErr,
-                                   ((double*) gIn)[j], ((double*) gIn2)[j],
-                                   ((cl_ulong*) gIn)[j], ((cl_ulong*) gIn2)[j],
-                                   ((double*) gOut_Ref)[j], ((int*) gOut_Ref2)[j],
-                                   ((cl_ulong*) gOut_Ref)[j], ((cl_uint*) gOut_Ref2)[j],
-                                   test, q2[j],
-                                   ((cl_ulong*) q)[j], ((cl_uint*) q2)[j]);
-                      error = -1;
-                      goto exit;
+                        vlog_error(
+                            "\nERROR: %sD%s: {%f, %lld} ulp error at {%.13la, "
+                            "%.13la} ({ 0x%16.16llx, 0x%16.16llx}): *{%.13la, "
+                            "%d} ({ 0x%16.16llx, 0x%8.8x}) vs. {%.13la, %d} ({ "
+                            "0x%16.16llx, 0x%8.8x})\n",
+                            f->name, sizeNames[k], err, iErr,
+                            ((double *)gIn)[j], ((double *)gIn2)[j],
+                            ((cl_ulong *)gIn)[j], ((cl_ulong *)gIn2)[j],
+                            ((double *)gOut_Ref)[j], ((int *)gOut_Ref2)[j],
+                            ((cl_ulong *)gOut_Ref)[j],
+                            ((cl_uint *)gOut_Ref2)[j], test, q2[j],
+                            ((cl_ulong *)q)[j], ((cl_uint *)q2)[j]);
+                        error = -1;
+                        goto exit;
                     }
                 }
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-           } else
-           {
-              vlog("." );
-           }
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
 
-           fflush(stdout);
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
-            p[j] = DoubleFromUInt32( genrand_int32(d) );
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_double ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -1129,6 +1365,3 @@ exit:
 
     return error;
 }
-
-
-
diff --git a/test_conformance/math_brute_force/i_unary.cpp b/test_conformance/math_brute_force/i_unary.cpp
index f6bd1223..7e207379 100644
--- a/test_conformance/math_brute_force/i_unary.cpp
+++ b/test_conformance/math_brute_force/i_unary.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -33,60 +33,77 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
-    const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i] );\n"
-                            "}\n"
-                        };
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global float* in)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       float3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       int3 i0 = ", name, "( f0 );\n"
-                            "       vstore3( i0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       float3 f0;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       int3 i0 = ", name, "( f0 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = i0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = i0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void math_kernel", // out[i] = name(in[i])
+                        sizeNames[vectorSize],
+                        "( __global int",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in)\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+    const char *c3[] = { // uses vload3/vstore3; last work-item handles leftovers
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global int* out, __global float* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       int3 i0 = ",
+        name,
+        "( f0 );\n"
+        "       vstore3( i0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       int3 i0 = ",
+        name,
+        "( f0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = i0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = i0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]); // number of fragments
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3) // 3-element vectors use c3
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s", // kernel name
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
@@ -95,88 +112,109 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i] );\n"
-                            "}\n"
-                        };
-
-    const char *c3[] = {"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global double* in)\n"
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global int",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in)\n"
                         "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   if( i + 1 < get_global_size(0) )\n"
-                        "   {\n"
-                        "       double3 f0 = vload3( 0, in + 3 * i );\n"
-                        "       int3 i0 = ", name, "( f0 );\n"
-                        "       vstore3( i0, 0, out + 3*i );\n"
-                        "   }\n"
-                        "   else\n"
-                        "   {\n"
-                        "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                        "       double3 f0;\n"
-                        "       switch( parity )\n"
-                        "       {\n"
-                        "           case 1:\n"
-                        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
-                        "               break;\n"
-                        "           case 0:\n"
-                        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-                        "               break;\n"
-                        "       }\n"
-                        "       int3 i0 = ", name, "( f0 );\n"
-                        "       switch( parity )\n"
-                        "       {\n"
-                        "           case 0:\n"
-                        "               out[3*i+1] = i0.y; \n"
-                        "               // fall through\n"
-                        "           case 1:\n"
-                        "               out[3*i] = i0.x; \n"
-                        "               break;\n"
-                        "       }\n"
-                        "   }\n"
-                        "}\n"
-                    };
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global int* out, __global double* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 f0 = vload3( 0, in + 3 * i );\n"
+        "       int3 i0 = ",
+        name,
+        "( f0 );\n"
+        "       vstore3( i0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       int3 i0 = ",
+        name,
+        "( f0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = i0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = i0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_kernel   *kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // first vector-size index to build (job_id is added)
+    cl_kernel *kernels; // base of kernel array; job uses kernels + index
+    cl_program *programs; // base of program array; job uses programs + index
+    const char *nameInCode; // function name passed to the kernel builder
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p); // forward declaration
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p) // p points to a BuildKernelInfo
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p; // build request from caller
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernels + i,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
                              info->programs + i, info->relaxedMode);
@@ -187,12 +225,12 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     int ftz = f->ftz || 0 == (gFloatCapabilities & CL_FP_DENORM) || gForceFTZ;
-    size_t bufferSize = (gWimpyMode)?gWimpyBufferSize:BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(float), bufferSize);
-    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( float )) + 1);
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1);
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
@@ -206,191 +244,226 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
         return error;
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i,
+       programs + i) ) ) return error;
+    */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
-        if( gWimpyMode )
+        if (gWimpyMode)
         {
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
-                p[j] = (uint32_t) i + j * scale;
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                p[j] = (uint32_t)i + j * scale;
         }
         else
         {
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
-                p[j] = (uint32_t) i + j;
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                p[j] = (uint32_t)i + j;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         int *r = (int *)gOut_Ref;
         float *s = (float *)gIn;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
-            r[j] = f->func.i_f( s[j] );
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+            r[j] = f->func.i_f(s[j]);
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
-        //Verify data
+        // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint32_t *q = (uint32_t *)(gOut[k]);
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] )
+                if (t[j] != q[j])
                 {
-                    if( ftz && IsFloatSubnormal(s[j]))
+                    if (ftz && IsFloatSubnormal(s[j]))
                     {
-                        unsigned int correct0 = f->func.i_f( 0.0 );
-                        unsigned int correct1 = f->func.i_f( -0.0 );
-                        if( q[j] == correct0 || q[j] == correct1 )
-                            continue;
+                        unsigned int correct0 = f->func.i_f(0.0);
+                        unsigned int correct1 = f->func.i_f(-0.0);
+                        if (q[j] == correct0 || q[j] == correct1) continue;
                     }
 
                     uint32_t err = t[j] - q[j];
-                    if( q[j] > t[j] )
-                        err = q[j] - t[j];
-                    vlog_error( "\nERROR: %s%s: %d ulp error at %a (0x%8.8x): *%d vs. %d\n", f->name, sizeNames[k], err, ((float*) gIn)[j], ((cl_uint*) gIn)[j], t[j], q[j] );
-                  error = -1;
-                  goto exit;
+                    if (q[j] > t[j]) err = q[j] - t[j];
+                    vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%8.8x): "
+                               "*%d vs. %d\n",
+                               f->name, sizeNames[k], err, ((float *)gIn)[j],
+                               ((cl_uint *)gIn)[j], t[j], q[j]);
+                    error = -1;
+                    goto exit;
                 }
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-           } else
-           {
-              vlog("." );
-           }
-           fflush(stdout);
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
             p[j] = genrand_int32(d);
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    vlog( "\n" );
+    vlog("\n");
 exit:
     RestoreFPState(&oldMode);
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -404,12 +477,12 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     int ftz = f->ftz || gForceFTZ;
-    size_t bufferSize = (gWimpyMode)?gWimpyBufferSize:BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
-    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( cl_double )) + 1);
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(cl_double)) + 1);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
@@ -423,200 +496,231 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
-                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
     {
         return error;
     }
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels +
+       i, programs + i) ) ) return error;
+    */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
-        if( gWimpyMode )
+        if (gWimpyMode)
         {
-            for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
-                p[j] = DoubleFromUInt32( (uint32_t) i + j * scale );
+            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+                p[j] = DoubleFromUInt32((uint32_t)i + j * scale);
         }
         else
         {
-            for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
-                p[j] = DoubleFromUInt32( (uint32_t) i + j );
+            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+                p[j] = DoubleFromUInt32((uint32_t)i + j);
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         int *r = (int *)gOut_Ref;
         double *s = (double *)gIn;
-        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
-            r[j] = f->dfunc.i_f( s[j] );
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+            r[j] = f->dfunc.i_f(s[j]);
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
-        //Verify data
+        // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
-        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint32_t *q = (uint32_t *)(gOut[k]);
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] )
+                if (t[j] != q[j])
                 {
-                    if( ftz && IsDoubleSubnormal(s[j]))
+                    if (ftz && IsDoubleSubnormal(s[j]))
                     {
-                        unsigned int correct0 = f->dfunc.i_f( 0.0 );
-                        unsigned int correct1 = f->dfunc.i_f( -0.0 );
-                        if( q[j] == correct0 || q[j] == correct1 )
-                            continue;
+                        unsigned int correct0 = f->dfunc.i_f(0.0);
+                        unsigned int correct1 = f->dfunc.i_f(-0.0);
+                        if (q[j] == correct0 || q[j] == correct1) continue;
                     }
 
                     uint32_t err = t[j] - q[j];
-                    if( q[j] > t[j] )
-                        err = q[j] - t[j];
-                    vlog_error( "\nERROR: %sD%s: %d ulp error at %.13la: *%d vs. %d\n", f->name, sizeNames[k], err, ((double*) gIn)[j], t[j], q[j] );
-                  error = -1;
-                  goto exit;
+                    if (q[j] > t[j]) err = q[j] - t[j];
+                    vlog_error(
+                        "\nERROR: %sD%s: %d ulp error at %.13la: *%d vs. %d\n",
+                        f->name, sizeNames[k], err, ((double *)gIn)[j], t[j],
+                        q[j]);
+                    error = -1;
+                    goto exit;
                 }
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
             if (gVerboseBruteForce)
             {
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-            } else
-            {
-               vlog("." );
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
             }
-           fflush(stdout);
-
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
-        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
-            p[j] = DoubleFromUInt32( genrand_int32(d) );
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    vlog( "\n" );
+    vlog("\n");
 
 
 exit:
     RestoreFPState(&oldMode);
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -624,4 +728,3 @@ exit:
 
     return error;
 }
-
diff --git a/test_conformance/math_brute_force/macro_binary.cpp b/test_conformance/math_brute_force/macro_binary.cpp
index 1cde215c..52c4e96c 100644
--- a/test_conformance/math_brute_force/macro_binary.cpp
+++ b/test_conformance/math_brute_force/macro_binary.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -24,7 +24,8 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata, bool relaxedMode);
 extern const vtbl _macro_binary = { "macro_binary", TestMacro_Int_Float_Float,
                                     TestMacro_Int_Double_Double };
 
-static int BuildKernel( const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p );
+static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                       cl_kernel *k, cl_program *p);
 static int BuildKernelDouble(const char *name, int vectorSize,
                              cl_uint kernel_count, cl_kernel *k, cl_program *p,
                              bool relaxedMode);
@@ -32,26 +33,42 @@ static int BuildKernelDouble(const char *name, int vectorSize,
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
 {
-    const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in1, __global float", sizeNames[vectorSize], "* in2 )\n"
-        "{\n"
-        "   int i = get_global_id(0);\n"
-        "   out[i] = ", name, "( in1[i], in2[i] );\n"
-        "}\n"
-    };
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global int",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in1, __global float",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i] );\n"
+                        "}\n" };
 
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global float* in, __global float* in2)\n"
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global int* out, __global float* in, __global float* in2)\n"
         "{\n"
         "   size_t i = get_global_id(0);\n"
         "   if( i + 1 < get_global_size(0) )\n"
         "   {\n"
         "       float3 f0 = vload3( 0, in + 3 * i );\n"
         "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
-        "       int3 i0 = ", name, "( f0, f1 );\n"
+        "       int3 i0 = ",
+        name,
+        "( f0, f1 );\n"
         "       vstore3( i0, 0, out + 3*i );\n"
         "   }\n"
         "   else\n"
         "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
         "       float3 f0, f1;\n"
         "       switch( parity )\n"
         "       {\n"
@@ -64,7 +81,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
         "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
         "               break;\n"
         "       }\n"
-        "       int3 i0 = ", name, "( f0, f1 );\n"
+        "       int3 i0 = ",
+        name,
+        "( f0, f1 );\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 0:\n"
@@ -80,16 +99,17 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
 
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -101,27 +121,43 @@ static int BuildKernelDouble(const char *name, int vectorSize,
                              bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel", sizeNames[vectorSize], "( __global long", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in1, __global double", sizeNames[vectorSize], "* in2 )\n"
-        "{\n"
-        "   int i = get_global_id(0);\n"
-        "   out[i] = ", name, "( in1[i], in2[i] );\n"
-        "}\n"
-    };
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global long",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in1, __global double",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i] );\n"
+                        "}\n" };
 
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel", sizeNames[vectorSize], "( __global long* out, __global double* in, __global double* in2)\n"
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global long* out, __global double* in, __global double* in2)\n"
         "{\n"
         "   size_t i = get_global_id(0);\n"
         "   if( i + 1 < get_global_size(0) )\n"
         "   {\n"
         "       double3 f0 = vload3( 0, in + 3 * i );\n"
         "       double3 f1 = vload3( 0, in2 + 3 * i );\n"
-        "       long3 l0 = ", name, "( f0, f1 );\n"
+        "       long3 l0 = ",
+        name,
+        "( f0, f1 );\n"
         "       vstore3( l0, 0, out + 3*i );\n"
         "   }\n"
         "   else\n"
         "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
         "       double3 f0, f1;\n"
         "       switch( parity )\n"
         "       {\n"
@@ -134,7 +170,9 @@ static int BuildKernelDouble(const char *name, int vectorSize,
         "               f1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
         "               break;\n"
         "       }\n"
-        "       long3 l0 = ", name, "( f0, f1 );\n"
+        "       long3 l0 = ",
+        name,
+        "( f0, f1 );\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 0:\n"
@@ -149,17 +187,18 @@ static int BuildKernelDouble(const char *name, int vectorSize,
     };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -167,27 +206,31 @@ static int BuildKernelDouble(const char *name, int vectorSize,
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_uint     kernel_count;
-    cl_kernel   **kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count;
+    cl_kernel **kernels;
+    cl_program *programs;
+    const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernel_count,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
                              info->kernels[i], info->programs + i,
@@ -197,72 +240,165 @@ static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, vo
 
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
-    -NAN, -INFINITY, -FLT_MAX, MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),  MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), -1000.f, -100.f,  -4.0f, -3.5f,
-    -3.0f, MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), -2.5f, MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), -2.0f, MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), -1.5f, MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), -1.0f, MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), -0.5f, MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),  MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), -0.25f, MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), -FLT_MIN, MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
-    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), -0.0f,
+    -NAN,
+    -INFINITY,
+    -FLT_MAX,
+    MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
+    MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
+    MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
+    MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
+    MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
+    MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
+    MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
+    MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
+    MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
+    -1000.f,
+    -100.f,
+    -4.0f,
+    -3.5f,
+    -3.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
+    -2.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
+    -2.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
+    -1.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
+    -1.0f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
+    -0.5f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
+    -0.25f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
+    -FLT_MIN,
+    MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
+    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
+    MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
+    MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
+    MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
+    -0.0f,
 
-    +NAN, +INFINITY, +FLT_MAX, MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), +1000.f, +100.f, +4.0f, +3.5f,
-    +3.0f, MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), 2.5f, MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),+2.0f, MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), 1.5f, MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), +1.0f, MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), +0.5f, MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), +0.25f, MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), +FLT_MIN, MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
-    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), +0.0f
+    +NAN,
+    +INFINITY,
+    +FLT_MAX,
+    MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
+    MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
+    MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
+    MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
+    MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
+    MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
+    MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
+    MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
+    MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
+    +1000.f,
+    +100.f,
+    +4.0f,
+    +3.5f,
+    +3.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
+    2.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
+    +2.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
+    1.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
+    +1.0f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
+    +0.5f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
+    +0.25f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
+    +FLT_MIN,
+    MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
+    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
+    MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
+    MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
+    MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
+    +0.0f
 };
 
-static const size_t specialValuesFloatCount = sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
+static const size_t specialValuesFloatCount =
+    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
-//Thread specific data for a worker thread
+// Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
-    cl_mem      inBuf;                              // input buffer for the thread
-    cl_mem      inBuf2;                             // input buffer for the thread
-    cl_mem      outBuf[ VECTOR_SIZE_COUNT ];        // output buffers for the thread
-    MTdata      d;
-    cl_command_queue tQueue;                        // per thread command queue to improve performance
-}ThreadInfo;
+    cl_mem inBuf; // input buffer for the thread
+    cl_mem inBuf2; // input buffer for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    MTdata d;
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
 
 typedef struct TestInfo
 {
-    size_t      subBufferSize;                      // Size of the sub-buffer in elements
-    const Func  *f;                                 // A pointer to the function info
-    cl_program  programs[ VECTOR_SIZE_COUNT ];      // programs for various vector sizes
-    cl_kernel   *k[VECTOR_SIZE_COUNT ];             // arrays of thread-specific kernels for each worker thread:  k[vector_size][thread_id]
-    ThreadInfo  *tinfo;                             // An array of thread specific information for each worker thread
-    cl_uint     threadCount;                        // Number of worker threads
-    cl_uint     jobCount;                           // Number of jobs
-    cl_uint     step;                               // step between each chunk and the next.
-    cl_uint     scale;                              // stride between individual test values
-    int         ftz;                                // non-zero if running in flush to zero mode
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    int ftz; // non-zero if running in flush to zero mode
 
-}TestInfo;
+} TestInfo;
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p );
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
     if (gWimpyMode)
     {
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_float)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -271,58 +407,79 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     }
 
     test_info.f = f;
-    test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
 
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
             goto exit;
         }
-        test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zu, %zu}\n",
+                           region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
 
@@ -335,393 +492,446 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
 
     // Run the kernels
-    if( !gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info );
+        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
 
-        if( error )
-            goto exit;
+        if (error) goto exit;
 
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input arrays
+        // Init input arrays
         uint32_t *p = (uint32_t *)gIn;
         uint32_t *p2 = (uint32_t *)gIn2;
-        for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ )
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
             p2[j] = genrand_int32(d);
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
-    vlog( "\n" );
+    vlog("\n");
 
 exit:
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
             free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data  )
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t      buffer_elements = job->subBufferSize;
-    size_t      buffer_size = buffer_elements * sizeof( cl_float );
-    cl_uint     base = job_id * (cl_uint) job->step;
-    ThreadInfo  *tinfo = job->tinfo + thread_id;
-    fptr        func = job->f->func;
-    int         ftz = job->ftz;
-    MTdata      d = tinfo->d;
-    cl_uint     j, k;
-    cl_int      error;
-    const char  *name = job->f->name;
-    cl_int      *t,*r;
-    cl_float    *s,*s2;
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_float);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    fptr func = job->f->func;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+    cl_int *t, *r;
+    cl_float *s, *s2;
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_int  *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_int *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (cl_int *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
-    //Init input array
+    // Init input array
     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
     j = 0;
-    int totalSpecialValueCount = specialValuesFloatCount * specialValuesFloatCount;
+    int totalSpecialValueCount =
+        specialValuesFloatCount * specialValuesFloatCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
-    if( job_id <= (cl_uint)indx )
+    if (job_id <= (cl_uint)indx)
     { // test edge cases
         float *fp = (float *)p;
         float *fp2 = (float *)p2;
         uint32_t x, y;
 
-    x = (job_id * buffer_elements) % specialValuesFloatCount;
-    y = (job_id * buffer_elements) / specialValuesFloatCount;
+        x = (job_id * buffer_elements) % specialValuesFloatCount;
+        y = (job_id * buffer_elements) / specialValuesFloatCount;
 
-        for( ; j < buffer_elements; j++ )
+        for (; j < buffer_elements; j++)
         {
             fp[j] = specialValuesFloat[x];
             fp2[j] = specialValuesFloat[y];
-            if( ++x >= specialValuesFloatCount )
+            if (++x >= specialValuesFloatCount)
             {
                 x = 0;
                 y++;
-                if( y >= specialValuesFloatCount )
-                    break;
+                if (y >= specialValuesFloatCount) break;
             }
         }
     }
 
-    //Init any remaining values.
-    for( ; j < buffer_elements; j++ )
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
     {
         p[j] = genrand_int32(d);
         p2[j] = genrand_int32(d);
     }
 
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             goto exit;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             goto exit;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             goto exit;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; }
-
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
             goto exit;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
-        return CL_SUCCESS;
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
 
-    //Calculate the correctly rounded reference result
-    r = (cl_int *)gOut_Ref  + thread_id * buffer_elements;
-    s = (float *)gIn  + thread_id * buffer_elements;
-    s2 = (float *)gIn2  + thread_id * buffer_elements;
-    for( j = 0; j < buffer_elements; j++ )
-        r[j] = func.i_ff( s[j], s2[j] );
+    // Calculate the correctly rounded reference result
+    r = (cl_int *)gOut_Ref + thread_id * buffer_elements;
+    s = (float *)gIn + thread_id * buffer_elements;
+    s2 = (float *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++) r[j] = func.i_ff(s[j], s2[j]);
 
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (cl_int *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             goto exit;
         }
     }
 
     // Wait for the last buffer
-    out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (cl_int *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                          CL_TRUE, CL_MAP_READ, 0, buffer_size,
+                                          0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         goto exit;
     }
 
-    //Verify data
+    // Verify data
     t = (cl_int *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
         cl_int *q = out[0];
 
-        if( gMinVectorSizeIndex == 0 && t[j] != q[j] )
+        if (gMinVectorSizeIndex == 0 && t[j] != q[j])
         {
-            if( ftz )
+            if (ftz)
             {
-                if( IsFloatSubnormal( s[j])  )
+                if (IsFloatSubnormal(s[j]))
                 {
-                    if( IsFloatSubnormal( s2[j] )  )
+                    if (IsFloatSubnormal(s2[j]))
                     {
-                        int correct = func.i_ff( 0.0f, 0.0f );
-                        int correct2 = func.i_ff( 0.0f, -0.0f );
-                        int correct3 = func.i_ff( -0.0f, 0.0f );
-                        int correct4 = func.i_ff( -0.0f, -0.0f );
+                        int correct = func.i_ff(0.0f, 0.0f);
+                        int correct2 = func.i_ff(0.0f, -0.0f);
+                        int correct3 = func.i_ff(-0.0f, 0.0f);
+                        int correct4 = func.i_ff(-0.0f, -0.0f);
 
-                        if( correct == q[j] || correct2 == q[j] || correct3 == q[j] || correct4 == q[j] )
+                        if (correct == q[j] || correct2 == q[j]
+                            || correct3 == q[j] || correct4 == q[j])
                             continue;
                     }
                     else
                     {
-                        int correct = func.i_ff( 0.0f, s2[j] );
-                        int correct2 = func.i_ff( -0.0f, s2[j] );
-                        if( correct == q[j] || correct2 == q[j]  )
-                            continue;
+                        int correct = func.i_ff(0.0f, s2[j]);
+                        int correct2 = func.i_ff(-0.0f, s2[j]);
+                        if (correct == q[j] || correct2 == q[j]) continue;
                     }
                 }
-                else if( IsFloatSubnormal( s2[j] ) )
+                else if (IsFloatSubnormal(s2[j]))
                 {
-                    int correct = func.i_ff( s[j], 0.0f );
-                    int correct2 = func.i_ff( s[j], -0.0f );
-                    if( correct == q[j] || correct2 == q[j]  )
-                        continue;
+                    int correct = func.i_ff(s[j], 0.0f);
+                    int correct2 = func.i_ff(s[j], -0.0f);
+                    if (correct == q[j] || correct2 == q[j]) continue;
                 }
-
             }
 
             uint32_t err = t[j] - q[j];
-            if( q[j] > t[j] )
-                err = q[j] - t[j];
-            vlog_error( "\nERROR: %s: %d ulp error at {%a, %a}: *0x%8.8x vs. 0x%8.8x (index: %d)\n", name, err, ((float*) s)[j], ((float*) s2)[j], t[j], q[j], j );
+            if (q[j] > t[j]) err = q[j] - t[j];
+            vlog_error("\nERROR: %s: %d ulp error at {%a, %a}: *0x%8.8x vs. "
+                       "0x%8.8x (index: %d)\n",
+                       name, err, ((float *)s)[j], ((float *)s2)[j], t[j], q[j],
+                       j);
             error = -1;
             goto exit;
         }
 
-        for( k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++ )
+        for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
         {
             q = out[k];
             // If we aren't getting the correctly rounded result
-            if( -t[j] != q[j] )
+            if (-t[j] != q[j])
             {
-                if( ftz )
+                if (ftz)
                 {
-                    if( IsFloatSubnormal( s[j])  )
+                    if (IsFloatSubnormal(s[j]))
                     {
-                        if( IsFloatSubnormal( s2[j] )  )
+                        if (IsFloatSubnormal(s2[j]))
                         {
-                            int correct = -func.i_ff( 0.0f, 0.0f );
-                            int correct2 = -func.i_ff( 0.0f, -0.0f );
-                            int correct3 = -func.i_ff( -0.0f, 0.0f );
-                            int correct4 = -func.i_ff( -0.0f, -0.0f );
+                            int correct = -func.i_ff(0.0f, 0.0f);
+                            int correct2 = -func.i_ff(0.0f, -0.0f);
+                            int correct3 = -func.i_ff(-0.0f, 0.0f);
+                            int correct4 = -func.i_ff(-0.0f, -0.0f);
 
-                            if( correct == q[j] || correct2 == q[j] || correct3 == q[j] || correct4 == q[j] )
+                            if (correct == q[j] || correct2 == q[j]
+                                || correct3 == q[j] || correct4 == q[j])
                                 continue;
                         }
                         else
                         {
-                            int correct = -func.i_ff( 0.0f, s2[j] );
-                            int correct2 = -func.i_ff( -0.0f, s2[j] );
-                            if( correct == q[j] || correct2 == q[j]  )
-                                continue;
+                            int correct = -func.i_ff(0.0f, s2[j]);
+                            int correct2 = -func.i_ff(-0.0f, s2[j]);
+                            if (correct == q[j] || correct2 == q[j]) continue;
                         }
                     }
-                    else if( IsFloatSubnormal( s2[j] ) )
+                    else if (IsFloatSubnormal(s2[j]))
                     {
-                        int correct = -func.i_ff( s[j], 0.0f );
-                        int correct2 = -func.i_ff( s[j], -0.0f );
-                        if( correct == q[j] || correct2 == q[j]  )
-                            continue;
+                        int correct = -func.i_ff(s[j], 0.0f);
+                        int correct2 = -func.i_ff(s[j], -0.0f);
+                        if (correct == q[j] || correct2 == q[j]) continue;
                     }
-
                 }
                 cl_uint err = -t[j] - q[j];
-                if( q[j] > -t[j] )
-                    err = q[j] + t[j];
-                vlog_error( "\nERROR: %s%s: %d ulp error at {%a, %a}: *0x%8.8x vs. 0x%8.8x (index: %d)\n", name, sizeNames[k], err, ((float*) s)[j], ((float*) s2)[j], -t[j], q[j], j );
+                if (q[j] > -t[j]) err = q[j] + t[j];
+                vlog_error("\nERROR: %s%s: %d ulp error at {%a, %a}: *0x%8.8x "
+                           "vs. 0x%8.8x (index: %d)\n",
+                           name, sizeNames[k], err, ((float *)s)[j],
+                           ((float *)s2)[j], -t[j], q[j], j);
                 error = -1;
                 goto exit;
             }
         }
     }
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
-       if (gVerboseBruteForce)
-       {
-           vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->threadCount);
-       } else
-       {
-          vlog("." );
-       }
-       fflush(stdout);
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
     }
 
 exit:
@@ -731,50 +941,146 @@ exit:
 
 // A table of more difficult cases to get right
 static const double specialValuesDouble[] = {
-    -NAN, -INFINITY, -DBL_MAX, MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),  MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), -1000., -100.,  -4.0, -3.5,
-    -3.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), -2.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), -2.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), -1.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), -1.0, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), -0.5, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),  MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), -0.25, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), -DBL_MIN, MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), -0.0,
+    -NAN,
+    -INFINITY,
+    -DBL_MAX,
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
+    -1000.,
+    -100.,
+    -4.0,
+    -3.5,
+    -3.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
+    -2.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
+    -2.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
+    -1.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
+    -1.0,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
+    -0.5,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
+    -0.25,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
+    -DBL_MIN,
+    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
+    -0.0,
 
-    +NAN, +INFINITY, +DBL_MAX, MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),  MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8), MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21), MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22), +1000., +100.,  +4.0, +3.5,
-    +3.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), +2.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), +2.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), +1.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), +1.0, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53), +0.5, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),  MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), +0.25, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), +DBL_MIN, MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), +0.0,
+    +NAN,
+    +INFINITY,
+    +DBL_MAX,
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
+    +1000.,
+    +100.,
+    +4.0,
+    +3.5,
+    +3.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
+    +2.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
+    +2.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
+    +1.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p0, +0x10000000000001LL, -52),
+    +1.0,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
+    +0.5,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
+    +0.25,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
+    +DBL_MIN,
+    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
+    +0.0,
 };
 
-static size_t specialValuesDoubleCount = sizeof( specialValuesDouble ) / sizeof( specialValuesDouble[0] );
+static size_t specialValuesDoubleCount =
+    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
 
 
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *p );
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
     if (gWimpyMode)
     {
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_double)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -785,58 +1091,79 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     test_info.f = f;
     test_info.ftz = f->ftz || gForceFTZ;
 
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
-        test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            /* Qualcomm fix: 9461 read-write flags must be compatible with parent buffer */
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+            /* Qualcomm fix: 9461 read-write flags must be compatible with
+             * parent buffer */
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
             /* Qualcomm fix: end */
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
 
@@ -850,402 +1177,455 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
-    if( !gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info );
+        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
 
-        if( error )
-            goto exit;
+        if (error) goto exit;
 
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input arrays
+        // Init input arrays
         uint64_t *p = (uint64_t *)gIn;
         uint64_t *p2 = (uint64_t *)gIn2;
-        for( j = 0; j < BUFFER_SIZE / sizeof( double ); j++ )
+        for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
         {
-            p[j] = (cl_ulong) genrand_int32(d) | ((cl_ulong) genrand_int32(d) << 32);
-            p2[j] = (cl_ulong) genrand_int32(d) | ((cl_ulong) genrand_int32(d) << 32);
+            p[j] =
+                (cl_ulong)genrand_int32(d) | ((cl_ulong)genrand_int32(d) << 32);
+            p2[j] =
+                (cl_ulong)genrand_int32(d) | ((cl_ulong)genrand_int32(d) << 32);
         }
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    vlog( "\n" );
+    vlog("\n");
 
 exit:
     // Release
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
             free_mtdata(test_info.tinfo[i].d);
             clReleaseMemObject(test_info.tinfo[i].inBuf);
             clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
 
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t      buffer_elements = job->subBufferSize;
-    size_t      buffer_size = buffer_elements * sizeof( cl_double );
-    cl_uint     base = job_id * (cl_uint) job->step;
-    ThreadInfo  *tinfo = job->tinfo + thread_id;
-    dptr        dfunc = job->f->dfunc;
-    int         ftz = job->ftz;
-    MTdata      d = tinfo->d;
-    cl_uint     j, k;
-    cl_int      error;
-    const char  *name = job->f->name;
-    cl_long     *t,*r;
-    cl_double   *s,*s2;
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_double);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    dptr dfunc = job->f->dfunc;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+    cl_long *t, *r;
+    cl_double *s, *s2;
 
     Force64BitFPUPrecision();
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_long  *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_long *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (cl_long *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
-    //Init input array
+    // Init input array
     double *p = (double *)gIn + thread_id * buffer_elements;
     double *p2 = (double *)gIn2 + thread_id * buffer_elements;
     j = 0;
-    int totalSpecialValueCount = specialValuesDoubleCount * specialValuesDoubleCount;
+    int totalSpecialValueCount =
+        specialValuesDoubleCount * specialValuesDoubleCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
-    if( job_id <= (cl_uint)indx )
+    if (job_id <= (cl_uint)indx)
     { // test edge cases
         uint32_t x, y;
 
-    x = (job_id * buffer_elements) % specialValuesDoubleCount;
-    y = (job_id * buffer_elements) / specialValuesDoubleCount;
+        x = (job_id * buffer_elements) % specialValuesDoubleCount;
+        y = (job_id * buffer_elements) / specialValuesDoubleCount;
 
-        for( ; j < buffer_elements; j++ )
+        for (; j < buffer_elements; j++)
         {
             p[j] = specialValuesDouble[x];
             p2[j] = specialValuesDouble[y];
-            if( ++x >= specialValuesDoubleCount )
+            if (++x >= specialValuesDoubleCount)
             {
                 x = 0;
                 y++;
-                if( y >= specialValuesDoubleCount )
-                    break;
+                if (y >= specialValuesDoubleCount) break;
             }
         }
     }
 
-    //Init any remaining values.
-    for( ; j < buffer_elements; j++ )
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
     {
-        ((cl_ulong*)p)[j] = genrand_int64(d);
-        ((cl_ulong*)p2)[j] = genrand_int64(d);
+        ((cl_ulong *)p)[j] = genrand_int64(d);
+        ((cl_ulong *)p2)[j] = genrand_int64(d);
     }
 
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         goto exit;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             goto exit;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             goto exit;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             goto exit;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; }
-
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
             goto exit;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
-        return CL_SUCCESS;
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
 
-    //Calculate the correctly rounded reference result
-    r = (cl_long *)gOut_Ref  + thread_id * buffer_elements;
-    s = (cl_double *)gIn  + thread_id * buffer_elements;
-    s2 = (cl_double *)gIn2  + thread_id * buffer_elements;
-    for( j = 0; j < buffer_elements; j++ )
-        r[j] = dfunc.i_ff( s[j], s2[j] );
+    // Calculate the correctly rounded reference result
+    r = (cl_long *)gOut_Ref + thread_id * buffer_elements;
+    s = (cl_double *)gIn + thread_id * buffer_elements;
+    s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++) r[j] = dfunc.i_ff(s[j], s2[j]);
 
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (cl_long *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             goto exit;
         }
     }
 
     // Wait for the last buffer
-    out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (cl_long *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
+                                           0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         goto exit;
     }
 
-    //Verify data
+    // Verify data
     t = (cl_long *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
-        cl_long *q = (cl_long *) out[0];
+        cl_long *q = (cl_long *)out[0];
 
         // If we aren't getting the correctly rounded result
-        if( gMinVectorSizeIndex == 0 && t[j] != q[j] )
+        if (gMinVectorSizeIndex == 0 && t[j] != q[j])
         {
-            if( ftz )
+            if (ftz)
             {
-                if( IsDoubleSubnormal( s[j])  )
+                if (IsDoubleSubnormal(s[j]))
                 {
-                    if( IsDoubleSubnormal( s2[j] )  )
+                    if (IsDoubleSubnormal(s2[j]))
                     {
-                        int64_t correct = dfunc.i_ff( 0.0f, 0.0f );
-                        int64_t correct2 = dfunc.i_ff( 0.0f, -0.0f );
-                        int64_t correct3 = dfunc.i_ff( -0.0f, 0.0f );
-                        int64_t correct4 = dfunc.i_ff( -0.0f, -0.0f );
+                        int64_t correct = dfunc.i_ff(0.0f, 0.0f);
+                        int64_t correct2 = dfunc.i_ff(0.0f, -0.0f);
+                        int64_t correct3 = dfunc.i_ff(-0.0f, 0.0f);
+                        int64_t correct4 = dfunc.i_ff(-0.0f, -0.0f);
 
-                        if( correct == q[j] || correct2 == q[j] || correct3 == q[j] || correct4 == q[j] )
+                        if (correct == q[j] || correct2 == q[j]
+                            || correct3 == q[j] || correct4 == q[j])
                             continue;
                     }
                     else
                     {
-                        int64_t correct = dfunc.i_ff( 0.0f, s2[j] );
-                        int64_t correct2 = dfunc.i_ff( -0.0f, s2[j] );
-                        if( correct == q[j] || correct2 == q[j]  )
-                            continue;
+                        int64_t correct = dfunc.i_ff(0.0f, s2[j]);
+                        int64_t correct2 = dfunc.i_ff(-0.0f, s2[j]);
+                        if (correct == q[j] || correct2 == q[j]) continue;
                     }
                 }
-                else if( IsDoubleSubnormal( s2[j] ) )
+                else if (IsDoubleSubnormal(s2[j]))
                 {
-                    int64_t correct = dfunc.i_ff( s[j], 0.0f );
-                    int64_t correct2 = dfunc.i_ff( s[j], -0.0f );
-                    if( correct == q[j] || correct2 == q[j]  )
-                        continue;
+                    int64_t correct = dfunc.i_ff(s[j], 0.0f);
+                    int64_t correct2 = dfunc.i_ff(s[j], -0.0f);
+                    if (correct == q[j] || correct2 == q[j]) continue;
                 }
-
             }
 
             uint64_t err = t[j] - q[j];
-            if( q[j] > t[j] )
-                err = q[j] - t[j];
-            vlog_error( "\nERROR: %s: %lld ulp error at {%.13la, %.13la}: *%lld vs. %lld  (index: %d)\n", name, err, ((double*) s)[j], ((double*) s2)[j], t[j], q[j], j );
+            if (q[j] > t[j]) err = q[j] - t[j];
+            vlog_error("\nERROR: %s: %lld ulp error at {%.13la, %.13la}: *%lld "
+                       "vs. %lld  (index: %d)\n",
+                       name, err, ((double *)s)[j], ((double *)s2)[j], t[j],
+                       q[j], j);
             error = -1;
             goto exit;
         }
 
 
-        for( k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++ )
+        for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
         {
-            q = (cl_long*) out[k];
+            q = (cl_long *)out[k];
             // If we aren't getting the correctly rounded result
-            if( -t[j] != q[j] )
+            if (-t[j] != q[j])
             {
-                if( ftz )
+                if (ftz)
                 {
-                    if( IsDoubleSubnormal( s[j])  )
+                    if (IsDoubleSubnormal(s[j]))
                     {
-                        if( IsDoubleSubnormal( s2[j] )  )
+                        if (IsDoubleSubnormal(s2[j]))
                         {
-                            int64_t correct = -dfunc.i_ff( 0.0f, 0.0f );
-                            int64_t correct2 = -dfunc.i_ff( 0.0f, -0.0f );
-                            int64_t correct3 = -dfunc.i_ff( -0.0f, 0.0f );
-                            int64_t correct4 = -dfunc.i_ff( -0.0f, -0.0f );
+                            int64_t correct = -dfunc.i_ff(0.0f, 0.0f);
+                            int64_t correct2 = -dfunc.i_ff(0.0f, -0.0f);
+                            int64_t correct3 = -dfunc.i_ff(-0.0f, 0.0f);
+                            int64_t correct4 = -dfunc.i_ff(-0.0f, -0.0f);
 
-                            if( correct == q[j] || correct2 == q[j] || correct3 == q[j] || correct4 == q[j] )
+                            if (correct == q[j] || correct2 == q[j]
+                                || correct3 == q[j] || correct4 == q[j])
                                 continue;
                         }
                         else
                         {
-                            int64_t correct = -dfunc.i_ff( 0.0f, s2[j] );
-                            int64_t correct2 = -dfunc.i_ff( -0.0f, s2[j] );
-                            if( correct == q[j] || correct2 == q[j]  )
-                                continue;
+                            int64_t correct = -dfunc.i_ff(0.0f, s2[j]);
+                            int64_t correct2 = -dfunc.i_ff(-0.0f, s2[j]);
+                            if (correct == q[j] || correct2 == q[j]) continue;
                         }
                     }
-                    else if( IsDoubleSubnormal( s2[j] ) )
+                    else if (IsDoubleSubnormal(s2[j]))
                     {
-                        int64_t correct = -dfunc.i_ff( s[j], 0.0f );
-                        int64_t correct2 = -dfunc.i_ff( s[j], -0.0f );
-                        if( correct == q[j] || correct2 == q[j]  )
-                            continue;
+                        int64_t correct = -dfunc.i_ff(s[j], 0.0f);
+                        int64_t correct2 = -dfunc.i_ff(s[j], -0.0f);
+                        if (correct == q[j] || correct2 == q[j]) continue;
                     }
-
                 }
 
                 uint64_t err = -t[j] - q[j];
-                if( q[j] > -t[j] )
-                    err = q[j] + t[j];
-                vlog_error( "\nERROR: %sD%s: %lld ulp error at {%.13la, %.13la}: *%lld vs. %lld  (index: %d)\n", name, sizeNames[k], err, ((double*) s)[j], ((double*) s2)[j], -t[j], q[j], j );
+                if (q[j] > -t[j]) err = q[j] + t[j];
+                vlog_error("\nERROR: %sD%s: %lld ulp error at {%.13la, "
+                           "%.13la}: *%lld vs. %lld  (index: %d)\n",
+                           name, sizeNames[k], err, ((double *)s)[j],
+                           ((double *)s2)[j], -t[j], q[j], j);
                 error = -1;
                 goto exit;
             }
         }
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
-       if (gVerboseBruteForce)
-       {
-           vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->threadCount);
-       } else
-       {
-          vlog("." );
-       }
-       fflush(stdout);
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
     }
 
 exit:
     return error;
 }
-
diff --git a/test_conformance/math_brute_force/macro_unary.cpp b/test_conformance/math_brute_force/macro_unary.cpp
index 70f724ce..26a186f6 100644
--- a/test_conformance/math_brute_force/macro_unary.cpp
+++ b/test_conformance/math_brute_force/macro_unary.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -33,60 +33,77 @@ static int BuildKernelDouble(const char *name, int vectorSize,
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
 {
-    const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i] );\n"
-                            "}\n"
-                        };
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global float* in)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       float3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       int3 i0 = ", name, "( f0 );\n"
-                            "       vstore3( i0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       int3 i0;\n"
-                            "       float3 f0;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (float3)( in[3*i], 0xdead, 0xdead ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (float3)( in[3*i], in[3*i+1], 0xdead ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       i0 = ", name, "( f0 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = i0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = i0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global int",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in)\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global int* out, __global float* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       int3 i0 = ",
+        name,
+        "( f0 );\n"
+        "       vstore3( i0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       int3 i0;\n"
+        "       float3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], 0xdead, 0xdead ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], 0xdead ); \n"
+        "               break;\n"
+        "       }\n"
+        "       i0 = ",
+        name,
+        "( f0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = i0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = i0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -97,62 +114,79 @@ static int BuildKernelDouble(const char *name, int vectorSize,
                              bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global long", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i] );\n"
-                            "}\n"
-                        };
-
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global long* out, __global double* in)\n"
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global long",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in)\n"
                         "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   if( i + 1 < get_global_size(0) )\n"
-                        "   {\n"
-                        "       double3 d0 = vload3( 0, in + 3 * i );\n"
-                        "       long3 l0 = ", name, "( d0 );\n"
-                        "       vstore3( l0, 0, out + 3*i );\n"
-                        "   }\n"
-                        "   else\n"
-                        "   {\n"
-                        "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                        "       double3 d0;\n"
-                        "       switch( parity )\n"
-                        "       {\n"
-                        "           case 1:\n"
-                        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-                        "               break;\n"
-                        "           case 0:\n"
-                        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-                        "               break;\n"
-                        "       }\n"
-                        "       long3 l0 = ", name, "( d0 );\n"
-                        "       switch( parity )\n"
-                        "       {\n"
-                        "           case 0:\n"
-                        "               out[3*i+1] = l0.y; \n"
-                        "               // fall through\n"
-                        "           case 1:\n"
-                        "               out[3*i] = l0.x; \n"
-                        "               break;\n"
-                        "       }\n"
-                        "   }\n"
-                        "}\n"
-                    };
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global long* out, __global double* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       long3 l0 = ",
+        name,
+        "( d0 );\n"
+        "       vstore3( l0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       long3 l0 = ",
+        name,
+        "( d0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = l0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = l0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -160,80 +194,90 @@ static int BuildKernelDouble(const char *name, int vectorSize,
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_uint     kernel_count;
-    cl_kernel   **kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count;
+    cl_kernel **kernels;
+    cl_program *programs;
+    const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernel_count,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
                              info->kernels[i], info->programs + i,
                              info->relaxedMode);
 }
 
-//Thread specific data for a worker thread
+// Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
-    cl_mem      inBuf;                              // input buffer for the thread
-    cl_mem      outBuf[ VECTOR_SIZE_COUNT ];        // output buffers for the thread
-    cl_command_queue tQueue;                        // per thread command queue to improve performance
-}ThreadInfo;
+    cl_mem inBuf; // input buffer for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
 
 typedef struct TestInfo
 {
-    size_t      subBufferSize;                      // Size of the sub-buffer in elements
-    const Func  *f;                                 // A pointer to the function info
-    cl_program  programs[ VECTOR_SIZE_COUNT ];      // programs for various vector sizes
-    cl_kernel   *k[VECTOR_SIZE_COUNT ];             // arrays of thread-specific kernels for each worker thread:  k[vector_size][thread_id]
-    ThreadInfo  *tinfo;                             // An array of thread specific information for each worker thread
-    cl_uint     threadCount;                        // Number of worker threads
-    cl_uint     jobCount;                           // Number of jobs
-    cl_uint     step;                               // step between each chunk and the next.
-    cl_uint     scale;                              // stride between individual test values
-    int         ftz;                                // non-zero if running in flush to zero mode
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    int ftz; // non-zero if running in flush to zero mode
 
-}TestInfo;
+} TestInfo;
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p );
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
-    if (gWimpyMode )
+    if (gWimpyMode)
     {
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_float)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -242,51 +286,68 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     }
 
     test_info.f = f;
-    test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gOutBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
     }
@@ -297,281 +358,315 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
-    if( !gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info );
+        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
 
-        if( error )
-            goto exit;
+        if (error) goto exit;
 
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         cl_uint *p = (cl_uint *)gIn;
-        for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ )
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
             p[j] = genrand_int32(d);
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;    // BUFFER_SIZE / vectorSize  rounded up
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    vlog( "\n" );
+    vlog("\n");
 
 exit:
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
             clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t  buffer_elements = job->subBufferSize;
-    size_t  buffer_size = buffer_elements * sizeof( cl_float );
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_float);
     cl_uint scale = job->scale;
-    cl_uint base = job_id * (cl_uint) job->step;
+    cl_uint base = job_id * (cl_uint)job->step;
     ThreadInfo *tinfo = job->tinfo + thread_id;
-    fptr    func = job->f->func;
-    int     ftz = job->ftz;
+    fptr func = job->f->func;
+    int ftz = job->ftz;
     cl_uint j, k;
     cl_int error = CL_SUCCESS;
-    cl_int ret   = CL_SUCCESS;
+    cl_int ret = CL_SUCCESS;
     const char *name = job->f->name;
 
     int signbit_test = 0;
-    if(!strcmp(name, "signbit"))
-        signbit_test = 1;
+    if (!strcmp(name, "signbit")) signbit_test = 1;
 
-    #define ref_func(s) ( signbit_test ? func.i_f_f( s ) : func.i_f( s ) )
+#define ref_func(s) (signbit_test ? func.i_f_f(s) : func.i_f(s))
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_int  *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_int *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (cl_int *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
     // Write the new values to the input array
-    cl_uint *p = (cl_uint*) gIn + thread_id * buffer_elements;
-    for( j = 0; j < buffer_elements; j++ )
-        p[j] = base + j * scale;
+    cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++) p[j] = base + j * scale;
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         return error;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             return error;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             return error;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             return error;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
-
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
             return error;
         }
     }
 
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
-        return CL_SUCCESS;
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
 
-    //Calculate the correctly rounded reference result
+    // Calculate the correctly rounded reference result
     cl_int *r = (cl_int *)gOut_Ref + thread_id * buffer_elements;
     float *s = (float *)p;
-    for( j = 0; j < buffer_elements; j++ )
-        r[j] = ref_func( s[j] );
+    for (j = 0; j < buffer_elements; j++) r[j] = ref_func(s[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (cl_int *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
     // Wait for the last buffer
-    out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (cl_int *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                          CL_TRUE, CL_MAP_READ, 0, buffer_size,
+                                          0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         return error;
     }
 
-    //Verify data
+    // Verify data
     cl_int *t = (cl_int *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
-        for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_int *q = out[0];
 
             // If we aren't getting the correctly rounded result
-            if( gMinVectorSizeIndex == 0 && t[j] != q[j])
+            if (gMinVectorSizeIndex == 0 && t[j] != q[j])
             {
                 // If we aren't getting the correctly rounded result
-                if( ftz )
+                if (ftz)
                 {
-                    if( IsFloatSubnormal( s[j]) )
+                    if (IsFloatSubnormal(s[j]))
                     {
-                        int correct = ref_func( +0.0f );
-                        int correct2 = ref_func( -0.0f );
-                        if( correct == q[j] || correct2 == q[j] )
-                            continue;
+                        int correct = ref_func(+0.0f);
+                        int correct2 = ref_func(-0.0f);
+                        if (correct == q[j] || correct2 == q[j]) continue;
                     }
                 }
 
                 uint32_t err = t[j] - q[j];
-                if( q[j] > t[j] )
-                    err = q[j] - t[j];
-                vlog_error( "\nERROR: %s: %d ulp error at %a: *%d vs. %d\n", name,  err, ((float*) s)[j], t[j], q[j] );
+                if (q[j] > t[j]) err = q[j] - t[j];
+                vlog_error("\nERROR: %s: %d ulp error at %a: *%d vs. %d\n",
+                           name, err, ((float *)s)[j], t[j], q[j]);
                 error = -1;
                 goto exit;
             }
 
 
-            for( k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++ )
+            for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
             {
                 q = out[k];
                 // If we aren't getting the correctly rounded result
-                if( -t[j] != q[j] )
+                if (-t[j] != q[j])
                 {
-                    if( ftz )
+                    if (ftz)
                     {
-                        if( IsFloatSubnormal( s[j]))
+                        if (IsFloatSubnormal(s[j]))
                         {
-                            int correct = -ref_func( +0.0f );
-                            int correct2 = -ref_func( -0.0f );
-                            if( correct == q[j] || correct2 == q[j] )
-                                continue;
+                            int correct = -ref_func(+0.0f);
+                            int correct2 = -ref_func(-0.0f);
+                            if (correct == q[j] || correct2 == q[j]) continue;
                         }
                     }
 
                     uint32_t err = -t[j] - q[j];
-                    if( q[j] > -t[j] )
-                        err = q[j] + t[j];
-                    vlog_error( "\nERROR: %s%s: %d ulp error at %a: *%d vs. %d\n", name, sizeNames[k], err, ((float*) s)[j], -t[j], q[j] );
-                  error = -1;
-                  goto exit;
+                    if (q[j] > -t[j]) err = q[j] + t[j];
+                    vlog_error(
+                        "\nERROR: %s%s: %d ulp error at %a: *%d vs. %d\n", name,
+                        sizeNames[k], err, ((float *)s)[j], -t[j], q[j]);
+                    error = -1;
+                    goto exit;
                 }
             }
         }
@@ -579,60 +674,69 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
 
 exit:
     ret = error;
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
+    if ((error = clFlush(tinfo->tQueue)))
     {
-        vlog( "clFlush 3 failed\n" );
+        vlog("clFlush 3 failed\n");
         return error;
     }
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
-       if (gVerboseBruteForce)
-       {
-           vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->threadCount);
-       } else
-       {
-          vlog("." );
-       }
-       fflush(stdout);
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
     }
 
     return ret;
 }
 
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data );
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data);
 
 int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
-    if (gWimpyMode )
+    if (gWimpyMode)
     {
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_double)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -643,52 +747,69 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     test_info.f = f;
     test_info.ftz = f->ftz || gForceFTZ;
 
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            /* Qualcomm fix: 9461 read-write flags must be compatible with parent buffer */
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+            /* Qualcomm fix: 9461 read-write flags must be compatible with
+             * parent buffer */
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
             /* Qualcomm fix: end */
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
     }
@@ -699,117 +820,131 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
-    if( !gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info );
+        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
 
-        if( error )
-            goto exit;
+        if (error) goto exit;
 
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         cl_ulong *p = (cl_ulong *)gIn;
-        for( j = 0; j < BUFFER_SIZE / sizeof( cl_double ); j++ )
+        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
             p[j] = DoubleFromUInt32(genrand_int32(d));
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    vlog( "\n" );
+    vlog("\n");
 
 exit:
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
             clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
 
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t  buffer_elements = job->subBufferSize;
-    size_t  buffer_size = buffer_elements * sizeof( cl_double );
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_double);
     cl_uint scale = job->scale;
-    cl_uint base = job_id * (cl_uint) job->step;
+    cl_uint base = job_id * (cl_uint)job->step;
     ThreadInfo *tinfo = job->tinfo + thread_id;
-    dptr    dfunc = job->f->dfunc;
+    dptr dfunc = job->f->dfunc;
     cl_uint j, k;
     cl_int error;
     int ftz = job->ftz;
@@ -818,189 +953,209 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
     Force64BitFPUPrecision();
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_long *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_long *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (cl_long *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
     // Write the new values to the input array
-    cl_double *p = (cl_double*) gIn + thread_id * buffer_elements;
-    for( j = 0; j < buffer_elements; j++ )
-        p[j] = DoubleFromUInt32( base + j * scale);
+    cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+        p[j] = DoubleFromUInt32(base + j * scale);
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         return error;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             return error;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             return error;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             return error;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
-
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
             return error;
         }
     }
 
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
-        return CL_SUCCESS;
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
 
-    //Calculate the correctly rounded reference result
+    // Calculate the correctly rounded reference result
     cl_long *r = (cl_long *)gOut_Ref + thread_id * buffer_elements;
     cl_double *s = (cl_double *)p;
-    for( j = 0; j < buffer_elements; j++ )
-        r[j] = dfunc.i_f( s[j] );
+    for (j = 0; j < buffer_elements; j++) r[j] = dfunc.i_f(s[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (cl_long *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
     // Wait for the last buffer
-    out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (cl_long *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
+                                           0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         return error;
     }
 
 
-    //Verify data
+    // Verify data
     cl_long *t = (cl_long *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
         cl_long *q = out[0];
 
 
         // If we aren't getting the correctly rounded result
-        if( gMinVectorSizeIndex == 0 && t[j] != q[j])
+        if (gMinVectorSizeIndex == 0 && t[j] != q[j])
         {
             // If we aren't getting the correctly rounded result
-            if( ftz )
+            if (ftz)
             {
-                if( IsDoubleSubnormal( s[j]) )
+                if (IsDoubleSubnormal(s[j]))
                 {
-                    cl_long correct = dfunc.i_f( +0.0f );
-                    cl_long correct2 = dfunc.i_f( -0.0f );
-                    if( correct == q[j] || correct2 == q[j] )
-                        continue;
+                    cl_long correct = dfunc.i_f(+0.0f);
+                    cl_long correct2 = dfunc.i_f(-0.0f);
+                    if (correct == q[j] || correct2 == q[j]) continue;
                 }
             }
 
             cl_ulong err = t[j] - q[j];
-            if( q[j] > t[j] )
-                err = q[j] - t[j];
-            vlog_error( "\nERROR: %sD: %zd ulp error at %.13la: *%zd vs. %zd\n", name,  err, ((double*) gIn)[j], t[j], q[j] );
+            if (q[j] > t[j]) err = q[j] - t[j];
+            vlog_error("\nERROR: %sD: %zd ulp error at %.13la: *%zd vs. %zd\n",
+                       name, err, ((double *)gIn)[j], t[j], q[j]);
             return -1;
         }
 
 
-        for( k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++ )
+        for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
         {
             q = out[k];
             // If we aren't getting the correctly rounded result
-            if( -t[j] != q[j] )
+            if (-t[j] != q[j])
             {
-                if( ftz )
+                if (ftz)
                 {
-                    if( IsDoubleSubnormal( s[j]))
+                    if (IsDoubleSubnormal(s[j]))
                     {
-                        int64_t correct = -dfunc.i_f( +0.0f );
-                        int64_t correct2 = -dfunc.i_f( -0.0f );
-                        if( correct == q[j] || correct2 == q[j] )
-                            continue;
+                        int64_t correct = -dfunc.i_f(+0.0f);
+                        int64_t correct2 = -dfunc.i_f(-0.0f);
+                        if (correct == q[j] || correct2 == q[j]) continue;
                     }
                 }
 
                 cl_ulong err = -t[j] - q[j];
-                if( q[j] > -t[j] )
-                    err = q[j] + t[j];
-                vlog_error( "\nERROR: %sD%s: %zd ulp error at %.13la: *%zd vs. %zd\n", name, sizeNames[k], err, ((double*) gIn)[j], -t[j], q[j] );
+                if (q[j] > -t[j]) err = q[j] + t[j];
+                vlog_error(
+                    "\nERROR: %sD%s: %zd ulp error at %.13la: *%zd vs. %zd\n",
+                    name, sizeNames[k], err, ((double *)gIn)[j], -t[j], q[j]);
                 return -1;
             }
         }
-
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
-       if (gVerboseBruteForce)
-       {
-           vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->threadCount);
-       } else
-       {
-          vlog("." );
-       }
-       fflush(stdout);
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
     }
 
     return CL_SUCCESS;
 }
-
-
-
-
diff --git a/test_conformance/math_brute_force/mad.cpp b/test_conformance/math_brute_force/mad.cpp
index ed1d7d53..9292649a 100644
--- a/test_conformance/math_brute_force/mad.cpp
+++ b/test_conformance/math_brute_force/mad.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -31,66 +31,87 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
-    const char *c[] = {
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in1, __global float", sizeNames[vectorSize], "* in2,  __global float", sizeNames[vectorSize], "* in3 )\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in1[i], in2[i], in3[i] );\n"
-                            "}\n"
-                        };
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* in, __global float* in2, __global float* in3)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       float3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
-                            "       float3 f2 = vload3( 0, in3 + 3 * i );\n"
-                            "       f0 = ", name, "( f0, f1, f2 );\n"
-                            "       vstore3( f0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       float3 f0, f1, f2;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-                            "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
-                            "               f2 = (float3)( in3[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
-                            "               f2 = (float3)( in3[3*i], in3[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       f0 = ", name, "( f0, f1, f2 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = f0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = f0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in1, __global float",
+                        sizeNames[vectorSize],
+                        "* in2,  __global float",
+                        sizeNames[vectorSize],
+                        "* in3 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i], in3[i] );\n"
+                        "}\n" };
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global float* in, __global float* in2, "
+        "__global float* in3)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
+        "       float3 f2 = vload3( 0, in3 + 3 * i );\n"
+        "       f0 = ",
+        name,
+        "( f0, f1, f2 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0, f1, f2;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
+        "               f2 = (float3)( in3[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               f2 = (float3)( in3[3*i], in3[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, f1, f2 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
@@ -98,94 +119,119 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
 static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
-    const char *c[] = {
-                            "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in1, __global double", sizeNames[vectorSize], "* in2,  __global double", sizeNames[vectorSize], "* in3 )\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in1[i], in2[i], in3[i] );\n"
-                            "}\n"
-                        };
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global double* in, __global double* in2, __global double* in3)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       double3 d0 = vload3( 0, in + 3 * i );\n"
-                            "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
-                            "       double3 d2 = vload3( 0, in3 + 3 * i );\n"
-                            "       d0 = ", name, "( d0, d1, d2 );\n"
-                            "       vstore3( d0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       double3 d0, d1, d2;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-                            "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
-                            "               d2 = (double3)( in3[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
-                            "               d2 = (double3)( in3[3*i], in3[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       d0 = ", name, "( d0, d1, d2 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = d0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = d0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in1, __global double",
+                        sizeNames[vectorSize],
+                        "* in2,  __global double",
+                        sizeNames[vectorSize],
+                        "* in3 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i], in3[i] );\n"
+                        "}\n" }; // generic: one vector per work-item
+    const char *c3[] = { // 3-wide: scalar buffers + vload3/vstore3
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* in, __global double* in2, "
+        "__global double* in3)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
+        "       double3 d2 = vload3( 0, in3 + 3 * i );\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, d2 );\n"
+        "       vstore3( d0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0, d1, d2;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
+        "               d2 = (double3)( in3[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               d2 = (double3)( in3[3*i], in3[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, d2 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = d0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = d0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3) // 3-wide vectors use the c3 source
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]); // must match the __kernel name above
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_kernel   *kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels; // out: slot [offset + job_id] gets the kernel
+    cl_program *programs; // out: slot [offset + job_id] gets the program
+    const char *nameInCode; // function name spliced into kernel source
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p); // p: BuildKernelInfo *
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p) // builds kernel [offset + job_id]
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p; // unpack job parameters
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernels + i,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p); // p: BuildKernelInfo *
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p) // builds kernel [offset + job_id]
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p; // unpack job parameters
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
                              info->programs + i, info->relaxedMode);
@@ -199,232 +245,356 @@ int TestFunc_mad(const Func *f, MTdata d, bool relaxedMode)
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
-//    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    //    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM &
+    //    gFloatCapabilities);
     float maxErrorVal = 0.0f;
     float maxErrorVal2 = 0.0f;
     float maxErrorVal3 = 0.0f;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(float), bufferSize);
 
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
         return error;
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i,
+       programs + i) ) ) return error;
+    */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
         uint32_t *p2 = (uint32_t *)gIn2;
         uint32_t *p3 = (uint32_t *)gIn3;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
             p2[j] = genrand_int32(d);
             p3[j] = genrand_int32(d);
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; }
-
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         float *r = (float *)gOut_Ref;
         float *s = (float *)gIn;
         float *s2 = (float *)gIn2;
         float *s3 = (float *)gIn3;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
-            r[j] = (float) f->func.f_fff( s[j], s2[j], s3[j] );
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+            r[j] = (float)f->func.f_fff(s[j], s2[j], s3[j]);
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
-        //Verify data  -- Commented out on purpose. no verification possible. MAD is a random number generator.
-/*
-        uint32_t *t = gOut_Ref;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
-        {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
-            {
-                uint32_t *q = gOut[k];
-
-                // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] )
+        // Verify data  -- Commented out on purpose. no verification possible.
+        // MAD is a random number generator.
+        /*
+                uint32_t *t = gOut_Ref;
+                for( j = 0; j < bufferSize / sizeof( float ); j++ )
                 {
-                    float test = ((float*) q)[j];
-                    double correct = f->func.f_fff( s[j], s2[j], s3[j] );
-                    float err = Ulp_Error( test, correct );
-                    int fail = ! (fabsf(err) <= f->float_ulps);
-
-                    if( fail && ftz )
+                    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
                     {
-                        // retry per section 6.5.3.2
-                        if( IsFloatSubnormal(correct) )
-                        { // look at me,
-                            fail = fail && ( test != 0.0f );
-                            if( ! fail )
-                                err = 0.0f;
-                        }
+                        uint32_t *q = gOut[k];
 
-                        // retry per section 6.5.3.3
-                        if( fail && IsFloatSubnormal( s[j] ) )
-                        { // look at me,
-                            double correct2 = f->func.f_fff( 0.0, s2[j], s3[j] );
-                            double correct3 = f->func.f_fff( -0.0, s2[j], s3[j] );
-                            float err2 = Ulp_Error( test, correct2  );
-                            float err3 = Ulp_Error( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                        // If we aren't getting the correctly rounded result
+                        if( t[j] != q[j] )
+                        {
+                            float test = ((float*) q)[j];
+                            double correct = f->func.f_fff( s[j], s2[j], s3[j]
+         ); float err = Ulp_Error( test, correct ); int fail = ! (fabsf(err) <=
+         f->float_ulps);
 
-                            // retry per section 6.5.3.4
-                            if( IsFloatResultSubnormal(correct2, f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) )
-                            { // look at me now,
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
-                            }
-
-                            //try with first two args as zero
-                            if( IsFloatSubnormal( s2[j] ) )
-                            { // its fun to have fun,
-                                correct2 = f->func.f_fff( 0.0, 0.0, s3[j] );
-                                correct3 = f->func.f_fff( -0.0, 0.0, s3[j] );
-                                double correct4 = f->func.f_fff( 0.0, -0.0, s3[j] );
-                                double correct5 = f->func.f_fff( -0.0, -0.0, s3[j] );
-                                err2 = Ulp_Error( test, correct2  );
-                                err3 = Ulp_Error( test, correct3  );
-                                float err4 = Ulp_Error( test, correct4  );
-                                float err5 = Ulp_Error( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps)) &&
-                                                 (!(fabsf(err4) <= f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
-
-                                // retry per section 6.5.3.4
-                                if( IsFloatResultSubnormal(correct2, f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) ||
-                                    IsFloatResultSubnormal(correct4, f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) )
-                                {
-                                    fail = fail && ( test != 0.0f);
+                            if( fail && ftz )
+                            {
+                                // retry per section 6.5.3.2
+                                if( IsFloatSubnormal(correct) )
+                                { // look at me,
+                                    fail = fail && ( test != 0.0f );
                                     if( ! fail )
                                         err = 0.0f;
                                 }
 
-                                if( IsFloatSubnormal( s3[j] )  )
-                                { // but you have to know how!
-                                    correct2 = f->func.f_fff( 0.0, 0.0, 0.0f );
-                                    correct3 = f->func.f_fff( -0.0, 0.0, 0.0f );
-                                    correct4 = f->func.f_fff( 0.0, -0.0, 0.0f );
-                                    correct5 = f->func.f_fff( -0.0, -0.0, 0.0f );
-                                    double correct6 = f->func.f_fff( 0.0, 0.0, -0.0f );
-                                    double correct7 = f->func.f_fff( -0.0, 0.0, -0.0f );
-                                    double correct8 = f->func.f_fff( 0.0, -0.0, -0.0f );
-                                    double correct9 = f->func.f_fff( -0.0, -0.0, -0.0f );
-                                    err2 = Ulp_Error( test, correct2  );
-                                    err3 = Ulp_Error( test, correct3  );
-                                    err4 = Ulp_Error( test, correct4  );
-                                    err5 = Ulp_Error( test, correct5  );
-                                    float err6 = Ulp_Error( test, correct6  );
-                                    float err7 = Ulp_Error( test, correct7  );
-                                    float err8 = Ulp_Error( test, correct8  );
-                                    float err9 = Ulp_Error( test, correct9  );
-                                    fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps)) &&
-                                                     (!(fabsf(err4) <= f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps)) &&
-                                                     (!(fabsf(err5) <= f->float_ulps)) && (!(fabsf(err6) <= f->float_ulps)) &&
-                                                     (!(fabsf(err7) <= f->float_ulps)) && (!(fabsf(err8) <= f->float_ulps)));
-                                    if( fabsf( err2 ) < fabsf(err ) )
+                                // retry per section 6.5.3.3
+                                if( fail && IsFloatSubnormal( s[j] ) )
+                                { // look at me,
+                                    double correct2 = f->func.f_fff( 0.0, s2[j],
+         s3[j] ); double correct3 = f->func.f_fff( -0.0, s2[j], s3[j] ); float
+         err2 = Ulp_Error( test, correct2  ); float err3 = Ulp_Error( test,
+         correct3  ); fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) &&
+         (!(fabsf(err3) <= f->float_ulps))); if( fabsf( err2 ) < fabsf(err ) )
                                         err = err2;
                                     if( fabsf( err3 ) < fabsf(err ) )
                                         err = err3;
-                                    if( fabsf( err4 ) < fabsf(err ) )
-                                        err = err4;
-                                    if( fabsf( err5 ) < fabsf(err ) )
-                                        err = err5;
-                                    if( fabsf( err6 ) < fabsf(err ) )
-                                        err = err6;
-                                    if( fabsf( err7 ) < fabsf(err ) )
-                                        err = err7;
-                                    if( fabsf( err8 ) < fabsf(err ) )
-                                        err = err8;
-                                    if( fabsf( err9 ) < fabsf(err ) )
-                                        err = err9;
 
                                     // retry per section 6.5.3.4
-                                    if( IsFloatResultSubnormal(correct2, f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps )  ||
-                                        IsFloatResultSubnormal(correct4, f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps )  ||
-                                        IsFloatResultSubnormal( correct6, f->float_ulps ) || IsFloatResultSubnormal(correct7, f->float_ulps )  ||
-                                        IsFloatResultSubnormal(correct8, f->float_ulps ) || IsFloatResultSubnormal( correct9, f->float_ulps )  )
+                                    if( IsFloatResultSubnormal(correct2,
+         f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) )
+                                    { // look at me now,
+                                        fail = fail && ( test != 0.0f);
+                                        if( ! fail )
+                                            err = 0.0f;
+                                    }
+
+                                    //try with first two args as zero
+                                    if( IsFloatSubnormal( s2[j] ) )
+                                    { // its fun to have fun,
+                                        correct2 = f->func.f_fff( 0.0, 0.0,
+         s3[j] ); correct3 = f->func.f_fff( -0.0, 0.0, s3[j] ); double correct4
+         = f->func.f_fff( 0.0, -0.0, s3[j] ); double correct5 = f->func.f_fff(
+         -0.0, -0.0, s3[j] ); err2 = Ulp_Error( test, correct2  ); err3 =
+         Ulp_Error( test, correct3  ); float err4 = Ulp_Error( test, correct4 );
+                                        float err5 = Ulp_Error( test, correct5
+         ); fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3)
+         <= f->float_ulps)) &&
+                                                         (!(fabsf(err4) <=
+         f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps))); if( fabsf( err2
+         ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err =
+         err3; if( fabsf( err4 ) < fabsf(err ) ) err = err4; if( fabsf( err5 ) <
+         fabsf(err ) ) err = err5;
+
+                                        // retry per section 6.5.3.4
+                                        if( IsFloatResultSubnormal(correct2,
+         f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) ||
+                                            IsFloatResultSubnormal(correct4,
+         f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) )
+                                        {
+                                            fail = fail && ( test != 0.0f);
+                                            if( ! fail )
+                                                err = 0.0f;
+                                        }
+
+                                        if( IsFloatSubnormal( s3[j] )  )
+                                        { // but you have to know how!
+                                            correct2 = f->func.f_fff( 0.0, 0.0,
+         0.0f ); correct3 = f->func.f_fff( -0.0, 0.0, 0.0f ); correct4 =
+         f->func.f_fff( 0.0, -0.0, 0.0f ); correct5 = f->func.f_fff( -0.0, -0.0,
+         0.0f ); double correct6 = f->func.f_fff( 0.0, 0.0, -0.0f ); double
+         correct7 = f->func.f_fff( -0.0, 0.0, -0.0f ); double correct8 =
+         f->func.f_fff( 0.0, -0.0, -0.0f ); double correct9 = f->func.f_fff(
+         -0.0, -0.0, -0.0f ); err2 = Ulp_Error( test, correct2  ); err3 =
+         Ulp_Error( test, correct3  ); err4 = Ulp_Error( test, correct4  ); err5
+         = Ulp_Error( test, correct5  ); float err6 = Ulp_Error( test, correct6
+         ); float err7 = Ulp_Error( test, correct7  ); float err8 = Ulp_Error(
+         test, correct8  ); float err9 = Ulp_Error( test, correct9  ); fail =
+         fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <=
+         f->float_ulps)) &&
+                                                             (!(fabsf(err4) <=
+         f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps)) &&
+                                                             (!(fabsf(err5) <=
+         f->float_ulps)) && (!(fabsf(err6) <= f->float_ulps)) &&
+                                                             (!(fabsf(err7) <=
+         f->float_ulps)) && (!(fabsf(err8) <= f->float_ulps))); if( fabsf( err2
+         ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err =
+         err3; if( fabsf( err4 ) < fabsf(err ) ) err = err4; if( fabsf( err5 ) <
+         fabsf(err ) ) err = err5; if( fabsf( err6 ) < fabsf(err ) ) err = err6;
+                                            if( fabsf( err7 ) < fabsf(err ) )
+                                                err = err7;
+                                            if( fabsf( err8 ) < fabsf(err ) )
+                                                err = err8;
+                                            if( fabsf( err9 ) < fabsf(err ) )
+                                                err = err9;
+
+                                            // retry per section 6.5.3.4
+                                            if( IsFloatResultSubnormal(correct2,
+         f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps )  ||
+                                                IsFloatResultSubnormal(correct4,
+         f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps )  ||
+                                                IsFloatResultSubnormal(
+         correct6, f->float_ulps ) || IsFloatResultSubnormal(correct7,
+         f->float_ulps )  || IsFloatResultSubnormal(correct8, f->float_ulps ) ||
+         IsFloatResultSubnormal( correct9, f->float_ulps )  )
+                                            {
+                                                fail = fail && ( test != 0.0f);
+                                                if( ! fail )
+                                                    err = 0.0f;
+                                            }
+                                        }
+                                    }
+                                    else if( IsFloatSubnormal( s3[j] ) )
+                                    {
+                                        correct2 = f->func.f_fff( 0.0, s2[j],
+         0.0 ); correct3 = f->func.f_fff( -0.0, s2[j], 0.0 ); double correct4 =
+         f->func.f_fff( 0.0,  s2[j], -0.0 ); double correct5 = f->func.f_fff(
+         -0.0, s2[j], -0.0 ); err2 = Ulp_Error( test, correct2  ); err3 =
+         Ulp_Error( test, correct3  ); float err4 = Ulp_Error( test, correct4 );
+                                        float err5 = Ulp_Error( test, correct5
+         ); fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3)
+         <= f->float_ulps)) &&
+                                                         (!(fabsf(err4) <=
+         f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps))); if( fabsf( err2
+         ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err =
+         err3; if( fabsf( err4 ) < fabsf(err ) ) err = err4; if( fabsf( err5 ) <
+         fabsf(err ) ) err = err5;
+
+                                        // retry per section 6.5.3.4
+                                        if( IsFloatResultSubnormal(correct2,
+         f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps )  ||
+                                            IsFloatResultSubnormal(correct4,
+         f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) )
+                                        {
+                                            fail = fail && ( test != 0.0f);
+                                            if( ! fail )
+                                                err = 0.0f;
+                                        }
+                                    }
+                                }
+                                else if( fail && IsFloatSubnormal( s2[j] ) )
+                                {
+                                    double correct2 = f->func.f_fff( s[j], 0.0,
+         s3[j] ); double correct3 = f->func.f_fff( s[j], -0.0, s3[j] ); float
+         err2 = Ulp_Error( test, correct2  ); float err3 = Ulp_Error( test,
+         correct3  ); fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) &&
+         (!(fabsf(err3) <= f->float_ulps))); if( fabsf( err2 ) < fabsf(err ) )
+                                        err = err2;
+                                    if( fabsf( err3 ) < fabsf(err ) )
+                                        err = err3;
+
+                                    // retry per section 6.5.3.4
+                                    if( IsFloatResultSubnormal(correct2,
+         f->float_ulps )  || IsFloatResultSubnormal(correct3, f->float_ulps ) )
+                                    {
+                                        fail = fail && ( test != 0.0f);
+                                        if( ! fail )
+                                            err = 0.0f;
+                                    }
+
+                                    //try with second two args as zero
+                                    if( IsFloatSubnormal( s3[j] ) )
+                                    {
+                                        correct2 = f->func.f_fff( s[j], 0.0, 0.0
+         ); correct3 = f->func.f_fff( s[j], -0.0, 0.0 ); double correct4 =
+         f->func.f_fff( s[j], 0.0, -0.0 ); double correct5 = f->func.f_fff(
+         s[j], -0.0, -0.0 ); err2 = Ulp_Error( test, correct2  ); err3 =
+         Ulp_Error( test, correct3  ); float err4 = Ulp_Error( test, correct4 );
+                                        float err5 = Ulp_Error( test, correct5
+         ); fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3)
+         <= f->float_ulps)) &&
+                                                         (!(fabsf(err4) <=
+         f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps))); if( fabsf( err2
+         ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err =
+         err3; if( fabsf( err4 ) < fabsf(err ) ) err = err4; if( fabsf( err5 ) <
+         fabsf(err ) ) err = err5;
+
+                                        // retry per section 6.5.3.4
+                                        if( IsFloatResultSubnormal(correct2,
+         f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) ||
+                                            IsFloatResultSubnormal(correct4,
+         f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) )
+                                        {
+                                            fail = fail && ( test != 0.0f);
+                                            if( ! fail )
+                                                err = 0.0f;
+                                        }
+                                    }
+                                }
+                                else if( fail && IsFloatSubnormal(s3[j]) )
+                                {
+                                    double correct2 = f->func.f_fff( s[j],
+         s2[j], 0.0 ); double correct3 = f->func.f_fff( s[j], s2[j], -0.0 );
+                                    float err2 = Ulp_Error( test, correct2  );
+                                    float err3 = Ulp_Error( test, correct3  );
+                                    fail =  fail && ((!(fabsf(err2) <=
+         f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps))); if( fabsf( err2
+         ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err =
+         err3;
+
+                                    // retry per section 6.5.3.4
+                                    if( IsFloatResultSubnormal(correct2,
+         f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) )
                                     {
                                         fail = fail && ( test != 0.0f);
                                         if( ! fail )
@@ -432,222 +602,146 @@ int TestFunc_mad(const Func *f, MTdata d, bool relaxedMode)
                                     }
                                 }
                             }
-                            else if( IsFloatSubnormal( s3[j] ) )
-                            {
-                                correct2 = f->func.f_fff( 0.0, s2[j], 0.0 );
-                                correct3 = f->func.f_fff( -0.0, s2[j], 0.0 );
-                                double correct4 = f->func.f_fff( 0.0,  s2[j], -0.0 );
-                                double correct5 = f->func.f_fff( -0.0, s2[j], -0.0 );
-                                err2 = Ulp_Error( test, correct2  );
-                                err3 = Ulp_Error( test, correct3  );
-                                float err4 = Ulp_Error( test, correct4  );
-                                float err5 = Ulp_Error( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps)) &&
-                                                 (!(fabsf(err4) <= f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
 
-                                // retry per section 6.5.3.4
-                                if( IsFloatResultSubnormal(correct2, f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps )  ||
-                                    IsFloatResultSubnormal(correct4, f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) )
-                                {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
-                                }
+                            if( fabsf(err ) > maxError )
+                            {
+                                maxError = fabsf(err);
+                                maxErrorVal = s[j];
+                                maxErrorVal2 = s2[j];
+                                maxErrorVal3 = s3[j];
+                            }
+
+                            if( fail )
+                            {
+                                vlog_error( "\nERROR: %s%s: %f ulp error at {%a,
+         %a, %a}: *%a vs. %a\n", f->name, sizeNames[k], err, s[j], s2[j], s3[j],
+         ((float*) gOut_Ref)[j], test ); error = -1; goto exit;
                             }
                         }
-                        else if( fail && IsFloatSubnormal( s2[j] ) )
-                        {
-                            double correct2 = f->func.f_fff( s[j], 0.0, s3[j] );
-                            double correct3 = f->func.f_fff( s[j], -0.0, s3[j] );
-                            float err2 = Ulp_Error( test, correct2  );
-                            float err3 = Ulp_Error( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-
-                            // retry per section 6.5.3.4
-                            if( IsFloatResultSubnormal(correct2, f->float_ulps )  || IsFloatResultSubnormal(correct3, f->float_ulps ) )
-                            {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
-                            }
-
-                            //try with second two args as zero
-                            if( IsFloatSubnormal( s3[j] ) )
-                            {
-                                correct2 = f->func.f_fff( s[j], 0.0, 0.0 );
-                                correct3 = f->func.f_fff( s[j], -0.0, 0.0 );
-                                double correct4 = f->func.f_fff( s[j], 0.0, -0.0 );
-                                double correct5 = f->func.f_fff( s[j], -0.0, -0.0 );
-                                err2 = Ulp_Error( test, correct2  );
-                                err3 = Ulp_Error( test, correct3  );
-                                float err4 = Ulp_Error( test, correct4  );
-                                float err5 = Ulp_Error( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps)) &&
-                                                 (!(fabsf(err4) <= f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
-
-                                // retry per section 6.5.3.4
-                                if( IsFloatResultSubnormal(correct2, f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) ||
-                                    IsFloatResultSubnormal(correct4, f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) )
-                                {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
-                                }
-                            }
-                        }
-                        else if( fail && IsFloatSubnormal(s3[j]) )
-                        {
-                            double correct2 = f->func.f_fff( s[j], s2[j], 0.0 );
-                            double correct3 = f->func.f_fff( s[j], s2[j], -0.0 );
-                            float err2 = Ulp_Error( test, correct2  );
-                            float err3 = Ulp_Error( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-
-                            // retry per section 6.5.3.4
-                            if( IsFloatResultSubnormal(correct2, f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) )
-                            {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
-                            }
-                        }
-                    }
-
-                    if( fabsf(err ) > maxError )
-                    {
-                        maxError = fabsf(err);
-                        maxErrorVal = s[j];
-                        maxErrorVal2 = s2[j];
-                        maxErrorVal3 = s3[j];
-                    }
-
-                    if( fail )
-                    {
-                        vlog_error( "\nERROR: %s%s: %f ulp error at {%a, %a, %a}: *%a vs. %a\n", f->name, sizeNames[k], err, s[j], s2[j], s3[j], ((float*) gOut_Ref)[j], test );
- error = -1;
- goto exit;
                     }
                 }
-            }
-        }
-*/
-        if( 0 == (i & 0x0fffffff) )
+        */
+        if (0 == (i & 0x0fffffff))
         {
-            vlog("." );
+            vlog(".");
             fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "pass" );
+            vlog("pass");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
         uint32_t *p2 = (uint32_t *)gIn2;
         uint32_t *p3 = (uint32_t *)gIn3;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
             p2[j] = genrand_int32(d);
             p3[j] = genrand_int32(d);
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, maxErrorVal3 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
+             maxErrorVal3);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -661,14 +755,14 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
-//    int ftz = f->ftz || gForceFTZ;
+    //    int ftz = f->ftz || gForceFTZ;
     double maxErrorVal = 0.0f;
     double maxErrorVal2 = 0.0f;
     double maxErrorVal3 = 0.0f;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
     uint64_t step = getTestStep(sizeof(double), bufferSize);
@@ -676,223 +770,363 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
-                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
     {
         return error;
     }
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels +
+       i, programs + i) ) ) return error;
+    */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
         double *p2 = (double *)gIn2;
         double *p3 = (double *)gIn3;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(double); j++)
         {
             p[j] = DoubleFromUInt32(genrand_int32(d));
             p2[j] = DoubleFromUInt32(genrand_int32(d));
             p3[j] = DoubleFromUInt32(genrand_int32(d));
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_double ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; }
-
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         double *r = (double *)gOut_Ref;
         double *s = (double *)gIn;
         double *s2 = (double *)gIn2;
         double *s3 = (double *)gIn3;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
-            r[j] = (double) f->dfunc.f_fff( s[j], s2[j], s3[j] );
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+            r[j] = (double)f->dfunc.f_fff(s[j], s2[j], s3[j]);
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
-        //Verify data  -- Commented out on purpose. no verification possible. MAD is a random number generator.
-/*
-        uint64_t *t = gOut_Ref;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
-        {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
-            {
-                uint64_t *q = gOut[k];
-
-                // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] )
+        // Verify data -- commented out on purpose. No verification is possible:
+        // MAD is a random number generator.
+        /*
+                uint64_t *t = gOut_Ref;
+                for( j = 0; j < bufferSize / sizeof( double ); j++ )
                 {
-                    double test = ((double*) q)[j];
-                    long double correct = f->dfunc.f_fff( s[j], s2[j], s3[j] );
-                    float err = Bruteforce_Ulp_Error_Double( test, correct );
-                    int fail = ! (fabsf(err) <= f->double_ulps);
-
-                    if( fail && ftz )
+                    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
                     {
-                        // retry per section 6.5.3.2
-                        if( IsDoubleResultSubnormal(correct, f->double_ulps) )
-                        { // look at me,
-                            fail = fail && ( test != 0.0f );
-                            if( ! fail )
-                                err = 0.0f;
-                        }
+                        uint64_t *q = gOut[k];
 
-                        // retry per section 6.5.3.3
-                        if( fail && IsDoubleSubnormal( s[j] ) )
-                        { // look at me,
-                            long double correct2 = f->dfunc.f_fff( 0.0, s2[j], s3[j] );
-                            long double correct3 = f->dfunc.f_fff( -0.0, s2[j], s3[j] );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                        // If we aren't getting the correctly rounded result
+                        if( t[j] != q[j] )
+                        {
+                            double test = ((double*) q)[j];
+                            long double correct = f->dfunc.f_fff( s[j], s2[j],
+         s3[j] ); float err = Bruteforce_Ulp_Error_Double( test, correct ); int
+         fail = ! (fabsf(err) <= f->double_ulps);
 
-                            // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) )
-                            { // look at me now,
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
-                            }
-
-                            //try with first two args as zero
-                            if( IsDoubleSubnormal( s2[j] ) )
-                            { // its fun to have fun,
-                                correct2 = f->dfunc.f_fff( 0.0, 0.0, s3[j] );
-                                correct3 = f->dfunc.f_fff( -0.0, 0.0, s3[j] );
-                                long double correct4 = f->dfunc.f_fff( 0.0, -0.0, s3[j] );
-                                long double correct5 = f->dfunc.f_fff( -0.0, -0.0, s3[j] );
-                                err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                                err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                                float err4 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                                float err5 = Bruteforce_Ulp_Error_Double( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) &&
-                                                 (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
-
-                                // retry per section 6.5.3.4
-                                if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) ||
-                                    IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps ) )
-                                {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
+                            if( fail && ftz )
+                            {
+                                // retry per section 6.5.3.2
+                                if( IsDoubleResultSubnormal(correct,
+         f->double_ulps) ) { // look at me, fail = fail && ( test != 0.0f ); if(
+         ! fail ) err = 0.0f;
                                 }
 
-                                if( IsDoubleSubnormal( s3[j] )  )
-                                { // but you have to know how!
-                                    correct2 = f->dfunc.f_fff( 0.0, 0.0, 0.0f );
-                                    correct3 = f->dfunc.f_fff( -0.0, 0.0, 0.0f );
-                                    correct4 = f->dfunc.f_fff( 0.0, -0.0, 0.0f );
-                                    correct5 = f->dfunc.f_fff( -0.0, -0.0, 0.0f );
-                                    long double correct6 = f->dfunc.f_fff( 0.0, 0.0, -0.0f );
-                                    long double correct7 = f->dfunc.f_fff( -0.0, 0.0, -0.0f );
-                                    long double correct8 = f->dfunc.f_fff( 0.0, -0.0, -0.0f );
-                                    long double correct9 = f->dfunc.f_fff( -0.0, -0.0, -0.0f );
-                                    err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                                    err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                                    err4 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                                    err5 = Bruteforce_Ulp_Error_Double( test, correct5  );
-                                    float err6 = Bruteforce_Ulp_Error_Double( test, correct6  );
-                                    float err7 = Bruteforce_Ulp_Error_Double( test, correct7  );
-                                    float err8 = Bruteforce_Ulp_Error_Double( test, correct8  );
-                                    float err9 = Bruteforce_Ulp_Error_Double( test, correct9  );
-                                    fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) &&
-                                                     (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)) &&
-                                                     (!(fabsf(err5) <= f->double_ulps)) && (!(fabsf(err6) <= f->double_ulps)) &&
-                                                     (!(fabsf(err7) <= f->double_ulps)) && (!(fabsf(err8) <= f->double_ulps)));
-                                    if( fabsf( err2 ) < fabsf(err ) )
-                                        err = err2;
-                                    if( fabsf( err3 ) < fabsf(err ) )
-                                        err = err3;
-                                    if( fabsf( err4 ) < fabsf(err ) )
-                                        err = err4;
-                                    if( fabsf( err5 ) < fabsf(err ) )
-                                        err = err5;
-                                    if( fabsf( err6 ) < fabsf(err ) )
-                                        err = err6;
-                                    if( fabsf( err7 ) < fabsf(err ) )
-                                        err = err7;
-                                    if( fabsf( err8 ) < fabsf(err ) )
-                                        err = err8;
-                                    if( fabsf( err9 ) < fabsf(err ) )
-                                        err = err9;
+                                // retry per section 6.5.3.3
+                                if( fail && IsDoubleSubnormal( s[j] ) )
+                                { // look at me,
+                                    long double correct2 = f->dfunc.f_fff( 0.0,
+         s2[j], s3[j] ); long double correct3 = f->dfunc.f_fff( -0.0, s2[j],
+         s3[j] ); float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
+                                    float err3 = Bruteforce_Ulp_Error_Double(
+         test, correct3  ); fail =  fail && ((!(fabsf(err2) <= f->double_ulps))
+         && (!(fabsf(err3) <= f->double_ulps))); if( fabsf( err2 ) < fabsf(err )
+         ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err = err3;
 
                                     // retry per section 6.5.3.4
-                                    if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )  ||
-                                        IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps )  ||
-                                        IsDoubleResultSubnormal( correct6, f->double_ulps ) || IsDoubleResultSubnormal( correct7, f->double_ulps )  ||
-                                        IsDoubleResultSubnormal( correct8, f->double_ulps ) || IsDoubleResultSubnormal( correct9, f->double_ulps )  )
+                                    if( IsDoubleResultSubnormal( correct2,
+         f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )
+         ) { // look at me now, fail = fail && ( test != 0.0f); if( ! fail ) err
+         = 0.0f;
+                                    }
+
+                                    //try with first two args as zero
+                                    if( IsDoubleSubnormal( s2[j] ) )
+                                    { // its fun to have fun,
+                                        correct2 = f->dfunc.f_fff( 0.0, 0.0,
+         s3[j] ); correct3 = f->dfunc.f_fff( -0.0, 0.0, s3[j] ); long double
+         correct4 = f->dfunc.f_fff( 0.0, -0.0, s3[j] ); long double correct5 =
+         f->dfunc.f_fff( -0.0, -0.0, s3[j] ); err2 =
+         Bruteforce_Ulp_Error_Double( test, correct2  ); err3 =
+         Bruteforce_Ulp_Error_Double( test, correct3  ); float err4 =
+         Bruteforce_Ulp_Error_Double( test, correct4  ); float err5 =
+         Bruteforce_Ulp_Error_Double( test, correct5  ); fail =  fail &&
+         ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <=
+         f->double_ulps)) &&
+                                                         (!(fabsf(err4) <=
+         f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps))); if( fabsf(
+         err2 ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) )
+                                            err = err3;
+                                        if( fabsf( err4 ) < fabsf(err ) )
+                                            err = err4;
+                                        if( fabsf( err5 ) < fabsf(err ) )
+                                            err = err5;
+
+                                        // retry per section 6.5.3.4
+                                        if( IsDoubleResultSubnormal( correct2,
+         f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )
+         || IsDoubleResultSubnormal( correct4, f->double_ulps ) ||
+         IsDoubleResultSubnormal( correct5, f->double_ulps ) )
+                                        {
+                                            fail = fail && ( test != 0.0f);
+                                            if( ! fail )
+                                                err = 0.0f;
+                                        }
+
+                                        if( IsDoubleSubnormal( s3[j] )  )
+                                        { // but you have to know how!
+                                            correct2 = f->dfunc.f_fff( 0.0, 0.0,
+         0.0f ); correct3 = f->dfunc.f_fff( -0.0, 0.0, 0.0f ); correct4 =
+         f->dfunc.f_fff( 0.0, -0.0, 0.0f ); correct5 = f->dfunc.f_fff( -0.0,
+         -0.0, 0.0f ); long double correct6 = f->dfunc.f_fff( 0.0, 0.0, -0.0f );
+                                            long double correct7 =
+         f->dfunc.f_fff( -0.0, 0.0, -0.0f ); long double correct8 =
+         f->dfunc.f_fff( 0.0, -0.0, -0.0f ); long double correct9 =
+         f->dfunc.f_fff( -0.0, -0.0, -0.0f ); err2 =
+         Bruteforce_Ulp_Error_Double( test, correct2  ); err3 =
+         Bruteforce_Ulp_Error_Double( test, correct3  ); err4 =
+         Bruteforce_Ulp_Error_Double( test, correct4  ); err5 =
+         Bruteforce_Ulp_Error_Double( test, correct5  ); float err6 =
+         Bruteforce_Ulp_Error_Double( test, correct6  ); float err7 =
+         Bruteforce_Ulp_Error_Double( test, correct7  ); float err8 =
+         Bruteforce_Ulp_Error_Double( test, correct8  ); float err9 =
+         Bruteforce_Ulp_Error_Double( test, correct9  ); fail =  fail &&
+         ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <=
+         f->double_ulps)) &&
+                                                             (!(fabsf(err4) <=
+         f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)) &&
+                                                             (!(fabsf(err5) <=
+         f->double_ulps)) && (!(fabsf(err6) <= f->double_ulps)) &&
+                                                             (!(fabsf(err7) <=
+         f->double_ulps)) && (!(fabsf(err8) <= f->double_ulps))); if( fabsf(
+         err2 ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) )
+                                                err = err3;
+                                            if( fabsf( err4 ) < fabsf(err ) )
+                                                err = err4;
+                                            if( fabsf( err5 ) < fabsf(err ) )
+                                                err = err5;
+                                            if( fabsf( err6 ) < fabsf(err ) )
+                                                err = err6;
+                                            if( fabsf( err7 ) < fabsf(err ) )
+                                                err = err7;
+                                            if( fabsf( err8 ) < fabsf(err ) )
+                                                err = err8;
+                                            if( fabsf( err9 ) < fabsf(err ) )
+                                                err = err9;
+
+                                            // retry per section 6.5.3.4
+                                            if( IsDoubleResultSubnormal(
+         correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3,
+         f->double_ulps )  || IsDoubleResultSubnormal( correct4, f->double_ulps
+         ) || IsDoubleResultSubnormal( correct5, f->double_ulps )  ||
+                                                IsDoubleResultSubnormal(
+         correct6, f->double_ulps ) || IsDoubleResultSubnormal( correct7,
+         f->double_ulps )  || IsDoubleResultSubnormal( correct8, f->double_ulps
+         ) || IsDoubleResultSubnormal( correct9, f->double_ulps )  )
+                                            {
+                                                fail = fail && ( test != 0.0f);
+                                                if( ! fail )
+                                                    err = 0.0f;
+                                            }
+                                        }
+                                    }
+                                    else if( IsDoubleSubnormal( s3[j] ) )
+                                    {
+                                        correct2 = f->dfunc.f_fff( 0.0, s2[j],
+         0.0 ); correct3 = f->dfunc.f_fff( -0.0, s2[j], 0.0 ); long double
+         correct4 = f->dfunc.f_fff( 0.0,  s2[j], -0.0 ); long double correct5 =
+         f->dfunc.f_fff( -0.0, s2[j], -0.0 ); err2 =
+         Bruteforce_Ulp_Error_Double( test, correct2  ); err3 =
+         Bruteforce_Ulp_Error_Double( test, correct3  ); float err4 =
+         Bruteforce_Ulp_Error_Double( test, correct4  ); float err5 =
+         Bruteforce_Ulp_Error_Double( test, correct5  ); fail =  fail &&
+         ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <=
+         f->double_ulps)) &&
+                                                         (!(fabsf(err4) <=
+         f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps))); if( fabsf(
+         err2 ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) )
+                                            err = err3;
+                                        if( fabsf( err4 ) < fabsf(err ) )
+                                            err = err4;
+                                        if( fabsf( err5 ) < fabsf(err ) )
+                                            err = err5;
+
+                                        // retry per section 6.5.3.4
+                                        if( IsDoubleResultSubnormal( correct2,
+         f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )
+         || IsDoubleResultSubnormal( correct4, f->double_ulps ) ||
+         IsDoubleResultSubnormal( correct5, f->double_ulps ) )
+                                        {
+                                            fail = fail && ( test != 0.0f);
+                                            if( ! fail )
+                                                err = 0.0f;
+                                        }
+                                    }
+                                }
+                                else if( fail && IsDoubleSubnormal( s2[j] ) )
+                                {
+                                    long double correct2 = f->dfunc.f_fff( s[j],
+         0.0, s3[j] ); long double correct3 = f->dfunc.f_fff( s[j], -0.0, s3[j]
+         ); float err2 = Bruteforce_Ulp_Error_Double( test, correct2  ); float
+         err3 = Bruteforce_Ulp_Error_Double( test, correct3  ); fail =  fail &&
+         ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <=
+         f->double_ulps))); if( fabsf( err2 ) < fabsf(err ) ) err = err2; if(
+         fabsf( err3 ) < fabsf(err ) ) err = err3;
+
+                                    // retry per section 6.5.3.4
+                                    if( IsDoubleResultSubnormal( correct2,
+         f->double_ulps )  || IsDoubleResultSubnormal( correct3, f->double_ulps
+         ) )
+                                    {
+                                        fail = fail && ( test != 0.0f);
+                                        if( ! fail )
+                                            err = 0.0f;
+                                    }
+
+                                    //try with second two args as zero
+                                    if( IsDoubleSubnormal( s3[j] ) )
+                                    {
+                                        correct2 = f->dfunc.f_fff( s[j], 0.0,
+         0.0 ); correct3 = f->dfunc.f_fff( s[j], -0.0, 0.0 ); long double
+         correct4 = f->dfunc.f_fff( s[j], 0.0, -0.0 ); long double correct5 =
+         f->dfunc.f_fff( s[j], -0.0, -0.0 ); err2 = Bruteforce_Ulp_Error_Double(
+         test, correct2  ); err3 = Bruteforce_Ulp_Error_Double( test, correct3
+         ); float err4 = Bruteforce_Ulp_Error_Double( test, correct4  ); float
+         err5 = Bruteforce_Ulp_Error_Double( test, correct5  ); fail =  fail &&
+         ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <=
+         f->double_ulps)) &&
+                                                         (!(fabsf(err4) <=
+         f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps))); if( fabsf(
+         err2 ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) )
+                                            err = err3;
+                                        if( fabsf( err4 ) < fabsf(err ) )
+                                            err = err4;
+                                        if( fabsf( err5 ) < fabsf(err ) )
+                                            err = err5;
+
+                                        // retry per section 6.5.3.4
+                                        if( IsDoubleResultSubnormal( correct2,
+         f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )
+         || IsDoubleResultSubnormal( correct4, f->double_ulps ) ||
+         IsDoubleResultSubnormal( correct5, f->double_ulps ) )
+                                        {
+                                            fail = fail && ( test != 0.0f);
+                                            if( ! fail )
+                                                err = 0.0f;
+                                        }
+                                    }
+                                }
+                                else if( fail && IsDoubleSubnormal(s3[j]) )
+                                {
+                                    long double correct2 = f->dfunc.f_fff( s[j],
+         s2[j], 0.0 ); long double correct3 = f->dfunc.f_fff( s[j], s2[j], -0.0
+         ); float err2 = Bruteforce_Ulp_Error_Double( test, correct2  ); float
+         err3 = Bruteforce_Ulp_Error_Double( test, correct3  ); fail =  fail &&
+         ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <=
+         f->double_ulps))); if( fabsf( err2 ) < fabsf(err ) ) err = err2; if(
+         fabsf( err3 ) < fabsf(err ) ) err = err3;
+
+                                    // retry per section 6.5.3.4
+                                    if( IsDoubleResultSubnormal( correct2,
+         f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )
+         )
                                     {
                                         fail = fail && ( test != 0.0f);
                                         if( ! fail )
@@ -900,224 +1134,147 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
                                     }
                                 }
                             }
-                            else if( IsDoubleSubnormal( s3[j] ) )
-                            {
-                                correct2 = f->dfunc.f_fff( 0.0, s2[j], 0.0 );
-                                correct3 = f->dfunc.f_fff( -0.0, s2[j], 0.0 );
-                                long double correct4 = f->dfunc.f_fff( 0.0,  s2[j], -0.0 );
-                                long double correct5 = f->dfunc.f_fff( -0.0, s2[j], -0.0 );
-                                err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                                err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                                float err4 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                                float err5 = Bruteforce_Ulp_Error_Double( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) &&
-                                                 (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
 
-                                // retry per section 6.5.3.4
-                                if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )  ||
-                                    IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps ) )
-                                {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
-                                }
+                            if( fabsf(err ) > maxError )
+                            {
+                                maxError = fabsf(err);
+                                maxErrorVal = s[j];
+                                maxErrorVal2 = s2[j];
+                                maxErrorVal3 = s3[j];
+                            }
+
+                            if( fail )
+                            {
+                                vlog_error( "\nERROR: %sD%s: %f ulp error at
+         {%a, %a, %a}: *%a vs. %a\n", f->name, sizeNames[k], err, s[j], s2[j],
+         s3[j], ((double*) gOut_Ref)[j], test ); error = -1; goto exit;
                             }
                         }
-                        else if( fail && IsDoubleSubnormal( s2[j] ) )
-                        {
-                            long double correct2 = f->dfunc.f_fff( s[j], 0.0, s3[j] );
-                            long double correct3 = f->dfunc.f_fff( s[j], -0.0, s3[j] );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-
-                            // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, f->double_ulps )  || IsDoubleResultSubnormal( correct3, f->double_ulps ) )
-                            {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
-                            }
-
-                            //try with second two args as zero
-                            if( IsDoubleSubnormal( s3[j] ) )
-                            {
-                                correct2 = f->dfunc.f_fff( s[j], 0.0, 0.0 );
-                                correct3 = f->dfunc.f_fff( s[j], -0.0, 0.0 );
-                                long double correct4 = f->dfunc.f_fff( s[j], 0.0, -0.0 );
-                                long double correct5 = f->dfunc.f_fff( s[j], -0.0, -0.0 );
-                                err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                                err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                                float err4 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                                float err5 = Bruteforce_Ulp_Error_Double( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) &&
-                                                 (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
-
-                                // retry per section 6.5.3.4
-                                if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) ||
-                                    IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps ) )
-                                {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
-                                }
-                            }
-                        }
-                        else if( fail && IsDoubleSubnormal(s3[j]) )
-                        {
-                            long double correct2 = f->dfunc.f_fff( s[j], s2[j], 0.0 );
-                            long double correct3 = f->dfunc.f_fff( s[j], s2[j], -0.0 );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
-
-                            // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) )
-                            {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
-                            }
-                        }
-                    }
-
-                    if( fabsf(err ) > maxError )
-                    {
-                        maxError = fabsf(err);
-                        maxErrorVal = s[j];
-                        maxErrorVal2 = s2[j];
-                        maxErrorVal3 = s3[j];
-                    }
-
-                    if( fail )
-                    {
-                        vlog_error( "\nERROR: %sD%s: %f ulp error at {%a, %a, %a}: *%a vs. %a\n", f->name, sizeNames[k], err, s[j], s2[j], s3[j], ((double*) gOut_Ref)[j], test );
- error = -1;
- goto exit;
                     }
                 }
-            }
-        }
-*/
-        if( 0 == (i & 0x0fffffff) )
+        */
+        if (0 == (i & 0x0fffffff))
         {
-            vlog("." );
+            vlog(".");
             fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "pass" );
+            vlog("pass");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
         double *p2 = (double *)gIn2;
         double *p3 = (double *)gIn3;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(double); j++)
         {
             p[j] = DoubleFromUInt32(genrand_int32(d));
             p2[j] = DoubleFromUInt32(genrand_int32(d));
             p3[j] = DoubleFromUInt32(genrand_int32(d));
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_double ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, maxErrorVal3 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
+             maxErrorVal3);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -1125,6 +1282,3 @@ exit:
 
     return error;
 }
-
-
-
diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index d7f2ebf6..ca58f2e5 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -27,116 +27,122 @@
 #include "harness/parseParameters.h"
 #include "harness/typeWrappers.h"
 
-#if defined( __APPLE__ )
-    #include <sys/sysctl.h>
-    #include <sys/mman.h>
-    #include <libgen.h>
-    #include <sys/time.h>
-#elif defined( __linux__ )
-    #include <unistd.h>
-    #include <sys/syscall.h>
-    #include <linux/sysctl.h>
-    #include <sys/param.h>
+#if defined(__APPLE__)
+#include <sys/sysctl.h>
+#include <sys/mman.h>
+#include <libgen.h>
+#include <sys/time.h>
+#elif defined(__linux__)
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <linux/sysctl.h>
+#include <sys/param.h>
 #endif
 
-#if defined (__linux__) || (defined WIN32 && defined __MINGW32__)
+#if defined(__linux__) || (defined WIN32 && defined __MINGW32__)
 #include <sys/param.h>
 #endif
 
 #include "harness/testHarness.h"
 
-#define kPageSize           4096
-#define DOUBLE_REQUIRED_FEATURES    ( CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM  )
+#define kPageSize 4096
+#define DOUBLE_REQUIRED_FEATURES                                               \
+    (CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO                  \
+     | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM)
 
-const char      **gTestNames = NULL;
-unsigned int    gTestNameCount = 0;
-char            appName[ MAXPATHLEN ] = "";
-cl_device_id    gDevice = NULL;
-cl_context      gContext = NULL;
+const char **gTestNames = NULL;
+unsigned int gTestNameCount = 0;
+char appName[MAXPATHLEN] = "";
+cl_device_id gDevice = NULL;
+cl_context gContext = NULL;
 cl_command_queue gQueue = NULL;
-static int32_t  gStartTestNumber;
-static int32_t  gEndTestNumber;
-int             gSkipCorrectnessTesting = 0;
-int             gStopOnError = 0;
-static bool     gSkipRestOfTests;
-#if defined( __APPLE__ )
-int             gMeasureTimes = 1;
+static int32_t gStartTestNumber;
+static int32_t gEndTestNumber;
+int gSkipCorrectnessTesting = 0;
+int gStopOnError = 0;
+static bool gSkipRestOfTests;
+#if defined(__APPLE__)
+int gMeasureTimes = 1;
 #else
-int             gMeasureTimes = 0;
+int gMeasureTimes = 0;
 #endif
-int             gReportAverageTimes = 0;
-int             gForceFTZ = 0;
-int             gWimpyMode = 0;
-int             gHasDouble = 0;
-int             gTestFloat = 1;
+int gReportAverageTimes = 0;
+int gForceFTZ = 0;
+int gWimpyMode = 0;
+int gHasDouble = 0;
+int gTestFloat = 1;
 // This flag should be 'ON' by default and it can be changed through the command
 // line arguments.
 static int gTestFastRelaxed = 1;
-/*This flag corresponds to defining if the implementation has Derived Fast Relaxed functions.
-  The spec does not specify ULP for derived function.  The derived functions are composed of base functions which are tested for ULP, thus when this flag is enabled,
-  Derived functions will not be tested for ULP, as per table 7.1 of OpenCL 2.0 spec.
-  Since there is no way of quering the device whether it is a derived or non-derived implementation according to OpenCL 2.0 spec then it has to be changed through a command line argument.
+/*This flag corresponds to defining if the implementation has Derived Fast
+  Relaxed functions. The spec does not specify ULP for derived function.  The
+  derived functions are composed of base functions which are tested for ULP,
+  thus when this flag is enabled, Derived functions will not be tested for ULP,
+  as per table 7.1 of OpenCL 2.0 spec. Since there is no way of querying the
+  device whether it is a derived or non-derived implementation according to
+  OpenCL 2.0 spec then it has to be changed through a command line argument.
 */
-int             gFastRelaxedDerived = 1;
-int             gToggleCorrectlyRoundedDivideSqrt = 0;
-int             gDeviceILogb0 = 1;
-int             gDeviceILogbNaN = 1;
-int             gCheckTininessBeforeRounding = 1;
-int             gIsInRTZMode = 0;
-uint32_t        gMaxVectorSizeIndex = VECTOR_SIZE_COUNT;
-uint32_t        gMinVectorSizeIndex = 0;
-const char      *method[] = { "Best", "Average" };
-void            *gIn = NULL;
-void            *gIn2 = NULL;
-void            *gIn3 = NULL;
-void            *gOut_Ref = NULL;
-void            *gOut[VECTOR_SIZE_COUNT] = {NULL, NULL, NULL, NULL, NULL, NULL };
-void            *gOut_Ref2 = NULL;
-void            *gOut2[VECTOR_SIZE_COUNT] = {NULL, NULL, NULL, NULL, NULL, NULL };
-cl_mem          gInBuffer = NULL;
-cl_mem          gInBuffer2 = NULL;
-cl_mem          gInBuffer3 = NULL;
-cl_mem          gOutBuffer[VECTOR_SIZE_COUNT]= {NULL, NULL, NULL, NULL, NULL, NULL };
-cl_mem          gOutBuffer2[VECTOR_SIZE_COUNT]= {NULL, NULL, NULL, NULL, NULL, NULL };
-uint32_t        gComputeDevices = 0;
-uint32_t        gSimdSize = 1;
-uint32_t        gDeviceFrequency = 0;
-static MTdata   gMTdata;
+int gFastRelaxedDerived = 1;
+int gToggleCorrectlyRoundedDivideSqrt = 0;
+int gDeviceILogb0 = 1;
+int gDeviceILogbNaN = 1;
+int gCheckTininessBeforeRounding = 1;
+int gIsInRTZMode = 0;
+uint32_t gMaxVectorSizeIndex = VECTOR_SIZE_COUNT;
+uint32_t gMinVectorSizeIndex = 0;
+const char *method[] = { "Best", "Average" };
+void *gIn = NULL;
+void *gIn2 = NULL;
+void *gIn3 = NULL;
+void *gOut_Ref = NULL;
+void *gOut[VECTOR_SIZE_COUNT] = { NULL, NULL, NULL, NULL, NULL, NULL };
+void *gOut_Ref2 = NULL;
+void *gOut2[VECTOR_SIZE_COUNT] = { NULL, NULL, NULL, NULL, NULL, NULL };
+cl_mem gInBuffer = NULL;
+cl_mem gInBuffer2 = NULL;
+cl_mem gInBuffer3 = NULL;
+cl_mem gOutBuffer[VECTOR_SIZE_COUNT] = { NULL, NULL, NULL, NULL, NULL, NULL };
+cl_mem gOutBuffer2[VECTOR_SIZE_COUNT] = { NULL, NULL, NULL, NULL, NULL, NULL };
+uint32_t gComputeDevices = 0;
+uint32_t gSimdSize = 1;
+uint32_t gDeviceFrequency = 0;
+static MTdata gMTdata;
 cl_device_fp_config gFloatCapabilities = 0;
 cl_device_fp_config gDoubleCapabilities = 0;
-int             gWimpyReductionFactor = 32;
-int             gWimpyBufferSize = BUFFER_SIZE;
-int             gVerboseBruteForce = 0;
+int gWimpyReductionFactor = 32;
+int gWimpyBufferSize = BUFFER_SIZE;
+int gVerboseBruteForce = 0;
 
-static int ParseArgs( int argc, const char **argv );
-static void PrintUsage( void );
-static void PrintFunctions( void );
-test_status InitCL( cl_device_id device );
-static void ReleaseCL( void );
-static int InitILogbConstants( void );
-static int IsTininessDetectedBeforeRounding( void );
-static int IsInRTZMode( void );         //expensive. Please check gIsInRTZMode global instead.
+static int ParseArgs(int argc, const char **argv);
+static void PrintUsage(void);
+static void PrintFunctions(void);
+test_status InitCL(cl_device_id device);
+static void ReleaseCL(void);
+static int InitILogbConstants(void);
+static int IsTininessDetectedBeforeRounding(void);
+static int
+IsInRTZMode(void); // expensive. Please check gIsInRTZMode global instead.
 
 
-int doTest( const char* name )
+int doTest(const char *name)
 {
-    if( gSkipRestOfTests )
+    if (gSkipRestOfTests)
     {
-        vlog( "Skipping function because of an earlier error.\n" );
+        vlog("Skipping function because of an earlier error.\n");
         return 1;
     }
 
     int error = 0;
-    const Func* func_data = NULL;
+    const Func *func_data = NULL;
 
-    for( size_t i = 0; i < functionListCount; i++ )
+    for (size_t i = 0; i < functionListCount; i++)
     {
-        const Func* const temp_func = functionList + i;
-        if( strcmp( temp_func->name, name ) == 0 )
+        const Func *const temp_func = functionList + i;
+        if (strcmp(temp_func->name, name) == 0)
         {
-            if( i < gStartTestNumber || i > gEndTestNumber )
+            if (i < gStartTestNumber || i > gEndTestNumber)
             {
-                vlog( "Skipping function #%d\n", i );
+                vlog("Skipping function #%d\n", i);
                 return 0;
             }
 
@@ -145,32 +151,35 @@ int doTest( const char* name )
         }
     }
 
-    if( func_data == NULL )
+    if (func_data == NULL)
     {
-        vlog( "Function '%s' doesn't exist!\n", name );
-        exit( EXIT_FAILURE );
+        vlog("Function '%s' doesn't exist!\n", name);
+        exit(EXIT_FAILURE);
     }
 
-    if( func_data->func.p == NULL )
+    if (func_data->func.p == NULL)
     {
-        vlog( "'%s' is missing implementation, skipping function.\n", func_data->name );
+        vlog("'%s' is missing implementation, skipping function.\n",
+             func_data->name);
         return 0;
     }
 
     // if correctly rounded divide & sqrt are supported by the implementation
     // then test it; otherwise skip the test
-    if( strcmp( func_data->name, "sqrt_cr" ) == 0 || strcmp( func_data->name, "divide_cr" ) == 0 )
+    if (strcmp(func_data->name, "sqrt_cr") == 0
+        || strcmp(func_data->name, "divide_cr") == 0)
     {
-        if( ( gFloatCapabilities & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT ) == 0 )
+        if ((gFloatCapabilities & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT) == 0)
         {
-            vlog( "Correctly rounded divide and sqrt are not supported, skipping function.\n" );
+            vlog("Correctly rounded divide and sqrt are not supported, "
+                 "skipping function.\n");
             return 0;
         }
     }
 
     {
         extern int my_ilogb(double);
-        if( 0 == strcmp( "ilogb", func_data->name ) )
+        if (0 == strcmp("ilogb", func_data->name))
         {
             InitILogbConstants();
         }
@@ -201,17 +210,17 @@ int doTest( const char* name )
             }
         }
 
-        if( gTestFloat )
+        if (gTestFloat)
         {
             gTestCount++;
-            vlog( "%3d: ", gTestCount );
+            vlog("%3d: ", gTestCount);
             // Don't test with relaxed requirements.
             if (func_data->vtbl_ptr->TestFunc(func_data, gMTdata,
                                               false /* relaxed mode */))
             {
                 gFailCount++;
                 error++;
-                if( gStopOnError )
+                if (gStopOnError)
                 {
                     gSkipRestOfTests = true;
                     return error;
@@ -219,17 +228,18 @@ int doTest( const char* name )
             }
         }
 
-        if( gHasDouble && NULL != func_data->vtbl_ptr->DoubleTestFunc && NULL != func_data->dfunc.p )
+        if (gHasDouble && NULL != func_data->vtbl_ptr->DoubleTestFunc
+            && NULL != func_data->dfunc.p)
         {
             gTestCount++;
-            vlog( "%3d: ", gTestCount );
+            vlog("%3d: ", gTestCount);
             // Don't test with relaxed requirements.
             if (func_data->vtbl_ptr->DoubleTestFunc(func_data, gMTdata,
                                                     false /* relaxed mode*/))
             {
                 gFailCount++;
                 error++;
-                if( gStopOnError )
+                if (gStopOnError)
                 {
                     gSkipRestOfTests = true;
                     return error;
@@ -241,515 +251,549 @@ int doTest( const char* name )
     return error;
 }
 
-int test_acos( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_acos(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "acos" );
+    return doTest("acos");
 }
-int test_acosh( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_acosh(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "acosh" );
+    return doTest("acosh");
 }
-int test_acospi( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_acospi(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, int num_elements)
 {
-    return doTest( "acospi" );
+    return doTest("acospi");
 }
-int test_asin( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_asin(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "asin" );
+    return doTest("asin");
 }
-int test_asinh( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_asinh(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "asinh" );
+    return doTest("asinh");
 }
-int test_asinpi( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_asinpi(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, int num_elements)
 {
-    return doTest( "asinpi" );
+    return doTest("asinpi");
 }
-int test_atan( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_atan(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "atan" );
+    return doTest("atan");
 }
-int test_atanh( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_atanh(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "atanh" );
+    return doTest("atanh");
 }
-int test_atanpi( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_atanpi(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, int num_elements)
 {
-    return doTest( "atanpi" );
+    return doTest("atanpi");
 }
-int test_atan2( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_atan2(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "atan2" );
+    return doTest("atan2");
 }
-int test_atan2pi( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_atan2pi(cl_device_id deviceID, cl_context context,
+                 cl_command_queue queue, int num_elements)
 {
-    return doTest( "atan2pi" );
+    return doTest("atan2pi");
 }
-int test_cbrt( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_cbrt(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "cbrt" );
+    return doTest("cbrt");
 }
-int test_ceil( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_ceil(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "ceil" );
+    return doTest("ceil");
 }
-int test_copysign( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_copysign(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "copysign" );
+    return doTest("copysign");
 }
-int test_cos( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_cos(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "cos" );
+    return doTest("cos");
 }
-int test_cosh( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_cosh(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "cosh" );
+    return doTest("cosh");
 }
-int test_cospi( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_cospi(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "cospi" );
+    return doTest("cospi");
 }
-int test_exp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_exp(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "exp" );
+    return doTest("exp");
 }
-int test_exp2( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_exp2(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "exp2" );
+    return doTest("exp2");
 }
-int test_exp10( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_exp10(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "exp10" );
+    return doTest("exp10");
 }
-int test_expm1( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_expm1(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "expm1" );
+    return doTest("expm1");
 }
-int test_fabs( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_fabs(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "fabs" );
+    return doTest("fabs");
 }
-int test_fdim( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_fdim(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "fdim" );
+    return doTest("fdim");
 }
-int test_floor( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_floor(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "floor" );
+    return doTest("floor");
 }
-int test_fma( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_fma(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "fma" );
+    return doTest("fma");
 }
-int test_fmax( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_fmax(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "fmax" );
+    return doTest("fmax");
 }
-int test_fmin( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_fmin(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "fmin" );
+    return doTest("fmin");
 }
-int test_fmod( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_fmod(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "fmod" );
+    return doTest("fmod");
 }
-int test_fract( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_fract(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "fract" );
+    return doTest("fract");
 }
-int test_frexp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_frexp(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "frexp" );
+    return doTest("frexp");
 }
-int test_hypot( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_hypot(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "hypot" );
+    return doTest("hypot");
 }
-int test_ilogb( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_ilogb(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "ilogb" );
+    return doTest("ilogb");
 }
-int test_isequal( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isequal(cl_device_id deviceID, cl_context context,
+                 cl_command_queue queue, int num_elements)
 {
-    return doTest( "isequal" );
+    return doTest("isequal");
 }
-int test_isfinite( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isfinite(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "isfinite" );
+    return doTest("isfinite");
 }
-int test_isgreater( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isgreater(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements)
 {
-    return doTest( "isgreater" );
+    return doTest("isgreater");
 }
-int test_isgreaterequal( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isgreaterequal(cl_device_id deviceID, cl_context context,
+                        cl_command_queue queue, int num_elements)
 {
-    return doTest( "isgreaterequal" );
+    return doTest("isgreaterequal");
 }
-int test_isinf( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isinf(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "isinf" );
+    return doTest("isinf");
 }
-int test_isless( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isless(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, int num_elements)
 {
-    return doTest( "isless" );
+    return doTest("isless");
 }
-int test_islessequal( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_islessequal(cl_device_id deviceID, cl_context context,
+                     cl_command_queue queue, int num_elements)
 {
-    return doTest( "islessequal" );
+    return doTest("islessequal");
 }
-int test_islessgreater( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_islessgreater(cl_device_id deviceID, cl_context context,
+                       cl_command_queue queue, int num_elements)
 {
-    return doTest( "islessgreater" );
+    return doTest("islessgreater");
 }
-int test_isnan( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isnan(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "isnan" );
+    return doTest("isnan");
 }
-int test_isnormal( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isnormal(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "isnormal" );
+    return doTest("isnormal");
 }
-int test_isnotequal( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isnotequal(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements)
 {
-    return doTest( "isnotequal" );
+    return doTest("isnotequal");
 }
-int test_isordered( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isordered(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements)
 {
-    return doTest( "isordered" );
+    return doTest("isordered");
 }
-int test_isunordered( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_isunordered(cl_device_id deviceID, cl_context context,
+                     cl_command_queue queue, int num_elements)
 {
-    return doTest( "isunordered" );
+    return doTest("isunordered");
 }
-int test_ldexp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_ldexp(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "ldexp" );
+    return doTest("ldexp");
 }
-int test_lgamma( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_lgamma(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, int num_elements)
 {
-    return doTest( "lgamma" );
+    return doTest("lgamma");
 }
-int test_lgamma_r( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_lgamma_r(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "lgamma_r" );
+    return doTest("lgamma_r");
 }
-int test_log( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_log(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "log" );
+    return doTest("log");
 }
-int test_log2( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_log2(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "log2" );
+    return doTest("log2");
 }
-int test_log10( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_log10(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "log10" );
+    return doTest("log10");
 }
-int test_log1p( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_log1p(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "log1p" );
+    return doTest("log1p");
 }
-int test_logb( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_logb(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "logb" );
+    return doTest("logb");
 }
-int test_mad( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_mad(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "mad" );
+    return doTest("mad");
 }
-int test_maxmag( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_maxmag(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, int num_elements)
 {
-    return doTest( "maxmag" );
+    return doTest("maxmag");
 }
-int test_minmag( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_minmag(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, int num_elements)
 {
-    return doTest( "minmag" );
+    return doTest("minmag");
 }
-int test_modf( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_modf(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "modf" );
+    return doTest("modf");
 }
-int test_nan( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_nan(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "nan" );
+    return doTest("nan");
 }
-int test_nextafter( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_nextafter(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements)
 {
-    return doTest( "nextafter" );
+    return doTest("nextafter");
 }
-int test_pow( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_pow(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "pow" );
+    return doTest("pow");
 }
-int test_pown( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_pown(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "pown" );
+    return doTest("pown");
 }
-int test_powr( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_powr(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "powr" );
+    return doTest("powr");
 }
-int test_remainder( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_remainder(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements)
 {
-    return doTest( "remainder" );
+    return doTest("remainder");
 }
-int test_remquo( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_remquo(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, int num_elements)
 {
-    return doTest( "remquo" );
+    return doTest("remquo");
 }
-int test_rint( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_rint(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "rint" );
+    return doTest("rint");
 }
-int test_rootn( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_rootn(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "rootn" );
+    return doTest("rootn");
 }
-int test_round( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_round(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "round" );
+    return doTest("round");
 }
-int test_rsqrt( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_rsqrt(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "rsqrt" );
+    return doTest("rsqrt");
 }
-int test_signbit( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_signbit(cl_device_id deviceID, cl_context context,
+                 cl_command_queue queue, int num_elements)
 {
-    return doTest( "signbit" );
+    return doTest("signbit");
 }
-int test_sin( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_sin(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "sin" );
+    return doTest("sin");
 }
-int test_sincos( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_sincos(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, int num_elements)
 {
-    return doTest( "sincos" );
+    return doTest("sincos");
 }
-int test_sinh( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_sinh(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "sinh" );
+    return doTest("sinh");
 }
-int test_sinpi( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_sinpi(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "sinpi" );
+    return doTest("sinpi");
 }
-int test_sqrt( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_sqrt(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "sqrt" );
+    return doTest("sqrt");
 }
-int test_sqrt_cr( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_sqrt_cr(cl_device_id deviceID, cl_context context,
+                 cl_command_queue queue, int num_elements)
 {
-    return doTest( "sqrt_cr" );
+    return doTest("sqrt_cr");
 }
-int test_tan( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_tan(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "tan" );
+    return doTest("tan");
 }
-int test_tanh( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_tanh(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+              int num_elements)
 {
-    return doTest( "tanh" );
+    return doTest("tanh");
 }
-int test_tanpi( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_tanpi(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "tanpi" );
+    return doTest("tanpi");
 }
-int test_trunc( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_trunc(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
 {
-    return doTest( "trunc" );
+    return doTest("trunc");
 }
-int test_half_cos( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_cos(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_cos" );
+    return doTest("half_cos");
 }
-int test_half_divide( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_divide(cl_device_id deviceID, cl_context context,
+                     cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_divide" );
+    return doTest("half_divide");
 }
-int test_half_exp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_exp(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_exp" );
+    return doTest("half_exp");
 }
-int test_half_exp2( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_exp2(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_exp2" );
+    return doTest("half_exp2");
 }
-int test_half_exp10( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_exp10(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_exp10" );
+    return doTest("half_exp10");
 }
-int test_half_log( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_log(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_log" );
+    return doTest("half_log");
 }
-int test_half_log2( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_log2(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_log2" );
+    return doTest("half_log2");
 }
-int test_half_log10( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_log10(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_log10" );
+    return doTest("half_log10");
 }
-int test_half_powr( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_powr(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_powr" );
+    return doTest("half_powr");
 }
-int test_half_recip( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_recip(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_recip" );
+    return doTest("half_recip");
 }
-int test_half_rsqrt( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_rsqrt(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_rsqrt" );
+    return doTest("half_rsqrt");
 }
-int test_half_sin( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_sin(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_sin" );
+    return doTest("half_sin");
 }
-int test_half_sqrt( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_sqrt(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_sqrt" );
+    return doTest("half_sqrt");
 }
-int test_half_tan( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_half_tan(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "half_tan" );
+    return doTest("half_tan");
 }
-int test_add( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_add(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "add" );
+    return doTest("add");
 }
-int test_subtract( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_subtract(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "subtract" );
+    return doTest("subtract");
 }
-int test_divide( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_divide(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, int num_elements)
 {
-    return doTest( "divide" );
+    return doTest("divide");
 }
-int test_divide_cr( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_divide_cr(cl_device_id deviceID, cl_context context,
+                   cl_command_queue queue, int num_elements)
 {
-    return doTest( "divide_cr" );
+    return doTest("divide_cr");
 }
-int test_multiply( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_multiply(cl_device_id deviceID, cl_context context,
+                  cl_command_queue queue, int num_elements)
 {
-    return doTest( "multiply" );
+    return doTest("multiply");
 }
-int test_assignment( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_assignment(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, int num_elements)
 {
-    return doTest( "assignment" );
+    return doTest("assignment");
 }
-int test_not( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_not(cl_device_id deviceID, cl_context context, cl_command_queue queue,
+             int num_elements)
 {
-    return doTest( "not" );
+    return doTest("not");
 }
 
 test_definition test_list[] = {
-    ADD_TEST( acos ),
-    ADD_TEST( acosh ),
-    ADD_TEST( acospi ),
-    ADD_TEST( asin ),
-    ADD_TEST( asinh ),
-    ADD_TEST( asinpi ),
-    ADD_TEST( atan ),
-    ADD_TEST( atanh ),
-    ADD_TEST( atanpi ),
-    ADD_TEST( atan2 ),
-    ADD_TEST( atan2pi ),
-    ADD_TEST( cbrt ),
-    ADD_TEST( ceil ),
-    ADD_TEST( copysign ),
-    ADD_TEST( cos ),
-    ADD_TEST( cosh ),
-    ADD_TEST( cospi ),
-    ADD_TEST( exp ),
-    ADD_TEST( exp2 ),
-    ADD_TEST( exp10 ),
-    ADD_TEST( expm1 ),
-    ADD_TEST( fabs ),
-    ADD_TEST( fdim ),
-    ADD_TEST( floor ),
-    ADD_TEST( fma ),
-    ADD_TEST( fmax ),
-    ADD_TEST( fmin ),
-    ADD_TEST( fmod ),
-    ADD_TEST( fract ),
-    ADD_TEST( frexp ),
-    ADD_TEST( hypot ),
-    ADD_TEST( ilogb ),
-    ADD_TEST( isequal ),
-    ADD_TEST( isfinite ),
-    ADD_TEST( isgreater ),
-    ADD_TEST( isgreaterequal ),
-    ADD_TEST( isinf ),
-    ADD_TEST( isless ),
-    ADD_TEST( islessequal ),
-    ADD_TEST( islessgreater ),
-    ADD_TEST( isnan ),
-    ADD_TEST( isnormal ),
-    ADD_TEST( isnotequal ),
-    ADD_TEST( isordered ),
-    ADD_TEST( isunordered ),
-    ADD_TEST( ldexp ),
-    ADD_TEST( lgamma ),
-    ADD_TEST( lgamma_r ),
-    ADD_TEST( log ),
-    ADD_TEST( log2 ),
-    ADD_TEST( log10 ),
-    ADD_TEST( log1p ),
-    ADD_TEST( logb ),
-    ADD_TEST( mad ),
-    ADD_TEST( maxmag ),
-    ADD_TEST( minmag ),
-    ADD_TEST( modf ),
-    ADD_TEST( nan ),
-    ADD_TEST( nextafter ),
-    ADD_TEST( pow ),
-    ADD_TEST( pown ),
-    ADD_TEST( powr ),
-    ADD_TEST( remainder ),
-    ADD_TEST( remquo ),
-    ADD_TEST( rint ),
-    ADD_TEST( rootn ),
-    ADD_TEST( round ),
-    ADD_TEST( rsqrt ),
-    ADD_TEST( signbit ),
-    ADD_TEST( sin ),
-    ADD_TEST( sincos ),
-    ADD_TEST( sinh ),
-    ADD_TEST( sinpi ),
-    ADD_TEST( sqrt ),
-    ADD_TEST( sqrt_cr ),
-    ADD_TEST( tan ),
-    ADD_TEST( tanh ),
-    ADD_TEST( tanpi ),
-    ADD_TEST( trunc ),
-    ADD_TEST( half_cos ),
-    ADD_TEST( half_divide ),
-    ADD_TEST( half_exp ),
-    ADD_TEST( half_exp2 ),
-    ADD_TEST( half_exp10 ),
-    ADD_TEST( half_log ),
-    ADD_TEST( half_log2 ),
-    ADD_TEST( half_log10 ),
-    ADD_TEST( half_powr ),
-    ADD_TEST( half_recip ),
-    ADD_TEST( half_rsqrt ),
-    ADD_TEST( half_sin ),
-    ADD_TEST( half_sqrt ),
-    ADD_TEST( half_tan ),
-    ADD_TEST( add ),
-    ADD_TEST( subtract ),
-    ADD_TEST( divide ),
-    ADD_TEST( divide_cr ),
-    ADD_TEST( multiply ),
-    ADD_TEST( assignment ),
-    ADD_TEST( not ),
+    ADD_TEST(acos),          ADD_TEST(acosh),      ADD_TEST(acospi),
+    ADD_TEST(asin),          ADD_TEST(asinh),      ADD_TEST(asinpi),
+    ADD_TEST(atan),          ADD_TEST(atanh),      ADD_TEST(atanpi),
+    ADD_TEST(atan2),         ADD_TEST(atan2pi),    ADD_TEST(cbrt),
+    ADD_TEST(ceil),          ADD_TEST(copysign),   ADD_TEST(cos),
+    ADD_TEST(cosh),          ADD_TEST(cospi),      ADD_TEST(exp),
+    ADD_TEST(exp2),          ADD_TEST(exp10),      ADD_TEST(expm1),
+    ADD_TEST(fabs),          ADD_TEST(fdim),       ADD_TEST(floor),
+    ADD_TEST(fma),           ADD_TEST(fmax),       ADD_TEST(fmin),
+    ADD_TEST(fmod),          ADD_TEST(fract),      ADD_TEST(frexp),
+    ADD_TEST(hypot),         ADD_TEST(ilogb),      ADD_TEST(isequal),
+    ADD_TEST(isfinite),      ADD_TEST(isgreater),  ADD_TEST(isgreaterequal),
+    ADD_TEST(isinf),         ADD_TEST(isless),     ADD_TEST(islessequal),
+    ADD_TEST(islessgreater), ADD_TEST(isnan),      ADD_TEST(isnormal),
+    ADD_TEST(isnotequal),    ADD_TEST(isordered),  ADD_TEST(isunordered),
+    ADD_TEST(ldexp),         ADD_TEST(lgamma),     ADD_TEST(lgamma_r),
+    ADD_TEST(log),           ADD_TEST(log2),       ADD_TEST(log10),
+    ADD_TEST(log1p),         ADD_TEST(logb),       ADD_TEST(mad),
+    ADD_TEST(maxmag),        ADD_TEST(minmag),     ADD_TEST(modf),
+    ADD_TEST(nan),           ADD_TEST(nextafter),  ADD_TEST(pow),
+    ADD_TEST(pown),          ADD_TEST(powr),       ADD_TEST(remainder),
+    ADD_TEST(remquo),        ADD_TEST(rint),       ADD_TEST(rootn),
+    ADD_TEST(round),         ADD_TEST(rsqrt),      ADD_TEST(signbit),
+    ADD_TEST(sin),           ADD_TEST(sincos),     ADD_TEST(sinh),
+    ADD_TEST(sinpi),         ADD_TEST(sqrt),       ADD_TEST(sqrt_cr),
+    ADD_TEST(tan),           ADD_TEST(tanh),       ADD_TEST(tanpi),
+    ADD_TEST(trunc),         ADD_TEST(half_cos),   ADD_TEST(half_divide),
+    ADD_TEST(half_exp),      ADD_TEST(half_exp2),  ADD_TEST(half_exp10),
+    ADD_TEST(half_log),      ADD_TEST(half_log2),  ADD_TEST(half_log10),
+    ADD_TEST(half_powr),     ADD_TEST(half_recip), ADD_TEST(half_rsqrt),
+    ADD_TEST(half_sin),      ADD_TEST(half_sqrt),  ADD_TEST(half_tan),
+    ADD_TEST(add),           ADD_TEST(subtract),   ADD_TEST(divide),
+    ADD_TEST(divide_cr),     ADD_TEST(multiply),   ADD_TEST(assignment),
+    ADD_TEST(not),
 };
 
-const int test_num = ARRAY_SIZE( test_list );
+const int test_num = ARRAY_SIZE(test_list);
 
 #pragma mark -
 
-int main (int argc, const char * argv[])
+int main(int argc, const char *argv[])
 {
     int error;
 
@@ -759,60 +803,59 @@ int main (int argc, const char * argv[])
         return -1;
     }
 
-#if defined( __APPLE__ )
+#if defined(__APPLE__)
     struct timeval startTime;
-    gettimeofday( &startTime, NULL );
+    gettimeofday(&startTime, NULL);
 #endif
 
-    error = ParseArgs( argc, argv );
-    if( error )
-        return error;
+    error = ParseArgs(argc, argv);
+    if (error) return error;
 
     // This takes a while, so prevent the machine from going to sleep.
     PreventSleep();
-    atexit( ResumeSleep );
+    atexit(ResumeSleep);
 
-    if( gSkipCorrectnessTesting )
-        vlog( "*** Skipping correctness testing! ***\n\n" );
-    else if( gStopOnError )
-        vlog( "Stopping at first error.\n" );
+    if (gSkipCorrectnessTesting)
+        vlog("*** Skipping correctness testing! ***\n\n");
+    else if (gStopOnError)
+        vlog("Stopping at first error.\n");
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        vlog( "%s times are reported at right (cycles per element):\n", method[gReportAverageTimes] );
-        vlog( "\n" );
-        if( gSkipCorrectnessTesting )
-            vlog( "   \t               ");
+        vlog("%s times are reported at right (cycles per element):\n",
+             method[gReportAverageTimes]);
+        vlog("\n");
+        if (gSkipCorrectnessTesting)
+            vlog("   \t               ");
         else
-            vlog( "   \t                                        ");
-        if( gWimpyMode )
-            vlog( "   " );
-        for( int i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-            vlog( "\t  float%s", sizeNames[i] );
+            vlog("   \t                                        ");
+        if (gWimpyMode) vlog("   ");
+        for (int i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+            vlog("\t  float%s", sizeNames[i]);
     }
     else
     {
-        vlog( "   \t                                        ");
-        if( gWimpyMode )
-            vlog( "   " );
+        vlog("   \t                                        ");
+        if (gWimpyMode) vlog("   ");
     }
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t  max_ulps" );
+    if (!gSkipCorrectnessTesting) vlog("\t  max_ulps");
 
-    vlog( "\n-----------------------------------------------------------------------------------------------------------\n" );
+    vlog("\n-------------------------------------------------------------------"
+         "----------------------------------------\n");
 
-    gMTdata = init_genrand( gRandomSeed );
-    if( gEndTestNumber == 0 )
+    gMTdata = init_genrand(gRandomSeed);
+    if (gEndTestNumber == 0)
     {
         gEndTestNumber = functionListCount;
     }
 
     FPU_mode_type oldMode;
-    DisableFTZ( &oldMode );
+    DisableFTZ(&oldMode);
 
-    int ret = runTestHarnessWithCheck( gTestNameCount, gTestNames, test_num, test_list, true, 0, InitCL );
+    int ret = runTestHarnessWithCheck(gTestNameCount, gTestNames, test_num,
+                                      test_list, true, 0, InitCL);
 
-    RestoreFPState( &oldMode );
+    RestoreFPState(&oldMode);
 
     free_mtdata(gMTdata);
     free(gTestNames);
@@ -825,24 +868,24 @@ int main (int argc, const char * argv[])
 
     ReleaseCL();
 
-#if defined( __APPLE__ )
+#if defined(__APPLE__)
     struct timeval endTime;
-    gettimeofday( &endTime, NULL );
-    double time = (double) endTime.tv_sec - (double) startTime.tv_sec;
-    time += 1e-6 * ((double) endTime.tv_usec - (double) startTime.tv_usec);
-    vlog( "time: %f s\n", time );
+    gettimeofday(&endTime, NULL);
+    double time = (double)endTime.tv_sec - (double)startTime.tv_sec;
+    time += 1e-6 * ((double)endTime.tv_usec - (double)startTime.tv_usec);
+    vlog("time: %f s\n", time);
 #endif
 
     return ret;
 }
 
-static int ParseArgs( int argc, const char **argv )
+static int ParseArgs(int argc, const char **argv)
 {
     int i;
-    gTestNames = (const char**) calloc( argc - 1, sizeof( char*) );
-    if( NULL == gTestNames )
+    gTestNames = (const char **)calloc(argc - 1, sizeof(char *));
+    if (NULL == gTestNames)
     {
-        vlog( "Failed to allocate memory for gTestNames array.\n" );
+        vlog("Failed to allocate memory for gTestNames array.\n");
         return 1;
     }
     gTestNames[0] = argv[0];
@@ -850,91 +893,64 @@ static int ParseArgs( int argc, const char **argv )
     int singleThreaded = 0;
 
     { // Extract the app name
-        strncpy( appName, argv[0], MAXPATHLEN );
+        strncpy(appName, argv[0], MAXPATHLEN);
 
-#if defined( __APPLE__ )
+#if defined(__APPLE__)
         char baseName[MAXPATHLEN];
         char *base = NULL;
-        strncpy( baseName, argv[0], MAXPATHLEN );
-        base = basename( baseName );
-        if( NULL != base )
+        strncpy(baseName, argv[0], MAXPATHLEN);
+        base = basename(baseName);
+        if (NULL != base)
         {
-            strncpy( appName, base, sizeof( appName )  );
-            appName[ sizeof( appName ) -1 ] = '\0';
+            strncpy(appName, base, sizeof(appName));
+            appName[sizeof(appName) - 1] = '\0';
         }
 #endif
     }
 
-    vlog( "\n%s\t", appName );
-    for( i = 1; i < argc; i++ )
+    vlog("\n%s\t", appName);
+    for (i = 1; i < argc; i++)
     {
         const char *arg = argv[i];
-        if( NULL == arg )
-            break;
+        if (NULL == arg) break;
 
-        vlog( "\t%s", arg );
+        vlog("\t%s", arg);
         int optionFound = 0;
-        if( arg[0] == '-' )
+        if (arg[0] == '-')
         {
-            while( arg[1] != '\0' )
+            while (arg[1] != '\0')
             {
                 arg++;
                 optionFound = 1;
-                switch( *arg )
+                switch (*arg)
                 {
-                    case 'a':
-                        gReportAverageTimes ^= 1;
-                        break;
+                    case 'a': gReportAverageTimes ^= 1; break;
 
-                    case 'c':
-                        gToggleCorrectlyRoundedDivideSqrt ^= 1;
-                        break;
+                    case 'c': gToggleCorrectlyRoundedDivideSqrt ^= 1; break;
 
-                    case 'd':
-                        gHasDouble ^= 1;
-                        break;
+                    case 'd': gHasDouble ^= 1; break;
 
-                    case 'e':
-                        gFastRelaxedDerived ^= 1;
-                        break;
+                    case 'e': gFastRelaxedDerived ^= 1; break;
 
-                    case 'f':
-                        gTestFloat ^= 1;
-                        break;
+                    case 'f': gTestFloat ^= 1; break;
 
-                    case 'h':
-                        PrintUsage();
-                        return -1;
+                    case 'h': PrintUsage(); return -1;
 
-                    case 'p':
-                      PrintFunctions();
-                      return -1;
+                    case 'p': PrintFunctions(); return -1;
 
-                    case 'l':
-                        gSkipCorrectnessTesting ^= 1;
-                        break;
+                    case 'l': gSkipCorrectnessTesting ^= 1; break;
 
-                    case 'm':
-                        singleThreaded ^= 1;
-                        break;
+                    case 'm': singleThreaded ^= 1; break;
 
-                    case 'r':
-                        gTestFastRelaxed ^= 1;
-                        break;
+                    case 'r': gTestFastRelaxed ^= 1; break;
 
-                    case 's':
-                        gStopOnError ^= 1;
-                        break;
+                    case 's': gStopOnError ^= 1; break;
 
-                    case 't':
-                        gMeasureTimes ^= 1;
-                        break;
+                    case 't': gMeasureTimes ^= 1; break;
 
-                    case 'v':
-                        gVerboseBruteForce ^= 1;
-                        break;
+                    case 'v': gVerboseBruteForce ^= 1; break;
 
-                    case 'w':   // wimpy mode
+                    case 'w': // wimpy mode
                         gWimpyMode ^= 1;
                         break;
 
@@ -942,12 +958,10 @@ static int ParseArgs( int argc, const char **argv )
                         parseWimpyReductionFactor(arg, gWimpyReductionFactor);
                         break;
 
-                    case 'z':
-                        gForceFTZ ^= 1;
-                        break;
+                    case 'z': gForceFTZ ^= 1; break;
 
                     case '1':
-                        if( arg[1] == '6' )
+                        if (arg[1] == '6')
                         {
                             gMinVectorSizeIndex = 5;
                             gMaxVectorSizeIndex = gMinVectorSizeIndex + 1;
@@ -960,52 +974,52 @@ static int ParseArgs( int argc, const char **argv )
                         }
                         break;
                     case '2':
-                            gMinVectorSizeIndex = 1;
-                            gMaxVectorSizeIndex = gMinVectorSizeIndex + 1;
-                            break;
+                        gMinVectorSizeIndex = 1;
+                        gMaxVectorSizeIndex = gMinVectorSizeIndex + 1;
+                        break;
                     case '3':
-                            gMinVectorSizeIndex = 2;
-                            gMaxVectorSizeIndex = gMinVectorSizeIndex + 1;
-                            break;
+                        gMinVectorSizeIndex = 2;
+                        gMaxVectorSizeIndex = gMinVectorSizeIndex + 1;
+                        break;
                     case '4':
-                            gMinVectorSizeIndex = 3;
-                            gMaxVectorSizeIndex = gMinVectorSizeIndex + 1;
-                            break;
+                        gMinVectorSizeIndex = 3;
+                        gMaxVectorSizeIndex = gMinVectorSizeIndex + 1;
+                        break;
                     case '8':
-                            gMinVectorSizeIndex = 4;
-                            gMaxVectorSizeIndex = gMinVectorSizeIndex + 1;
-                            break;
+                        gMinVectorSizeIndex = 4;
+                        gMaxVectorSizeIndex = gMinVectorSizeIndex + 1;
+                        break;
                         break;
 
                     default:
-                        vlog( " <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg );
+                        vlog(" <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg);
                         PrintUsage();
                         return -1;
                 }
             }
         }
 
-        if( ! optionFound )
+        if (!optionFound)
         {
             char *t = NULL;
-            long number = strtol( arg, &t, 0 );
-            if( t != arg )
+            long number = strtol(arg, &t, 0);
+            if (t != arg)
             {
-                if( 0 == gStartTestNumber )
-                    gStartTestNumber = (int32_t) number;
+                if (0 == gStartTestNumber)
+                    gStartTestNumber = (int32_t)number;
                 else
-                    gEndTestNumber = gStartTestNumber + (int32_t) number;
+                    gEndTestNumber = gStartTestNumber + (int32_t)number;
             }
             else
             {
                 // Make sure this is a valid name
                 unsigned int k;
-                for (k=0; k<functionListCount; k++)
+                for (k = 0; k < functionListCount; k++)
                 {
-                    const Func *f = functionList+k;
+                    const Func *f = functionList + k;
                     if (strcmp(arg, f->name) == 0)
                     {
-                        gTestNames[ gTestNameCount ] = arg;
+                        gTestNames[gTestNameCount] = arg;
                         gTestNameCount++;
                         break;
                     }
@@ -1021,118 +1035,141 @@ static int ParseArgs( int argc, const char **argv )
     }
 
     // Check for the wimpy mode environment variable
-    if (getenv("CL_WIMPY_MODE")) {
-      vlog( "\n" );
-      vlog( "*** Detected CL_WIMPY_MODE env                          ***\n" );
-      gWimpyMode = 1;
+    if (getenv("CL_WIMPY_MODE"))
+    {
+        vlog("\n");
+        vlog("*** Detected CL_WIMPY_MODE env                          ***\n");
+        gWimpyMode = 1;
     }
 
-    vlog( "\nTest binary built %s %s\n", __DATE__, __TIME__ );
+    vlog("\nTest binary built %s %s\n", __DATE__, __TIME__);
 
     PrintArch();
 
-    if( gWimpyMode )
+    if (gWimpyMode)
     {
-        vlog( "\n" );
-        vlog( "*** WARNING: Testing in Wimpy mode!                     ***\n" );
-        vlog( "*** Wimpy mode is not sufficient to verify correctness. ***\n" );
-        vlog( "*** Wimpy Reduction Factor: %-27u ***\n\n", gWimpyReductionFactor );
+        vlog("\n");
+        vlog("*** WARNING: Testing in Wimpy mode!                     ***\n");
+        vlog("*** Wimpy mode is not sufficient to verify correctness. ***\n");
+        vlog("*** Wimpy Reduction Factor: %-27u ***\n\n",
+             gWimpyReductionFactor);
     }
 
-    if( singleThreaded )
-        SetThreadCount(1);
+    if (singleThreaded) SetThreadCount(1);
 
     return 0;
 }
 
 
-static void PrintFunctions ( void )
+static void PrintFunctions(void)
 {
-  vlog( "\nMath function names:\n" );
-  for( int i = 0; i < functionListCount; i++ )
-  {
-    vlog( "\t%s\n", functionList[ i ].name );
-  }
+    vlog("\nMath function names:\n");
+    for (int i = 0; i < functionListCount; i++)
+    {
+        vlog("\t%s\n", functionList[i].name);
+    }
 }
 
-static void PrintUsage( void )
+static void PrintUsage(void)
 {
-    vlog( "%s [-acglstz]: <optional: math function names>\n", appName );
-    vlog( "\toptions:\n" );
-    vlog( "\t\t-a\tReport average times instead of best times\n" );
-    vlog( "\t\t-c\tToggle test fp correctly rounded divide and sqrt (Default: off)\n");
-    vlog( "\t\t-d\tToggle double precision testing. (Default: on iff khr_fp_64 on)\n" );
-    vlog( "\t\t-f\tToggle float precision testing. (Default: on)\n" );
-    vlog( "\t\t-r\tToggle fast relaxed math precision testing. (Default: on)\n" );
-    vlog( "\t\t-e\tToggle test as derived implementations for fast relaxed math precision. (Default: on)\n" );
-    vlog( "\t\t-h\tPrint this message and quit\n" );
-    vlog( "\t\t-p\tPrint all math function names and quit\n" );
-    vlog( "\t\t-l\tlink check only (make sure functions are present, skip accuracy checks.)\n" );
-    vlog( "\t\t-m\tToggle run multi-threaded. (Default: on) )\n" );
-    vlog( "\t\t-s\tStop on error\n" );
-    vlog( "\t\t-t\tToggle timing  (on by default)\n" );
-    vlog( "\t\t-w\tToggle Wimpy Mode, * Not a valid test * \n");
-    vlog( "\t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is 1-10, default factor(%u)\n",gWimpyReductionFactor );
-    vlog( "\t\t-z\tToggle FTZ mode (Section 6.5.3) for all functions. (Set by device capabilities by default.)\n" );
-    vlog( "\t\t-v\tToggle Verbosity (Default: off)\n ");
-    vlog( "\t\t-#\tTest only vector sizes #, e.g. \"-1\" tests scalar only, \"-16\" tests 16-wide vectors only.\n" );
-    vlog( "\n\tYou may also pass a number instead of a function name.\n" );
-    vlog( "\tThis causes the first N tests to be skipped. The tests are numbered.\n" );
-    vlog( "\tIf you pass a second number, that is the number tests to run after the first one.\n" );
-    vlog( "\tA name list may be used in conjunction with a number range. In that case,\n" );
-    vlog( "\tonly the named cases in the number range will run.\n" );
-    vlog( "\tYou may also choose to pass no arguments, in which case all tests will be run.\n" );
-    vlog( "\tYou may pass CL_DEVICE_TYPE_CPU/GPU/ACCELERATOR to select the device.\n" );
-    vlog( "\n" );
+    vlog("%s [-acglstz]: <optional: math function names>\n", appName);
+    vlog("\toptions:\n");
+    vlog("\t\t-a\tReport average times instead of best times\n");
+    vlog("\t\t-c\tToggle test fp correctly rounded divide and sqrt (Default: "
+         "off)\n");
+    vlog("\t\t-d\tToggle double precision testing. (Default: on iff khr_fp_64 "
+         "on)\n");
+    vlog("\t\t-f\tToggle float precision testing. (Default: on)\n");
+    vlog("\t\t-r\tToggle fast relaxed math precision testing. (Default: on)\n");
+    vlog("\t\t-e\tToggle test as derived implementations for fast relaxed math "
+         "precision. (Default: on)\n");
+    vlog("\t\t-h\tPrint this message and quit\n");
+    vlog("\t\t-p\tPrint all math function names and quit\n");
+    vlog("\t\t-l\tlink check only (make sure functions are present, skip "
+         "accuracy checks.)\n");
+    vlog("\t\t-m\tToggle run multi-threaded. (Default: on) )\n");
+    vlog("\t\t-s\tStop on error\n");
+    vlog("\t\t-t\tToggle timing  (on by default)\n");
+    vlog("\t\t-w\tToggle Wimpy Mode, * Not a valid test * \n");
+    vlog("\t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is "
+         "1-10, default factor(%u)\n",
+         gWimpyReductionFactor);
+    vlog("\t\t-z\tToggle FTZ mode (Section 6.5.3) for all functions. (Set by "
+         "device capabilities by default.)\n");
+    vlog("\t\t-v\tToggle Verbosity (Default: off)\n ");
+    vlog("\t\t-#\tTest only vector sizes #, e.g. \"-1\" tests scalar only, "
+         "\"-16\" tests 16-wide vectors only.\n");
+    vlog("\n\tYou may also pass a number instead of a function name.\n");
+    vlog("\tThis causes the first N tests to be skipped. The tests are "
+         "numbered.\n");
+    vlog("\tIf you pass a second number, that is the number tests to run after "
+         "the first one.\n");
+    vlog("\tA name list may be used in conjunction with a number range. In "
+         "that case,\n");
+    vlog("\tonly the named cases in the number range will run.\n");
+    vlog("\tYou may also choose to pass no arguments, in which case all tests "
+         "will be run.\n");
+    vlog("\tYou may pass CL_DEVICE_TYPE_CPU/GPU/ACCELERATOR to select the "
+         "device.\n");
+    vlog("\n");
 }
 
-static void CL_CALLBACK bruteforce_notify_callback(const char *errinfo, const void *private_info, size_t cb, void *user_data)
+static void CL_CALLBACK bruteforce_notify_callback(const char *errinfo,
+                                                   const void *private_info,
+                                                   size_t cb, void *user_data)
 {
-    vlog( "%s  (%p, %zd, %p)\n", errinfo, private_info, cb, user_data );
+    vlog("%s  (%p, %zd, %p)\n", errinfo, private_info, cb, user_data);
 }
 
-test_status InitCL( cl_device_id device )
+test_status InitCL(cl_device_id device)
 {
     int error;
     uint32_t i;
-    size_t configSize = sizeof( gComputeDevices );
+    size_t configSize = sizeof(gComputeDevices);
     cl_device_type device_type;
 
-    error = clGetDeviceInfo( device, CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL );
-    if( error )
+    error = clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(device_type),
+                            &device_type, NULL);
+    if (error)
     {
-        print_error( error, "Unable to get device type" );
+        print_error(error, "Unable to get device type");
         return TEST_FAIL;
     }
 
     gDevice = device;
-    if( (error = clGetDeviceInfo( gDevice, CL_DEVICE_MAX_COMPUTE_UNITS, configSize, &gComputeDevices, NULL )) )
+    if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_MAX_COMPUTE_UNITS,
+                                 configSize, &gComputeDevices, NULL)))
         gComputeDevices = 1;
 
     // Check extensions
-    if(is_extension_available(gDevice, "cl_khr_fp64"))
+    if (is_extension_available(gDevice, "cl_khr_fp64"))
     {
         gHasDouble ^= 1;
-#if defined( CL_DEVICE_DOUBLE_FP_CONFIG )
-        if( (error = clGetDeviceInfo(gDevice, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(gDoubleCapabilities), &gDoubleCapabilities, NULL)))
+#if defined(CL_DEVICE_DOUBLE_FP_CONFIG)
+        if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_DOUBLE_FP_CONFIG,
+                                     sizeof(gDoubleCapabilities),
+                                     &gDoubleCapabilities, NULL)))
         {
-            vlog_error( "ERROR: Unable to get device CL_DEVICE_DOUBLE_FP_CONFIG. (%d)\n", error );
+            vlog_error("ERROR: Unable to get device "
+                       "CL_DEVICE_DOUBLE_FP_CONFIG. (%d)\n",
+                       error);
             return TEST_FAIL;
         }
 
-        if( DOUBLE_REQUIRED_FEATURES != (gDoubleCapabilities & DOUBLE_REQUIRED_FEATURES) )
+        if (DOUBLE_REQUIRED_FEATURES
+            != (gDoubleCapabilities & DOUBLE_REQUIRED_FEATURES))
         {
             std::string list;
             if (0 == (gDoubleCapabilities & CL_FP_FMA)) list += "CL_FP_FMA, ";
-            if( 0 == (gDoubleCapabilities & CL_FP_ROUND_TO_NEAREST) )
+            if (0 == (gDoubleCapabilities & CL_FP_ROUND_TO_NEAREST))
                 list += "CL_FP_ROUND_TO_NEAREST, ";
-            if( 0 == (gDoubleCapabilities & CL_FP_ROUND_TO_ZERO) )
+            if (0 == (gDoubleCapabilities & CL_FP_ROUND_TO_ZERO))
                 list += "CL_FP_ROUND_TO_ZERO, ";
-            if( 0 == (gDoubleCapabilities & CL_FP_ROUND_TO_INF) )
+            if (0 == (gDoubleCapabilities & CL_FP_ROUND_TO_INF))
                 list += "CL_FP_ROUND_TO_INF, ";
-            if( 0 == (gDoubleCapabilities & CL_FP_INF_NAN) )
+            if (0 == (gDoubleCapabilities & CL_FP_INF_NAN))
                 list += "CL_FP_INF_NAN, ";
-            if( 0 == (gDoubleCapabilities & CL_FP_DENORM) )
+            if (0 == (gDoubleCapabilities & CL_FP_DENORM))
                 list += "CL_FP_DENORM, ";
             vlog_error("ERROR: required double features are missing: %s\n",
                        list.c_str());
@@ -1140,100 +1177,104 @@ test_status InitCL( cl_device_id device )
             return TEST_FAIL;
         }
 #else
-        vlog_error( "FAIL: device says it supports cl_khr_fp64 but CL_DEVICE_DOUBLE_FP_CONFIG is not in the headers!\n" );
+        vlog_error("FAIL: device says it supports cl_khr_fp64 but "
+                   "CL_DEVICE_DOUBLE_FP_CONFIG is not in the headers!\n");
         return TEST_FAIL;
 #endif
     }
 
-    configSize = sizeof( gDeviceFrequency );
-    if( (error = clGetDeviceInfo( gDevice, CL_DEVICE_MAX_CLOCK_FREQUENCY, configSize, &gDeviceFrequency, NULL )) )
+    configSize = sizeof(gDeviceFrequency);
+    if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_MAX_CLOCK_FREQUENCY,
+                                 configSize, &gDeviceFrequency, NULL)))
         gDeviceFrequency = 0;
 
-    if( (error = clGetDeviceInfo(gDevice, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(gFloatCapabilities), &gFloatCapabilities, NULL)))
+    if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_SINGLE_FP_CONFIG,
+                                 sizeof(gFloatCapabilities),
+                                 &gFloatCapabilities, NULL)))
     {
-        vlog_error( "ERROR: Unable to get device CL_DEVICE_SINGLE_FP_CONFIG. (%d)\n", error );
+        vlog_error(
+            "ERROR: Unable to get device CL_DEVICE_SINGLE_FP_CONFIG. (%d)\n",
+            error);
         return TEST_FAIL;
     }
 
-    gContext = clCreateContext( NULL, 1, &gDevice, bruteforce_notify_callback, NULL, &error );
-    if( NULL == gContext || error )
+    gContext = clCreateContext(NULL, 1, &gDevice, bruteforce_notify_callback,
+                               NULL, &error);
+    if (NULL == gContext || error)
     {
-        vlog_error( "clCreateContext failed. (%d) \n", error );
+        vlog_error("clCreateContext failed. (%d) \n", error);
         return TEST_FAIL;
     }
 
     gQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-    if( NULL == gQueue || error )
+    if (NULL == gQueue || error)
     {
-        vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+        vlog_error("clCreateCommandQueue failed. (%d)\n", error);
         return TEST_FAIL;
     }
 
-#if defined( __APPLE__ )
+#if defined(__APPLE__)
     // FIXME: use clProtectedArray
 #endif
-    //Allocate buffers
+    // Allocate buffers
     cl_uint min_alignment = 0;
-    error = clGetDeviceInfo (gDevice, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), (void*)&min_alignment, NULL);
+    error = clGetDeviceInfo(gDevice, CL_DEVICE_MEM_BASE_ADDR_ALIGN,
+                            sizeof(cl_uint), (void *)&min_alignment, NULL);
     if (CL_SUCCESS != error)
     {
-        vlog_error( "clGetDeviceInfo failed. (%d)\n", error );
+        vlog_error("clGetDeviceInfo failed. (%d)\n", error);
         return TEST_FAIL;
     }
-    min_alignment >>= 3;    // convert bits to bytes
+    min_alignment >>= 3; // convert bits to bytes
 
-    gIn   = align_malloc( BUFFER_SIZE, min_alignment );
-    if( NULL == gIn )
-        return TEST_FAIL;
-    gIn2   = align_malloc( BUFFER_SIZE, min_alignment );
-    if( NULL == gIn2 )
-        return TEST_FAIL;
-    gIn3   = align_malloc( BUFFER_SIZE, min_alignment );
-    if( NULL == gIn3 )
-        return TEST_FAIL;
-    gOut_Ref   = align_malloc( BUFFER_SIZE, min_alignment );
-    if( NULL == gOut_Ref )
-        return TEST_FAIL;
-    gOut_Ref2   = align_malloc( BUFFER_SIZE, min_alignment );
-    if( NULL == gOut_Ref2 )
-        return TEST_FAIL;
+    gIn = align_malloc(BUFFER_SIZE, min_alignment);
+    if (NULL == gIn) return TEST_FAIL;
+    gIn2 = align_malloc(BUFFER_SIZE, min_alignment);
+    if (NULL == gIn2) return TEST_FAIL;
+    gIn3 = align_malloc(BUFFER_SIZE, min_alignment);
+    if (NULL == gIn3) return TEST_FAIL;
+    gOut_Ref = align_malloc(BUFFER_SIZE, min_alignment);
+    if (NULL == gOut_Ref) return TEST_FAIL;
+    gOut_Ref2 = align_malloc(BUFFER_SIZE, min_alignment);
+    if (NULL == gOut_Ref2) return TEST_FAIL;
 
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        gOut[i] = align_malloc( BUFFER_SIZE, min_alignment );
-        if( NULL == gOut[i] )
-            return TEST_FAIL;
-        gOut2[i] = align_malloc( BUFFER_SIZE, min_alignment );
-        if( NULL == gOut2[i] )
-            return TEST_FAIL;
+        gOut[i] = align_malloc(BUFFER_SIZE, min_alignment);
+        if (NULL == gOut[i]) return TEST_FAIL;
+        gOut2[i] = align_malloc(BUFFER_SIZE, min_alignment);
+        if (NULL == gOut2[i]) return TEST_FAIL;
     }
 
     cl_mem_flags device_flags = CL_MEM_READ_ONLY;
     // save a copy on the host device to make this go faster
-    if( CL_DEVICE_TYPE_CPU == device_type )
+    if (CL_DEVICE_TYPE_CPU == device_type)
         device_flags |= CL_MEM_USE_HOST_PTR;
-      else
-          device_flags |= CL_MEM_COPY_HOST_PTR;
+    else
+        device_flags |= CL_MEM_COPY_HOST_PTR;
 
     // setup input buffers
-    gInBuffer = clCreateBuffer(gContext, device_flags, BUFFER_SIZE, gIn, &error);
-    if( gInBuffer == NULL || error )
+    gInBuffer =
+        clCreateBuffer(gContext, device_flags, BUFFER_SIZE, gIn, &error);
+    if (gInBuffer == NULL || error)
     {
-        vlog_error( "clCreateBuffer1 failed for input (%d)\n", error );
+        vlog_error("clCreateBuffer1 failed for input (%d)\n", error);
         return TEST_FAIL;
     }
 
-    gInBuffer2 = clCreateBuffer( gContext, device_flags, BUFFER_SIZE, gIn2, &error );
-    if( gInBuffer2 == NULL || error )
+    gInBuffer2 =
+        clCreateBuffer(gContext, device_flags, BUFFER_SIZE, gIn2, &error);
+    if (gInBuffer2 == NULL || error)
     {
-        vlog_error( "clCreateArray2 failed for input (%d)\n" , error );
+        vlog_error("clCreateArray2 failed for input (%d)\n", error);
         return TEST_FAIL;
     }
 
-    gInBuffer3 = clCreateBuffer( gContext, device_flags, BUFFER_SIZE, gIn3, &error );
-    if( gInBuffer3 == NULL  || error)
+    gInBuffer3 =
+        clCreateBuffer(gContext, device_flags, BUFFER_SIZE, gIn3, &error);
+    if (gInBuffer3 == NULL || error)
     {
-        vlog_error( "clCreateArray3 failed for input (%d)\n", error );
+        vlog_error("clCreateArray3 failed for input (%d)\n", error);
         return TEST_FAIL;
     }
 
@@ -1241,38 +1282,40 @@ test_status InitCL( cl_device_id device )
     // setup output buffers
     device_flags = CL_MEM_READ_WRITE;
     // save a copy on the host device to make this go faster
-    if( CL_DEVICE_TYPE_CPU == device_type )
+    if (CL_DEVICE_TYPE_CPU == device_type)
         device_flags |= CL_MEM_USE_HOST_PTR;
-      else
-          device_flags |= CL_MEM_COPY_HOST_PTR;
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    else
+        device_flags |= CL_MEM_COPY_HOST_PTR;
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        gOutBuffer[i] = clCreateBuffer( gContext, device_flags, BUFFER_SIZE, gOut[i], &error );
-        if( gOutBuffer[i] == NULL || error )
+        gOutBuffer[i] = clCreateBuffer(gContext, device_flags, BUFFER_SIZE,
+                                       gOut[i], &error);
+        if (gOutBuffer[i] == NULL || error)
         {
-            vlog_error( "clCreateArray failed for output (%d)\n", error  );
+            vlog_error("clCreateArray failed for output (%d)\n", error);
             return TEST_FAIL;
         }
-        gOutBuffer2[i] = clCreateBuffer( gContext, device_flags, BUFFER_SIZE, gOut2[i], &error );
-        if( gOutBuffer2[i] == NULL || error)
+        gOutBuffer2[i] = clCreateBuffer(gContext, device_flags, BUFFER_SIZE,
+                                        gOut2[i], &error);
+        if (gOutBuffer2[i] == NULL || error)
         {
-            vlog_error( "clCreateArray2 failed for output (%d)\n", error );
+            vlog_error("clCreateArray2 failed for output (%d)\n", error);
             return TEST_FAIL;
         }
     }
 
     // we are embedded, check current rounding mode
-    if( gIsEmbedded )
+    if (gIsEmbedded)
     {
         gIsInRTZMode = IsInRTZMode();
     }
 
-    //Check tininess detection
+    // Check tininess detection
     IsTininessDetectedBeforeRounding();
 
     cl_platform_id platform;
     int err = clGetPlatformIDs(1, &platform, NULL);
-    if( err )
+    if (err)
     {
         print_error(err, "clGetPlatformIDs failed");
         return TEST_FAIL;
@@ -1280,78 +1323,97 @@ test_status InitCL( cl_device_id device )
 
     char c[1024];
     static const char *no_yes[] = { "NO", "YES" };
-    vlog( "\nCompute Device info:\n" );
+    vlog("\nCompute Device info:\n");
     clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(c), &c, NULL);
-    vlog( "\tPlatform Version: %s\n", c );
+    vlog("\tPlatform Version: %s\n", c);
     clGetDeviceInfo(gDevice, CL_DEVICE_NAME, sizeof(c), &c, NULL);
-    vlog( "\tDevice Name: %s\n", c );
+    vlog("\tDevice Name: %s\n", c);
     clGetDeviceInfo(gDevice, CL_DEVICE_VENDOR, sizeof(c), &c, NULL);
-    vlog( "\tVendor: %s\n", c );
+    vlog("\tVendor: %s\n", c);
     clGetDeviceInfo(gDevice, CL_DEVICE_VERSION, sizeof(c), &c, NULL);
-    vlog( "\tDevice Version: %s\n", c );
+    vlog("\tDevice Version: %s\n", c);
     clGetDeviceInfo(gDevice, CL_DEVICE_OPENCL_C_VERSION, sizeof(c), &c, NULL);
-    vlog( "\tCL C Version: %s\n", c );
+    vlog("\tCL C Version: %s\n", c);
     clGetDeviceInfo(gDevice, CL_DRIVER_VERSION, sizeof(c), &c, NULL);
-    vlog( "\tDriver Version: %s\n", c );
-    vlog( "\tDevice Frequency: %d MHz\n", gDeviceFrequency );
-    vlog( "\tSubnormal values supported for floats? %s\n", no_yes[0 != (CL_FP_DENORM & gFloatCapabilities)] );
-    vlog( "\tCorrectly rounded divide and sqrt supported for floats? %s\n", no_yes[0 != (CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT & gFloatCapabilities)] );
-    if( gToggleCorrectlyRoundedDivideSqrt )
+    vlog("\tDriver Version: %s\n", c);
+    vlog("\tDevice Frequency: %d MHz\n", gDeviceFrequency);
+    vlog("\tSubnormal values supported for floats? %s\n",
+         no_yes[0 != (CL_FP_DENORM & gFloatCapabilities)]);
+    vlog("\tCorrectly rounded divide and sqrt supported for floats? %s\n",
+         no_yes[0
+                != (CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT & gFloatCapabilities)]);
+    if (gToggleCorrectlyRoundedDivideSqrt)
     {
         gFloatCapabilities ^= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT;
     }
-    vlog( "\tTesting with correctly rounded float divide and sqrt? %s\n", no_yes[0 != (CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT & gFloatCapabilities)] );
-    vlog( "\tTesting with FTZ mode ON for floats? %s\n", no_yes[0 != gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities)] );
-    vlog( "\tTesting single precision? %s\n", no_yes[0 != gTestFloat] );
-    vlog( "\tTesting fast relaxed math? %s\n", no_yes[0 != gTestFastRelaxed] );
-    if(gTestFastRelaxed)
+    vlog("\tTesting with correctly rounded float divide and sqrt? %s\n",
+         no_yes[0
+                != (CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT & gFloatCapabilities)]);
+    vlog("\tTesting with FTZ mode ON for floats? %s\n",
+         no_yes[0 != gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities)]);
+    vlog("\tTesting single precision? %s\n", no_yes[0 != gTestFloat]);
+    vlog("\tTesting fast relaxed math? %s\n", no_yes[0 != gTestFastRelaxed]);
+    if (gTestFastRelaxed)
     {
-      vlog( "\tFast relaxed math has derived implementations? %s\n", no_yes[0 != gFastRelaxedDerived] );
+        vlog("\tFast relaxed math has derived implementations? %s\n",
+             no_yes[0 != gFastRelaxedDerived]);
     }
-    vlog( "\tTesting double precision? %s\n", no_yes[0 != gHasDouble] );
-    if( sizeof( long double) == sizeof( double ) && gHasDouble )
+    vlog("\tTesting double precision? %s\n", no_yes[0 != gHasDouble]);
+    if (sizeof(long double) == sizeof(double) && gHasDouble)
     {
-        vlog( "\n\t\tWARNING: Host system long double does not have better precision than double!\n" );
-        vlog( "\t\t         All double results that do not match the reference result have their reported\n" );
-        vlog( "\t\t         error inflated by 0.5 ulps to account for the fact that this system\n" );
-        vlog( "\t\t         can not accurately represent the right result to an accuracy closer\n" );
-        vlog( "\t\t         than half an ulp. See comments in Bruteforce_Ulp_Error_Double() for more details.\n\n" );
+        vlog("\n\t\tWARNING: Host system long double does not have better "
+             "precision than double!\n");
+        vlog("\t\t         All double results that do not match the reference "
+             "result have their reported\n");
+        vlog("\t\t         error inflated by 0.5 ulps to account for the fact "
+             "that this system\n");
+        vlog("\t\t         can not accurately represent the right result to an "
+             "accuracy closer\n");
+        vlog("\t\t         than half an ulp. See comments in "
+             "Bruteforce_Ulp_Error_Double() for more details.\n\n");
     }
 
-    vlog( "\tIs Embedded? %s\n", no_yes[0 != gIsEmbedded] );
-    if( gIsEmbedded )
-        vlog( "\tRunning in RTZ mode? %s\n", no_yes[0 != gIsInRTZMode] );
-    vlog( "\tTininess is detected before rounding? %s\n", no_yes[0 != gCheckTininessBeforeRounding] );
-    vlog( "\tWorker threads: %d\n", GetThreadCount() );
-    vlog( "\tTesting vector sizes:" );
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        vlog( "\t%d", sizeValues[i] );
+    vlog("\tIs Embedded? %s\n", no_yes[0 != gIsEmbedded]);
+    if (gIsEmbedded)
+        vlog("\tRunning in RTZ mode? %s\n", no_yes[0 != gIsInRTZMode]);
+    vlog("\tTininess is detected before rounding? %s\n",
+         no_yes[0 != gCheckTininessBeforeRounding]);
+    vlog("\tWorker threads: %d\n", GetThreadCount());
+    vlog("\tTesting vector sizes:");
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+        vlog("\t%d", sizeValues[i]);
 
     vlog("\n");
     vlog("\tVerbose? %s\n", no_yes[0 != gVerboseBruteForce]);
-    vlog( "\n\n" );
+    vlog("\n\n");
 
-    // Check to see if we are using single threaded mode on other than a 1.0 device
-    if (getenv( "CL_TEST_SINGLE_THREADED" )) {
+    // Check to see if we are using single threaded mode on other than a 1.0
+    // device
+    if (getenv("CL_TEST_SINGLE_THREADED"))
+    {
 
-      char device_version[1024] = { 0 };
-      clGetDeviceInfo( gDevice, CL_DEVICE_VERSION, sizeof(device_version), device_version, NULL );
+        char device_version[1024] = { 0 };
+        clGetDeviceInfo(gDevice, CL_DEVICE_VERSION, sizeof(device_version),
+                        device_version, NULL);
 
-      if (strcmp("OpenCL 1.0 ",device_version)) {
-        vlog("ERROR: CL_TEST_SINGLE_THREADED is set in the environment. Running single threaded.\n");
-      }
+        if (strcmp("OpenCL 1.0 ", device_version))
+        {
+            vlog("ERROR: CL_TEST_SINGLE_THREADED is set in the environment. "
+                 "Running single threaded.\n");
+        }
     }
 
     return TEST_PASS;
 }
 
-static void ReleaseCL( void )
+static void ReleaseCL(void)
 {
     uint32_t i;
     clReleaseMemObject(gInBuffer);
     clReleaseMemObject(gInBuffer2);
     clReleaseMemObject(gInBuffer3);
-    for ( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) {
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
         clReleaseMemObject(gOutBuffer[i]);
         clReleaseMemObject(gOutBuffer2[i]);
     }
@@ -1364,25 +1426,27 @@ static void ReleaseCL( void )
     align_free(gOut_Ref);
     align_free(gOut_Ref2);
 
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         align_free(gOut[i]);
         align_free(gOut2[i]);
     }
 }
 
-void _LogBuildError( cl_program p, int line, const char *file )
+void _LogBuildError(cl_program p, int line, const char *file)
 {
     char the_log[2048] = "";
 
-    vlog_error( "%s:%d: Build Log:\n", file, line );
-    if( 0 == clGetProgramBuildInfo(p, gDevice, CL_PROGRAM_BUILD_LOG, sizeof(the_log), the_log, NULL) )
-        vlog_error( "%s", the_log );
+    vlog_error("%s:%d: Build Log:\n", file, line);
+    if (0
+        == clGetProgramBuildInfo(p, gDevice, CL_PROGRAM_BUILD_LOG,
+                                 sizeof(the_log), the_log, NULL))
+        vlog_error("%s", the_log);
     else
-        vlog_error( "*** Error getting build log for program %p\n", p );
+        vlog_error("*** Error getting build log for program %p\n", p);
 }
 
-int InitILogbConstants( void )
+int InitILogbConstants(void)
 {
     int error;
     const char *kernelSource =
@@ -1408,7 +1472,9 @@ int InitILogbConstants( void )
              clSetKernelArg(kernel, 0, sizeof(gOutBuffer[gMinVectorSizeIndex]),
                             &gOutBuffer[gMinVectorSizeIndex])))
     {
-        vlog_error( "Error: Unable to set kernel arg to get FP_ILOGB0 and FP_ILOGBNAN for the device. Err = %d", error );
+        vlog_error("Error: Unable to set kernel arg to get FP_ILOGB0 and "
+                   "FP_ILOGBNAN for the device. Err = %d",
+                   error);
         return error;
     }
 
@@ -1416,14 +1482,23 @@ int InitILogbConstants( void )
     if ((error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &dim, NULL, 0,
                                         NULL, NULL)))
     {
-        vlog_error( "Error: Unable to execute kernel to get FP_ILOGB0 and FP_ILOGBNAN for the device. Err = %d", error );
+        vlog_error("Error: Unable to execute kernel to get FP_ILOGB0 and "
+                   "FP_ILOGBNAN for the device. Err = %d",
+                   error);
         return error;
     }
 
-    struct{ cl_int ilogb0, ilogbnan; }data;
-    if(( error = clEnqueueReadBuffer( gQueue, gOutBuffer[gMinVectorSizeIndex], CL_TRUE, 0, sizeof( data ), &data, 0, NULL, NULL)))
+    struct
     {
-        vlog_error( "Error: unable to read FP_ILOGB0 and FP_ILOGBNAN from the device. Err = %d", error );
+        cl_int ilogb0, ilogbnan;
+    } data;
+    if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer[gMinVectorSizeIndex],
+                                     CL_TRUE, 0, sizeof(data), &data, 0, NULL,
+                                     NULL)))
+    {
+        vlog_error("Error: unable to read FP_ILOGB0 and FP_ILOGBNAN from the "
+                   "device. Err = %d",
+                   error);
         return error;
     }
 
@@ -1433,7 +1508,7 @@ int InitILogbConstants( void )
     return 0;
 }
 
-int IsTininessDetectedBeforeRounding( void )
+int IsTininessDetectedBeforeRounding(void)
 {
     int error;
     const char *kernelSource =
@@ -1449,7 +1524,8 @@ int IsTininessDetectedBeforeRounding( void )
     error =
         create_single_kernel_helper(gContext, &query, &kernel, 1, &kernelSource,
                                     "IsTininessDetectedBeforeRounding");
-    if (error != CL_SUCCESS) {
+    if (error != CL_SUCCESS)
+    {
         vlog_error("Error: Unable to create kernel to detect how tininess is "
                    "detected for the device. (%d)",
                    error);
@@ -1460,7 +1536,9 @@ int IsTininessDetectedBeforeRounding( void )
              clSetKernelArg(kernel, 0, sizeof(gOutBuffer[gMinVectorSizeIndex]),
                             &gOutBuffer[gMinVectorSizeIndex])))
     {
-        vlog_error( "Error: Unable to set kernel arg to detect how tininess is detected  for the device. Err = %d", error );
+        vlog_error("Error: Unable to set kernel arg to detect how tininess is "
+                   "detected  for the device. Err = %d",
+                   error);
         return error;
     }
 
@@ -1468,14 +1546,23 @@ int IsTininessDetectedBeforeRounding( void )
     if ((error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &dim, NULL, 0,
                                         NULL, NULL)))
     {
-        vlog_error( "Error: Unable to execute kernel to detect how tininess is detected  for the device. Err = %d", error );
+        vlog_error("Error: Unable to execute kernel to detect how tininess is "
+                   "detected  for the device. Err = %d",
+                   error);
         return error;
     }
 
-    struct{ cl_uint f; }data;
-    if(( error = clEnqueueReadBuffer( gQueue, gOutBuffer[gMinVectorSizeIndex], CL_TRUE, 0, sizeof( data ), &data, 0, NULL, NULL)))
+    struct
     {
-        vlog_error( "Error: unable to read result from tininess test from the device. Err = %d", error );
+        cl_uint f;
+    } data;
+    if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer[gMinVectorSizeIndex],
+                                     CL_TRUE, 0, sizeof(data), &data, 0, NULL,
+                                     NULL)))
+    {
+        vlog_error("Error: unable to read result from tininess test from the "
+                   "device. Err = %d",
+                   error);
         return error;
     }
 
@@ -1491,14 +1578,14 @@ int MakeKernel(const char **c, cl_uint count, const char *name, cl_kernel *k,
     int error = 0;
     char options[200] = "";
 
-    if( gForceFTZ )
+    if (gForceFTZ)
     {
-      strcat(options," -cl-denorms-are-zero");
+        strcat(options, " -cl-denorms-are-zero");
     }
 
     if (relaxedMode)
     {
-      strcat(options, " -cl-fast-relaxed-math");
+        strcat(options, " -cl-fast-relaxed-math");
     }
 
     error =
@@ -1522,39 +1609,41 @@ int MakeKernels(const char **c, cl_uint count, const char *name,
 
     if (gForceFTZ)
     {
-      strcat(options," -cl-denorms-are-zero ");
+        strcat(options, " -cl-denorms-are-zero ");
     }
 
-    if( gFloatCapabilities & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT )
+    if (gFloatCapabilities & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT)
     {
-      strcat(options," -cl-fp32-correctly-rounded-divide-sqrt ");
+        strcat(options, " -cl-fp32-correctly-rounded-divide-sqrt ");
     }
 
     if (relaxedMode)
     {
-      strcat(options, " -cl-fast-relaxed-math");
+        strcat(options, " -cl-fast-relaxed-math");
     }
 
-    error = create_single_kernel_helper(gContext, p, NULL, count, c, NULL, options);
-    if ( error != CL_SUCCESS )
+    error =
+        create_single_kernel_helper(gContext, p, NULL, count, c, NULL, options);
+    if (error != CL_SUCCESS)
     {
-        vlog_error( "\t\tFAILED -- Failed to create program. (%d)\n", error );
+        vlog_error("\t\tFAILED -- Failed to create program. (%d)\n", error);
         return error;
     }
 
 
-    memset( k, 0, kernel_count * sizeof( *k) );
-    for( i = 0; i< kernel_count; i++ )
+    memset(k, 0, kernel_count * sizeof(*k));
+    for (i = 0; i < kernel_count; i++)
     {
-        k[i] = clCreateKernel( *p, name, &error );
-        if( NULL == k[i]|| error )
+        k[i] = clCreateKernel(*p, name, &error);
+        if (NULL == k[i] || error)
         {
-            char    buffer[2048] = "";
+            char buffer[2048] = "";
 
             vlog_error("\t\tFAILED -- clCreateKernel() failed: (%d)\n", error);
-            clGetProgramBuildInfo(*p, gDevice, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
+            clGetProgramBuildInfo(*p, gDevice, CL_PROGRAM_BUILD_LOG,
+                                  sizeof(buffer), buffer, NULL);
             vlog_error("Log: %s\n", buffer);
-            clReleaseProgram( *p );
+            clReleaseProgram(*p);
             return error;
         }
     }
@@ -1563,7 +1652,7 @@ int MakeKernels(const char **c, cl_uint count, const char *name,
 }
 
 
-static int IsInRTZMode( void )
+static int IsInRTZMode(void)
 {
     int error;
     const char *kernelSource =
@@ -1578,7 +1667,8 @@ static int IsInRTZMode( void )
     clKernelWrapper kernel;
     error = create_single_kernel_helper(gContext, &query, &kernel, 1,
                                         &kernelSource, "GetRoundingMode");
-    if (error != CL_SUCCESS) {
+    if (error != CL_SUCCESS)
+    {
         vlog_error("Error: Unable to create kernel to detect RTZ mode for the "
                    "device. (%d)",
                    error);
@@ -1589,7 +1679,9 @@ static int IsInRTZMode( void )
              clSetKernelArg(kernel, 0, sizeof(gOutBuffer[gMinVectorSizeIndex]),
                             &gOutBuffer[gMinVectorSizeIndex])))
     {
-        vlog_error( "Error: Unable to set kernel arg to detect RTZ mode for the device. Err = %d", error );
+        vlog_error("Error: Unable to set kernel arg to detect RTZ mode for the "
+                   "device. Err = %d",
+                   error);
         return error;
     }
 
@@ -1597,14 +1689,23 @@ static int IsInRTZMode( void )
     if ((error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &dim, NULL, 0,
                                         NULL, NULL)))
     {
-        vlog_error( "Error: Unable to execute kernel to detect RTZ mode for the device. Err = %d", error );
+        vlog_error("Error: Unable to execute kernel to detect RTZ mode for the "
+                   "device. Err = %d",
+                   error);
         return error;
     }
 
-    struct{ cl_int isRTZ; }data;
-    if(( error = clEnqueueReadBuffer( gQueue, gOutBuffer[gMinVectorSizeIndex], CL_TRUE, 0, sizeof( data ), &data, 0, NULL, NULL)))
+    struct
     {
-        vlog_error( "Error: unable to read RTZ mode data from the device. Err = %d", error );
+        cl_int isRTZ;
+    } data;
+    if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer[gMinVectorSizeIndex],
+                                     CL_TRUE, 0, sizeof(data), &data, 0, NULL,
+                                     NULL)))
+    {
+        vlog_error(
+            "Error: unable to read RTZ mode data from the device. Err = %d",
+            error);
         return error;
     }
 
@@ -1613,46 +1714,54 @@ static int IsInRTZMode( void )
 
 #pragma mark -
 
-const char *sizeNames[ VECTOR_SIZE_COUNT] = { "", "2", "3", "4", "8", "16" };
-const int  sizeValues[ VECTOR_SIZE_COUNT] = { 1, 2, 3, 4, 8, 16 };
+const char *sizeNames[VECTOR_SIZE_COUNT] = { "", "2", "3", "4", "8", "16" };
+const int sizeValues[VECTOR_SIZE_COUNT] = { 1, 2, 3, 4, 8, 16 };
 
-// TODO: There is another version of Ulp_Error_Double defined in test_common/harness/errorHelpers.c
-float Bruteforce_Ulp_Error_Double( double test, long double reference )
+// TODO: There is another version of Ulp_Error_Double defined in
+// test_common/harness/errorHelpers.c
+float Bruteforce_Ulp_Error_Double(double test, long double reference)
 {
-//Check for Non-power-of-two and NaN
+    // Check for Non-power-of-two and NaN
 
-  // Note: This function presumes that someone has already tested whether the result is correctly,
-  // rounded before calling this function.  That test:
-  //
-  //    if( (float) reference == test )
-  //        return 0.0f;
-  //
-  // would ensure that cases like fabs(reference) > FLT_MAX are weeded out before we get here.
-  // Otherwise, we'll return inf ulp error here, for what are otherwise correctly rounded
-  // results.
+    // Note: This function presumes that someone has already tested whether the
+    // result is correctly rounded before calling this function.  That test:
+    //
+    //    if( (float) reference == test )
+    //        return 0.0f;
+    //
+    // would ensure that cases like fabs(reference) > FLT_MAX are weeded out
+    // before we get here. Otherwise, we'll return inf ulp error here, for what
+    // are otherwise correctly rounded results.
 
-  // Deal with long double = double
-  // On most systems long double is a higher precision type than double. They provide either
-  // a 80-bit or greater floating point type, or they provide a head-tail double double format.
-  // That is sufficient to represent the accuracy of a floating point result to many more bits
-  // than double and we can calculate sub-ulp errors. This is the standard system for which this
-  // test suite is designed.
-  //
-  // On some systems double and long double are the same thing. Then we run into a problem,
-  // because our representation of the infinitely precise result (passed in as reference above)
-  // can be off by as much as a half double precision ulp itself.  In this case, we inflate the
-  // reported error by half an ulp to take this into account.  A more correct and permanent fix
-  // would be to undertake refactoring the reference code to return results in this format:
-  //
-  //    typedef struct DoubleReference
-  //    { // true value = correctlyRoundedResult + ulps * ulp(correctlyRoundedResult)        (infinitely precise)
-  //        double  correctlyRoundedResult;     // as best we can
-  //        double  ulps;                       // plus a fractional amount to account for the difference
-  //    }DoubleReference;                       //     between infinitely precise result and correctlyRoundedResult, in units of ulps.
-  //
-  // This would provide a useful higher-than-double precision format for everyone that we can use,
-  // and would solve a few problems with representing absolute errors below DBL_MIN and over DBL_MAX for systems
-  // that use a head to tail double double for long double.
+    // Deal with long double = double
+    // On most systems long double is a higher precision type than double. They
+    // provide either a 80-bit or greater floating point type, or they provide a
+    // head-tail double double format. That is sufficient to represent the
+    // accuracy of a floating point result to many more bits than double and we
+    // can calculate sub-ulp errors. This is the standard system for which this
+    // test suite is designed.
+    //
+    // On some systems double and long double are the same thing. Then we run
+    // into a problem, because our representation of the infinitely precise
+    // result (passed in as reference above) can be off by as much as a half
+    // double precision ulp itself.  In this case, we inflate the reported error
+    // by half an ulp to take this into account.  A more correct and permanent
+    // fix would be to undertake refactoring the reference code to return
+    // results in this format:
+    //
+    //    typedef struct DoubleReference
+    //    { // true value (infinitely precise) =
+    //      //     correctlyRoundedResult + ulps * ulp(correctlyRoundedResult)
+    //        double  correctlyRoundedResult;     // as best we can
+    //        double  ulps; // plus a fractional amount to account for the
+    //                      // difference between infinitely precise result and
+    //                      // correctlyRoundedResult, in units of ulps.
+    //    }DoubleReference;
+    //
+    // This would provide a useful higher-than-double precision format for
+    // everyone that we can use, and would solve a few problems with
+    // representing absolute errors below DBL_MIN and over DBL_MAX for systems
+    // that use a head to tail double double for long double.
 
     int x;
     long double testVal = test;
@@ -1660,119 +1769,118 @@ float Bruteforce_Ulp_Error_Double( double test, long double reference )
     // First, handle special reference values
     if (isinf(reference))
     {
-    if (reference == testVal)
-        return 0.0f;
+        if (reference == testVal) return 0.0f;
 
-    return INFINITY;
+        return INFINITY;
     }
 
     if (isnan(reference))
     {
-    if (isnan(testVal))
-        return 0.0f;
+        if (isnan(testVal)) return 0.0f;
 
-    return INFINITY;
+        return INFINITY;
     }
 
-    if ( 0.0L != reference && 0.5L != frexpl(reference, &x) )
+    if (0.0L != reference && 0.5L != frexpl(reference, &x))
     { // Non-zero and Non-power of two
 
-       // allow correctly rounded results to pass through unmolested. (We might add error to it below.)
-       // There is something of a performance optimization here.
-        if( testVal == reference )
-            return 0.0f;
+        // allow correctly rounded results to pass through unmolested. (We might
+        // add error to it below.) There is something of a performance
+        // optimization here.
+        if (testVal == reference) return 0.0f;
 
         // The unbiased exponent of the ulp unit place
-        int ulp_exp = DBL_MANT_DIG - 1 - MAX( ilogbl( reference), DBL_MIN_EXP-1 );
+        int ulp_exp =
+            DBL_MANT_DIG - 1 - MAX(ilogbl(reference), DBL_MIN_EXP - 1);
 
         // Scale the exponent of the error
-        float result = (float) scalbnl( testVal - reference, ulp_exp );
+        float result = (float)scalbnl(testVal - reference, ulp_exp);
 
-        // account for rounding error in reference result on systems that do not have a higher precision floating point type (see above)
-        if( sizeof(long double) == sizeof( double ) )
-            result += copysignf( 0.5f, result);
+        // account for rounding error in reference result on systems that do not
+        // have a higher precision floating point type (see above)
+        if (sizeof(long double) == sizeof(double))
+            result += copysignf(0.5f, result);
 
         return result;
     }
 
     // reference is a normal power of two or a zero
     // The unbiased exponent of the ulp unit place
-    int ulp_exp =  DBL_MANT_DIG - 1 - MAX( ilogbl( reference) - 1, DBL_MIN_EXP-1 );
+    int ulp_exp =
+        DBL_MANT_DIG - 1 - MAX(ilogbl(reference) - 1, DBL_MIN_EXP - 1);
 
-   // allow correctly rounded results to pass through unmolested. (We might add error to it below.)
-   // There is something of a performance optimization here too.
-    if( testVal == reference )
-        return 0.0f;
+    // allow correctly rounded results to pass through unmolested. (We might add
+    // error to it below.) There is something of a performance optimization here
+    // too.
+    if (testVal == reference) return 0.0f;
 
     // Scale the exponent of the error
-    float result = (float) scalbnl( testVal - reference, ulp_exp );
+    float result = (float)scalbnl(testVal - reference, ulp_exp);
 
-    // account for rounding error in reference result on systems that do not have a higher precision floating point type (see above)
-    if( sizeof(long double) == sizeof( double ) )
-        result += copysignf( 0.5f, result);
+    // account for rounding error in reference result on systems that do not
+    // have a higher precision floating point type (see above)
+    if (sizeof(long double) == sizeof(double))
+        result += copysignf(0.5f, result);
 
     return result;
 }
 
-float Abs_Error( float test, double reference )
+float Abs_Error(float test, double reference)
 {
-  if( isnan(test) && isnan(reference) )
-    return 0.0f;
-  return fabs((float)(reference-(double)test));
+    if (isnan(test) && isnan(reference)) return 0.0f;
+    return fabs((float)(reference - (double)test));
 }
 
-#if defined( __APPLE__ )
-    #include <mach/mach_time.h>
+#if defined(__APPLE__)
+#include <mach/mach_time.h>
 #endif
 
-uint64_t GetTime( void )
+uint64_t GetTime(void)
 {
-#if defined( __APPLE__ )
+#if defined(__APPLE__)
     return mach_absolute_time();
 #elif defined(_WIN32) && defined(_MSC_VER)
-    return  ReadTime();
+    return ReadTime();
 #else
-    //mach_absolute_time is a high precision timer with precision < 1 microsecond.
-    #warning need accurate clock here.  Times are invalid.
+// mach_absolute_time is a high precision timer with precision < 1 microsecond.
+#warning need accurate clock here.  Times are invalid.
     return 0;
 #endif
 }
 
 
-#if defined(_WIN32) && defined (_MSC_VER)
+#if defined(_WIN32) && defined(_MSC_VER)
 /* function is defined in "compat.h" */
 #else
-double SubtractTime( uint64_t endTime, uint64_t startTime )
+double SubtractTime(uint64_t endTime, uint64_t startTime)
 {
     uint64_t diff = endTime - startTime;
     static double conversion = 0.0;
 
-    if( 0.0 == conversion )
+    if (0.0 == conversion)
     {
-#if defined( __APPLE__ )
-        mach_timebase_info_data_t info = {0,0};
-        kern_return_t   err = mach_timebase_info( &info );
-        if( 0 == err )
-            conversion = 1e-9 * (double) info.numer / (double) info.denom;
+#if defined(__APPLE__)
+        mach_timebase_info_data_t info = { 0, 0 };
+        kern_return_t err = mach_timebase_info(&info);
+        if (0 == err)
+            conversion = 1e-9 * (double)info.numer / (double)info.denom;
 #else
-    // This function consumes output from GetTime() above, and converts the time to secionds.
-    #warning need accurate ticks to seconds conversion factor here. Times are invalid.
+// This function consumes output from GetTime() above, and converts the time to
+// seconds.
+#warning need accurate ticks to seconds conversion factor here. Times are invalid.
 #endif
     }
 
     // strictly speaking we should also be subtracting out timer latency here
-    return conversion * (double) diff;
+    return conversion * (double)diff;
 }
 #endif
 
-cl_uint RoundUpToNextPowerOfTwo( cl_uint x )
+cl_uint RoundUpToNextPowerOfTwo(cl_uint x)
 {
-    if( 0 == (x & (x-1)))
-        return x;
+    if (0 == (x & (x - 1))) return x;
 
-    while( x & (x-1) )
-        x &= x-1;
+    while (x & (x - 1)) x &= x - 1;
 
-    return x+x;
+    return x + x;
 }
-
diff --git a/test_conformance/math_brute_force/reference_math.cpp b/test_conformance/math_brute_force/reference_math.cpp
index 01c99c14..1a5a6690 100644
--- a/test_conformance/math_brute_force/reference_math.cpp
+++ b/test_conformance/math_brute_force/reference_math.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -23,46 +23,47 @@
 
 #include "Utility.h"
 
-#if defined( __SSE__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64)))
-    #include <xmmintrin.h>
+#if defined(__SSE__)                                                           \
+    || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
+#include <xmmintrin.h>
 #endif
-#if defined( __SSE2__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64)))
-    #include <emmintrin.h>
+#if defined(__SSE2__)                                                          \
+    || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
+#include <emmintrin.h>
 #endif
 
 #ifndef M_PI_4
-    #define M_PI_4 (M_PI/4)
+#define M_PI_4 (M_PI / 4)
 #endif
 
-#define EVALUATE( x )       x
-#define CONCATENATE(x, y)  x ## EVALUATE(y)
+#define EVALUATE(x) x
+#define CONCATENATE(x, y) x##EVALUATE(y)
 
 #pragma STDC FP_CONTRACT OFF
 static void __log2_ep(double *hi, double *lo, double x);
 
-typedef union
-{
+typedef union {
     uint64_t i;
     double d;
-}uint64d_t;
+} uint64d_t;
 
 static const uint64d_t _CL_NAN = { 0x7ff8000000000000ULL };
 
 #define cl_make_nan() _CL_NAN.d
 
-static double reduce1( double x );
-static double reduce1( double x )
+static double reduce1(double x);
+static double reduce1(double x)
 {
-    if( fabs(x) >= HEX_DBL( +, 1, 0, +, 53 ) )
+    if (fabs(x) >= HEX_DBL(+, 1, 0, +, 53))
     {
-        if( fabs(x) == INFINITY )
-            return cl_make_nan();
+        if (fabs(x) == INFINITY) return cl_make_nan();
 
-        return 0.0; //we patch up the sign for sinPi and cosPi later, since they need different signs
+        return 0.0; // we patch up the sign for sinPi and cosPi later, since
+                    // they need different signs
     }
 
     // Find the nearest multiple of 2
-    const double r = copysign( HEX_DBL( +, 1, 0, +, 53 ), x );
+    const double r = copysign(HEX_DBL(+, 1, 0, +, 53), x);
     double z = x + r;
     z -= r;
 
@@ -79,7 +80,8 @@ static double reduceHalf( double x )
         if( fabs(x) == INFINITY )
             return cl_make_nan();
 
-        return 0.0; //we patch up the sign for sinPi and cosPi later, since they need different signs
+        return 0.0; //we patch up the sign for sinPi and cosPi later, since they
+need different signs
     }
 
     // Find the nearest multiple of 1
@@ -92,362 +94,384 @@ static double reduceHalf( double x )
 }
 */
 
-double reference_acospi( double x) {  return reference_acos( x ) / M_PI;    }
-double reference_asinpi( double x) {  return reference_asin( x ) / M_PI;    }
-double reference_atanpi( double x) {  return reference_atan( x ) / M_PI;    }
-double reference_atan2pi( double y, double x ) { return reference_atan2( y, x) / M_PI; }
-double reference_cospi( double x)
+double reference_acospi(double x) { return reference_acos(x) / M_PI; }
+double reference_asinpi(double x) { return reference_asin(x) / M_PI; }
+double reference_atanpi(double x) { return reference_atan(x) / M_PI; }
+double reference_atan2pi(double y, double x)
 {
-    if( reference_fabs(x) >= HEX_DBL( +, 1, 0, +, 52 ) )
+    return reference_atan2(y, x) / M_PI;
+}
+double reference_cospi(double x)
+{
+    if (reference_fabs(x) >= HEX_DBL(+, 1, 0, +, 52))
     {
-        if( reference_fabs(x) == INFINITY )
-            return cl_make_nan();
+        if (reference_fabs(x) == INFINITY) return cl_make_nan();
 
-        //Note this probably fails for odd values between 0x1.0p52 and 0x1.0p53.
-        //However, when starting with single precision inputs, there will be no odd values.
+        // Note this probably fails for odd values between 0x1.0p52 and
+        // 0x1.0p53. However, when starting with single precision inputs, there
+        // will be no odd values.
 
         return 1.0;
     }
 
-    x = reduce1(x+0.5);
+    x = reduce1(x + 0.5);
 
     // reduce to [-0.5, 0.5]
-    if( x < -0.5 )
+    if (x < -0.5)
         x = -1 - x;
-    else if ( x > 0.5 )
+    else if (x > 0.5)
         x = 1 - x;
 
     // cosPi zeros are all +0
-    if( x == 0.0 )
-        return 0.0;
+    if (x == 0.0) return 0.0;
 
-    return reference_sin( x * M_PI );
+    return reference_sin(x * M_PI);
 }
 
 double reference_relaxed_cospi(double x) { return reference_cospi(x); }
 
-double reference_relaxed_divide( double x, double y ) { return (float)(((float) x ) / ( (float) y )); }
+double reference_relaxed_divide(double x, double y)
+{
+    return (float)(((float)x) / ((float)y));
+}
 
-double reference_divide( double x, double y ) { return x / y; }
+double reference_divide(double x, double y) { return x / y; }
 
 // Add a + b. If the result modulo overflowed, write 1 to *carry, otherwise 0
-static inline cl_ulong  add_carry( cl_ulong a, cl_ulong b, cl_ulong *carry )
+static inline cl_ulong add_carry(cl_ulong a, cl_ulong b, cl_ulong *carry)
 {
     cl_ulong result = a + b;
     *carry = result < a;
     return result;
 }
 
-// Subtract a - b. If the result modulo overflowed, write 1 to *carry, otherwise 0
-static inline cl_ulong  sub_carry( cl_ulong a, cl_ulong b, cl_ulong *carry )
+// Subtract a - b. If the result modulo overflowed, write 1 to *carry, otherwise
+// 0
+static inline cl_ulong sub_carry(cl_ulong a, cl_ulong b, cl_ulong *carry)
 {
     cl_ulong result = a - b;
     *carry = result > a;
     return result;
 }
 
-static float fallback_frexpf( float x, int *iptr )
+static float fallback_frexpf(float x, int *iptr)
 {
     cl_uint u, v;
     float fu, fv;
 
-    memcpy( &u, &x, sizeof(u));
+    memcpy(&u, &x, sizeof(u));
 
-    cl_uint exponent = u &  0x7f800000U;
+    cl_uint exponent = u & 0x7f800000U;
     cl_uint mantissa = u & ~0x7f800000U;
 
     // add 1 to the exponent
     exponent += 0x00800000U;
 
-    if( (cl_int) exponent < (cl_int) 0x01000000 )
+    if ((cl_int)exponent < (cl_int)0x01000000)
     { // subnormal, NaN, Inf
         mantissa |= 0x3f000000U;
 
         v = mantissa & 0xff800000U;
         u = mantissa;
-        memcpy( &fv, &v, sizeof(v));
-        memcpy( &fu, &u, sizeof(u));
+        memcpy(&fv, &v, sizeof(v));
+        memcpy(&fu, &u, sizeof(u));
 
         fu -= fv;
 
-        memcpy( &v, &fv, sizeof(v));
-        memcpy( &u, &fu, sizeof(u));
+        memcpy(&v, &fv, sizeof(v));
+        memcpy(&u, &fu, sizeof(u));
 
-        exponent = u &  0x7f800000U;
+        exponent = u & 0x7f800000U;
         mantissa = u & ~0x7f800000U;
 
-        *iptr = (exponent >> 23) + (-126 + 1 -126);
+        *iptr = (exponent >> 23) + (-126 + 1 - 126);
         u = mantissa | 0x3f000000U;
-        memcpy( &fu, &u, sizeof(u));
+        memcpy(&fu, &u, sizeof(u));
         return fu;
     }
 
     *iptr = (exponent >> 23) - 127;
     u = mantissa | 0x3f000000U;
-    memcpy( &fu, &u, sizeof(u));
+    memcpy(&fu, &u, sizeof(u));
     return fu;
 }
 
-static inline int extractf( float, cl_uint * );
-static inline int extractf( float x, cl_uint *mant )
+static inline int extractf(float, cl_uint *);
+static inline int extractf(float x, cl_uint *mant)
 {
-    static float (*frexppf)(float, int*) = NULL;
+    static float (*frexppf)(float, int *) = NULL;
     int e;
 
     // verify that frexp works properly
-    if( NULL == frexppf )
+    if (NULL == frexppf)
     {
-        if( 0.5f == frexpf( HEX_FLT( +, 1, 0, -, 130 ), &e ) && e == -129 )
+        if (0.5f == frexpf(HEX_FLT(+, 1, 0, -, 130), &e) && e == -129)
             frexppf = frexpf;
         else
             frexppf = fallback_frexpf;
     }
 
-    *mant = (cl_uint) (HEX_FLT( +, 1, 0, +, 32 ) * fabsf( frexppf( x, &e )));
+    *mant = (cl_uint)(HEX_FLT(+, 1, 0, +, 32) * fabsf(frexppf(x, &e)));
     return e - 1;
 }
 
-// Shift right by shift bits. Any bits lost on the right side are bitwise OR'd together and ORd into the LSB of the result
-static inline void shift_right_sticky_64( cl_ulong *p, int shift );
-static inline void shift_right_sticky_64( cl_ulong *p, int shift )
+// Shift right by shift bits. Any bits lost on the right side are bitwise OR'd
+// together and OR'd into the LSB of the result
+static inline void shift_right_sticky_64(cl_ulong *p, int shift);
+static inline void shift_right_sticky_64(cl_ulong *p, int shift)
 {
     cl_ulong sticky = 0;
     cl_ulong r = *p;
 
     // C doesn't handle shifts greater than the size of the variable dependably
-    if( shift >= 64 )
+    if (shift >= 64)
     {
         sticky |= (0 != r);
         r = 0;
     }
     else
     {
-        sticky |= (0 != (r << (64-shift)));
+        sticky |= (0 != (r << (64 - shift)));
         r >>= shift;
     }
 
     *p = r | sticky;
 }
 
-// Add two 64 bit mantissas. Bits that are below the LSB of the result are OR'd into the LSB of the result
-static inline void add64( cl_ulong *p, cl_ulong c, int *exponent );
-static inline void add64( cl_ulong *p, cl_ulong c, int *exponent )
+// Add two 64 bit mantissas. Bits that are below the LSB of the result are OR'd
+// into the LSB of the result
+static inline void add64(cl_ulong *p, cl_ulong c, int *exponent);
+static inline void add64(cl_ulong *p, cl_ulong c, int *exponent)
 {
     cl_ulong carry;
     c = add_carry(c, *p, &carry);
-    if( carry )
+    if (carry)
     {
-        carry = c & 1;                              // set aside sticky bit
-        c >>= 1;                                    // right shift to deal with overflow
-        c |= carry | 0x8000000000000000ULL;         // or in carry bit, and sticky bit. The latter is to prevent rounding from believing we are exact half way case
-        *exponent = *exponent + 1;                  // adjust exponent
+        carry = c & 1; // set aside sticky bit
+        c >>= 1; // right shift to deal with overflow
+        c |= carry
+            | 0x8000000000000000ULL; // or in carry bit, and sticky bit. The
+                                     // latter is to prevent rounding from
+                                     // believing we are exact half way case
+        *exponent = *exponent + 1; // adjust exponent
     }
 
     *p = c;
 }
 
 // IEEE-754 round to nearest, ties to even rounding
-static float round_to_nearest_even_float( cl_ulong p, int exponent );
-static float round_to_nearest_even_float( cl_ulong p, int exponent )
+static float round_to_nearest_even_float(cl_ulong p, int exponent);
+static float round_to_nearest_even_float(cl_ulong p, int exponent)
 {
-    union{ cl_uint u; cl_float d;} u;
+    union {
+        cl_uint u;
+        cl_float d;
+    } u;
 
     // If mantissa is zero, return 0.0f
     if (p == 0) return 0.0f;
 
     // edges
-    if( exponent > 127 )
+    if (exponent > 127)
     {
-        volatile float r = exponent * CL_FLT_MAX;       // signal overflow
+        volatile float r = exponent * CL_FLT_MAX; // signal overflow
 
         // attempt to fool the compiler into not optimizing the above line away
-        if( r > CL_FLT_MAX )
-            return INFINITY;
+        if (r > CL_FLT_MAX) return INFINITY;
 
         return r;
     }
-    if( exponent == -150 && p > 0x8000000000000000ULL)
-        return HEX_FLT( +, 1, 0, -, 149 );
-    if( exponent <= -150 )       return 0.0f;
+    if (exponent == -150 && p > 0x8000000000000000ULL)
+        return HEX_FLT(+, 1, 0, -, 149);
+    if (exponent <= -150) return 0.0f;
 
-    //Figure out which bits go where
+    // Figure out which bits go where
     int shift = 8 + 32;
-    if( exponent < -126 )
+    if (exponent < -126)
     {
-        shift -= 126 + exponent;                    // subnormal: shift is not 52
-        exponent = -127;                            //            set exponent to 0
+        shift -= 126 + exponent; // subnormal: shift is not 52
+        exponent = -127; //            set exponent to 0
     }
     else
-        p &= 0x7fffffffffffffffULL;                 // normal: leading bit is implicit. Remove it.
+        p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
+                                    // it.
 
     // Assemble the double (round toward zero)
-    u.u = (cl_uint)(p >> shift) | ((cl_uint) (exponent + 127) << 23);
+    u.u = (cl_uint)(p >> shift) | ((cl_uint)(exponent + 127) << 23);
 
     // put a representation of the residual bits into hi
-    p <<= (64-shift);
+    p <<= (64 - shift);
 
-    //round to nearest, ties to even  based on the unused portion of p
-    if( p < 0x8000000000000000ULL )        return u.d;
-    if( p == 0x8000000000000000ULL )       u.u += u.u & 1U;
-    else                                   u.u++;
+    // round to nearest, ties to even  based on the unused portion of p
+    if (p < 0x8000000000000000ULL) return u.d;
+    if (p == 0x8000000000000000ULL)
+        u.u += u.u & 1U;
+    else
+        u.u++;
 
     return u.d;
 }
 
-static float round_to_nearest_even_float_ftz( cl_ulong p, int exponent );
-static float round_to_nearest_even_float_ftz( cl_ulong p, int exponent )
+static float round_to_nearest_even_float_ftz(cl_ulong p, int exponent);
+static float round_to_nearest_even_float_ftz(cl_ulong p, int exponent)
 {
     extern int gCheckTininessBeforeRounding;
 
-    union{ cl_uint u; cl_float d;} u;
+    union {
+        cl_uint u;
+        cl_float d;
+    } u;
     int shift = 8 + 32;
 
     // If mantissa is zero, return 0.0f
     if (p == 0) return 0.0f;
 
     // edges
-    if( exponent > 127 )
+    if (exponent > 127)
     {
-        volatile float r = exponent * CL_FLT_MAX;       // signal overflow
+        volatile float r = exponent * CL_FLT_MAX; // signal overflow
 
         // attempt to fool the compiler into not optimizing the above line away
-        if( r > CL_FLT_MAX )
-        return INFINITY;
+        if (r > CL_FLT_MAX) return INFINITY;
 
         return r;
     }
 
     // Deal with FTZ for gCheckTininessBeforeRounding
-    if( exponent < (gCheckTininessBeforeRounding - 127) )
-        return 0.0f;
+    if (exponent < (gCheckTininessBeforeRounding - 127)) return 0.0f;
 
-    if( exponent == -127 ) // only happens for machines that check tininess after rounding
-        p = (p&1) | (p>>1);
+    if (exponent
+        == -127) // only happens for machines that check tininess after rounding
+        p = (p & 1) | (p >> 1);
     else
-        p &= 0x7fffffffffffffffULL;     // normal: leading bit is implicit. Remove it.
+        p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
+                                    // it.
 
     cl_ulong q = p;
 
 
     // Assemble the double (round toward zero)
-    u.u = (cl_uint)(q >> shift) | ((cl_uint) (exponent + 127) << 23);
+    u.u = (cl_uint)(q >> shift) | ((cl_uint)(exponent + 127) << 23);
 
     // put a representation of the residual bits into hi
-    q <<= (64-shift);
+    q <<= (64 - shift);
 
-    //round to nearest, ties to even  based on the unused portion of p
-    if( q > 0x8000000000000000ULL )
+    // round to nearest, ties to even  based on the unused portion of p
+    if (q > 0x8000000000000000ULL)
         u.u++;
-    else if( q == 0x8000000000000000ULL )
+    else if (q == 0x8000000000000000ULL)
         u.u += u.u & 1U;
 
     // Deal with FTZ for ! gCheckTininessBeforeRounding
-    if( 0 == (u.u & 0x7f800000U )  )
-        return 0.0f;
+    if (0 == (u.u & 0x7f800000U)) return 0.0f;
 
     return u.d;
 }
 
 
 // IEEE-754 round toward zero.
-static float round_toward_zero_float( cl_ulong p, int exponent );
-static float round_toward_zero_float( cl_ulong p, int exponent )
+static float round_toward_zero_float(cl_ulong p, int exponent);
+static float round_toward_zero_float(cl_ulong p, int exponent)
 {
-    union{ cl_uint u; cl_float d;} u;
+    union {
+        cl_uint u;
+        cl_float d;
+    } u;
 
     // If mantissa is zero, return 0.0f
     if (p == 0) return 0.0f;
 
     // edges
-    if( exponent > 127 )
+    if (exponent > 127)
     {
-        volatile float r = exponent * CL_FLT_MAX;       // signal overflow
+        volatile float r = exponent * CL_FLT_MAX; // signal overflow
 
         // attempt to fool the compiler into not optimizing the above line away
-        if( r > CL_FLT_MAX )
-            return CL_FLT_MAX;
+        if (r > CL_FLT_MAX) return CL_FLT_MAX;
 
         return r;
     }
 
-    if( exponent <= -149 )
-        return 0.0f;
+    if (exponent <= -149) return 0.0f;
 
-    //Figure out which bits go where
+    // Figure out which bits go where
     int shift = 8 + 32;
-    if( exponent < -126 )
+    if (exponent < -126)
     {
-        shift -= 126 + exponent;                    // subnormal: shift is not 52
-        exponent = -127;                            //            set exponent to 0
+        shift -= 126 + exponent; // subnormal: shift grows beyond 40
+        exponent = -127; //            set biased exponent to 0
     }
     else
-        p &= 0x7fffffffffffffffULL;                 // normal: leading bit is implicit. Remove it.
+        p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
+                                    // it.
 
     // Assemble the double (round toward zero)
-    u.u = (cl_uint)(p >> shift) | ((cl_uint) (exponent + 127) << 23);
+    u.u = (cl_uint)(p >> shift) | ((cl_uint)(exponent + 127) << 23);
 
     return u.d;
 }
 
-static float round_toward_zero_float_ftz( cl_ulong p, int exponent );
-static float round_toward_zero_float_ftz( cl_ulong p, int exponent )
+static float round_toward_zero_float_ftz(cl_ulong p, int exponent);
+static float round_toward_zero_float_ftz(cl_ulong p, int exponent)
 {
     extern int gCheckTininessBeforeRounding;
 
-    union{ cl_uint u; cl_float d;} u;
+    union {
+        cl_uint u;
+        cl_float d;
+    } u;
     int shift = 8 + 32;
 
     // If mantissa is zero, return 0.0f
     if (p == 0) return 0.0f;
 
     // edges
-    if( exponent > 127 )
+    if (exponent > 127)
     {
-        volatile float r = exponent * CL_FLT_MAX;       // signal overflow
+        volatile float r = exponent * CL_FLT_MAX; // signal overflow
 
         // attempt to fool the compiler into not optimizing the above line away
-        if( r > CL_FLT_MAX )
-            return CL_FLT_MAX;
+        if (r > CL_FLT_MAX) return CL_FLT_MAX;
 
         return r;
     }
 
     // Deal with FTZ for gCheckTininessBeforeRounding
-    if( exponent < -126 )
-        return 0.0f;
+    if (exponent < -126) return 0.0f;
 
-    cl_ulong q = p &= 0x7fffffffffffffffULL;     // normal: leading bit is implicit. Remove it.
+    cl_ulong q = p &=
+        0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove it.
 
     // Assemble the double (round toward zero)
-    u.u = (cl_uint)(q >> shift) | ((cl_uint) (exponent + 127) << 23);
+    u.u = (cl_uint)(q >> shift) | ((cl_uint)(exponent + 127) << 23);
 
     // put a representation of the residual bits into hi
-    q <<= (64-shift);
+    q <<= (64 - shift);
 
     return u.d;
 }
 
 // Subtract two significands.
-static inline void sub64( cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC );
-static inline void sub64( cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC )
+static inline void sub64(cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC);
+static inline void sub64(cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC)
 {
     cl_ulong carry;
-    p = sub_carry( *c, p, &carry );
+    p = sub_carry(*c, p, &carry);
 
-    if( carry )
+    if (carry)
     {
         *signC ^= 0x80000000U;
         p = -p;
     }
 
     // normalize
-    if( p )
+    if (p)
     {
         int shift = 32;
         cl_ulong test = 1ULL << 32;
-        while( 0 == (p & 0x8000000000000000ULL))
+        while (0 == (p & 0x8000000000000000ULL))
         {
-            if( p < test )
+            if (p < test)
             {
                 p <<= shift;
                 *expC = *expC - shift;
@@ -460,49 +484,60 @@ static inline void sub64( cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC )
     {
         // zero result.
         *expC = -200;
-        *signC = 0;     // IEEE rules say a - a = +0 for all rounding modes except -inf
+        *signC =
+            0; // IEEE rules say a - a = +0 for all rounding modes except -inf
     }
 
     *c = p;
 }
 
 
-float reference_fma( float a, float b, float c, int shouldFlush )
+float reference_fma(float a, float b, float c, int shouldFlush)
 {
     static const cl_uint kMSB = 0x80000000U;
 
     // Make bits accessible
-    union{ cl_uint u; cl_float d; } ua; ua.d = a;
-    union{ cl_uint u; cl_float d; } ub; ub.d = b;
-    union{ cl_uint u; cl_float d; } uc; uc.d = c;
+    union {
+        cl_uint u;
+        cl_float d;
+    } ua;
+    ua.d = a;
+    union {
+        cl_uint u;
+        cl_float d;
+    } ub;
+    ub.d = b;
+    union {
+        cl_uint u;
+        cl_float d;
+    } uc;
+    uc.d = c;
 
     // deal with Nans, infinities and zeros
-    if( isnan( a ) || isnan( b ) || isnan(c)    ||
-        isinf( a ) || isinf( b ) || isinf(c)    ||
-        0 == ( ua.u & ~kMSB)                ||  // a == 0, defeat host FTZ behavior
-        0 == ( ub.u & ~kMSB)                ||  // b == 0, defeat host FTZ behavior
-        0 == ( uc.u & ~kMSB)                )   // c == 0, defeat host FTZ behavior
+    if (isnan(a) || isnan(b) || isnan(c) || isinf(a) || isinf(b) || isinf(c)
+        || 0 == (ua.u & ~kMSB) || // a == 0, defeat host FTZ behavior
+        0 == (ub.u & ~kMSB) || // b == 0, defeat host FTZ behavior
+        0 == (uc.u & ~kMSB)) // c == 0, defeat host FTZ behavior
     {
         FPU_mode_type oldMode;
         RoundingMode oldRoundMode = kRoundToNearestEven;
-        if( isinf( c ) && !isinf(a) && !isinf(b) )
-            return (c + a) + b;
+        if (isinf(c) && !isinf(a) && !isinf(b)) return (c + a) + b;
 
-        if (gIsInRTZMode)
-            oldRoundMode = set_round(kRoundTowardZero, kfloat);
+        if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
 
-        memset( &oldMode, 0, sizeof( oldMode ) );
-        if( shouldFlush )
-            ForceFTZ( &oldMode );
+        memset(&oldMode, 0, sizeof(oldMode));
+        if (shouldFlush) ForceFTZ(&oldMode);
 
-        a = (float) reference_multiply( a, b );    // some risk that the compiler will insert a non-compliant fma here on some platforms.
-        a = (float) reference_add( a, c );           // We use STDC FP_CONTRACT OFF above to attempt to defeat that.
+        a = (float)reference_multiply(
+            a, b); // some risk that the compiler will insert a non-compliant
+                   // fma here on some platforms.
+        a = (float)reference_add(
+            a,
+            c); // We use STDC FP_CONTRACT OFF above to attempt to defeat that.
 
-        if( shouldFlush )
-            RestoreFPState( &oldMode );
+        if (shouldFlush) RestoreFPState(&oldMode);
 
-        if( gIsInRTZMode )
-            set_round(oldRoundMode, kfloat);
+        if (gIsInRTZMode) set_round(oldRoundMode, kfloat);
         return a;
     }
 
@@ -510,67 +545,70 @@ float reference_fma( float a, float b, float c, int shouldFlush )
     //   exponent is a standard unbiased signed integer
     //   mantissa is a cl_uint, with leading non-zero bit positioned at the MSB
     cl_uint mantA, mantB, mantC;
-    int expA = extractf( a, &mantA );
-    int expB = extractf( b, &mantB );
-    int expC = extractf( c, &mantC );
-    cl_uint signC = uc.u & kMSB;                // We'll need the sign bit of C later to decide if we are adding or subtracting
+    int expA = extractf(a, &mantA);
+    int expB = extractf(b, &mantB);
+    int expC = extractf(c, &mantC);
+    cl_uint signC = uc.u & kMSB; // We'll need the sign bit of C later to decide
+                                 // if we are adding or subtracting
 
-// exact product of A and B
+    // exact product of A and B
     int exponent = expA + expB;
     cl_uint sign = (ua.u ^ ub.u) & kMSB;
-    cl_ulong product = (cl_ulong) mantA * (cl_ulong) mantB;
+    cl_ulong product = (cl_ulong)mantA * (cl_ulong)mantB;
 
     // renormalize -- 1.m * 1.n yields a number between 1.0 and 3.99999..
-    //  The MSB might not be set. If so, fix that. Otherwise, reflect the fact that we got another power of two from the multiplication
-    if( 0 == (0x8000000000000000ULL & product) )
+    //  The MSB might not be set. If so, fix that. Otherwise, reflect the fact
+    //  that we got another power of two from the multiplication
+    if (0 == (0x8000000000000000ULL & product))
         product <<= 1;
     else
-        exponent++;         // 2**31 * 2**31 gives 2**62. If the MSB was set, then our exponent increased.
+        exponent++; // 2**31 * 2**31 gives 2**62. If the MSB was set, then our
+                    // exponent increased.
 
-//infinite precision add
-    cl_ulong addend = (cl_ulong) mantC << 32;
-    if( exponent >= expC )
+    // infinite precision add
+    cl_ulong addend = (cl_ulong)mantC << 32;
+    if (exponent >= expC)
     {
         // Shift C relative to the product so that their exponents match
-        if( exponent > expC )
-            shift_right_sticky_64( &addend, exponent - expC );
+        if (exponent > expC) shift_right_sticky_64(&addend, exponent - expC);
 
         // Add
-        if( sign ^ signC )
-            sub64( &product, addend, &sign, &exponent );
+        if (sign ^ signC)
+            sub64(&product, addend, &sign, &exponent);
         else
-            add64( &product, addend, &exponent );
+            add64(&product, addend, &exponent);
     }
     else
     {
         // Shift the product relative to C so that their exponents match
-        shift_right_sticky_64( &product, expC - exponent );
+        shift_right_sticky_64(&product, expC - exponent);
 
         // add
-        if( sign ^ signC )
-            sub64( &addend, product, &signC, &expC );
+        if (sign ^ signC)
+            sub64(&addend, product, &signC, &expC);
         else
-            add64( &addend, product, &expC );
+            add64(&addend, product, &expC);
 
         product = addend;
         exponent = expC;
         sign = signC;
     }
 
-    // round to IEEE result -- we do not do flushing to zero here. That part is handled manually in ternary.c.
+    // round to IEEE result -- we do not do flushing to zero here. That part is
+    // handled manually in ternary.cpp.
     if (gIsInRTZMode)
     {
-        if( shouldFlush )
-            ua.d = round_toward_zero_float_ftz( product, exponent);
+        if (shouldFlush)
+            ua.d = round_toward_zero_float_ftz(product, exponent);
         else
-            ua.d = round_toward_zero_float( product, exponent);
+            ua.d = round_toward_zero_float(product, exponent);
     }
     else
     {
-        if( shouldFlush )
-            ua.d = round_to_nearest_even_float_ftz( product, exponent);
+        if (shouldFlush)
+            ua.d = round_to_nearest_even_float_ftz(product, exponent);
         else
-            ua.d = round_to_nearest_even_float( product, exponent);
+            ua.d = round_to_nearest_even_float(product, exponent);
     }
 
     // Set the sign
@@ -579,35 +617,36 @@ float reference_fma( float a, float b, float c, int shouldFlush )
     return ua.d;
 }
 
-double reference_relaxed_exp10( double x)
+double reference_relaxed_exp10(double x) { return reference_exp10(x); }
+
+double reference_exp10(double x)
 {
-  return reference_exp10(x);
+    return reference_exp2(x * HEX_DBL(+, 1, a934f0979a371, +, 1));
 }
 
-double reference_exp10( double x) {   return reference_exp2( x * HEX_DBL( +, 1, a934f0979a371, +, 1 ) );    }
 
-
-int   reference_ilogb( double x )
+int reference_ilogb(double x)
 {
     extern int gDeviceILogb0, gDeviceILogbNaN;
-    union { cl_double f; cl_ulong u;} u;
+    union {
+        cl_double f;
+        cl_ulong u;
+    } u;
 
-    u.f = (float) x;
-    cl_int exponent = (cl_int) (u.u >> 52) & 0x7ff;
-    if( exponent == 0x7ff )
+    u.f = (float)x;
+    cl_int exponent = (cl_int)(u.u >> 52) & 0x7ff;
+    if (exponent == 0x7ff)
     {
-        if( u.u & 0x000fffffffffffffULL )
-            return gDeviceILogbNaN;
+        if (u.u & 0x000fffffffffffffULL) return gDeviceILogbNaN;
 
         return CL_INT_MAX;
     }
 
-    if( exponent == 0 )
-    {   // deal with denormals
-        u.f = x * HEX_DBL( +, 1, 0, +, 64 );
-        exponent = (cl_int) (u.u >> 52) & 0x7ff;
-        if( exponent == 0 )
-            return gDeviceILogb0;
+    if (exponent == 0)
+    { // deal with denormals
+        u.f = x * HEX_DBL(+, 1, 0, +, 64);
+        exponent = (cl_int)(u.u >> 52) & 0x7ff;
+        if (exponent == 0) return gDeviceILogb0;
 
         return exponent - (1023 + 64);
     }
@@ -615,220 +654,208 @@ int   reference_ilogb( double x )
     return exponent - 1023;
 }
 
-double reference_nan( cl_uint x )
+double reference_nan(cl_uint x)
 {
-    union{ cl_uint u; cl_float f; }u;
+    union {
+        cl_uint u;
+        cl_float f;
+    } u;
     u.u = x | 0x7fc00000U;
-    return (double) u.f;
+    return (double)u.f;
 }
 
-double reference_maxmag( double x, double y )
+double reference_maxmag(double x, double y)
 {
     double fabsx = fabs(x);
     double fabsy = fabs(y);
 
-    if( fabsx < fabsy )
-        return y;
+    if (fabsx < fabsy) return y;
 
-    if( fabsy < fabsx )
-        return x;
+    if (fabsy < fabsx) return x;
 
-    return reference_fmax( x, y );
+    return reference_fmax(x, y);
 }
 
-double reference_minmag( double x, double y )
+double reference_minmag(double x, double y)
 {
     double fabsx = fabs(x);
     double fabsy = fabs(y);
 
-    if( fabsx > fabsy )
-        return y;
+    if (fabsx > fabsy) return y;
 
-    if( fabsy > fabsx )
-        return x;
+    if (fabsy > fabsx) return x;
 
-    return reference_fmin( x, y );
+    return reference_fmin(x, y);
 }
 
-//double my_nextafter( double x, double y ){  return (double) nextafterf( (float) x, (float) y ); }
+// double my_nextafter( double x, double y ){  return (double) nextafterf(
+// (float) x, (float) y ); }
 
-double reference_relaxed_mad( double a, double b, double c)
+double reference_relaxed_mad(double a, double b, double c)
 {
-  return ((float) a )* ((float) b) + (float) c;
+    return ((float)a) * ((float)b) + (float)c;
 }
 
-double reference_mad( double a, double b, double c )
-{
-    return a * b + c;
-}
+double reference_mad(double a, double b, double c) { return a * b + c; }
 
-double reference_recip( double x) {   return 1.0 / x; }
-double reference_rootn( double x, int i )
+double reference_recip(double x) { return 1.0 / x; }
+double reference_rootn(double x, int i)
 {
 
-    //rootn ( x, 0 )  returns a NaN.
-    if( 0 == i )
-        return cl_make_nan();
+    // rootn ( x, 0 )  returns a NaN.
+    if (0 == i) return cl_make_nan();
 
-    //rootn ( x, n )  returns a NaN for x < 0 and n is even.
-    if( x < 0 && 0 == (i&1) )
-        return cl_make_nan();
+    // rootn ( x, n )  returns a NaN for x < 0 and n is even.
+    if (x < 0 && 0 == (i & 1)) return cl_make_nan();
 
-    if( x == 0.0 )
+    if (x == 0.0)
     {
-        switch( i & 0x80000001 )
+        switch (i & 0x80000001)
         {
-            //rootn ( +-0,  n ) is +0 for even n > 0.
-            case 0:
-                return 0.0f;
+            // rootn ( +-0,  n ) is +0 for even n > 0.
+            case 0: return 0.0f;
 
-            //rootn ( +-0,  n ) is +-0 for odd n > 0.
-            case 1:
-                return x;
+            // rootn ( +-0,  n ) is +-0 for odd n > 0.
+            case 1: return x;
 
-            //rootn ( +-0,  n ) is +inf for even n < 0.
-            case 0x80000000:
-                return INFINITY;
+            // rootn ( +-0,  n ) is +inf for even n < 0.
+            case 0x80000000: return INFINITY;
 
-            //rootn ( +-0,  n ) is +-inf for odd n < 0.
-            case 0x80000001:
-                return copysign(INFINITY, x);
+            // rootn ( +-0,  n ) is +-inf for odd n < 0.
+            case 0x80000001: return copysign(INFINITY, x);
         }
     }
 
     double sign = x;
     x = reference_fabs(x);
-    x = reference_exp2( reference_log2(x) / (double) i );
-    return reference_copysignd( x, sign );
+    x = reference_exp2(reference_log2(x) / (double)i);
+    return reference_copysignd(x, sign);
 }
 
-double reference_rsqrt( double x) {   return 1.0 / reference_sqrt(x);   }
-//double reference_sincos( double x, double *c ){ *c = cos(x); return sin(x); }
-double reference_sinpi( double x)
+double reference_rsqrt(double x) { return 1.0 / reference_sqrt(x); }
+// double reference_sincos( double x, double *c ){ *c = cos(x); return sin(x); }
+double reference_sinpi(double x)
 {
     double r = reduce1(x);
 
     // reduce to [-0.5, 0.5]
-    if( r < -0.5 )
+    if (r < -0.5)
         r = -1 - r;
-    else if ( r > 0.5 )
+    else if (r > 0.5)
         r = 1 - r;
 
     // sinPi zeros have the same sign as x
-    if( r == 0.0 )
-        return reference_copysignd(0.0, x);
+    if (r == 0.0) return reference_copysignd(0.0, x);
 
-    return reference_sin( r * M_PI );
+    return reference_sin(r * M_PI);
 }
 
 double reference_relaxed_sinpi(double x) { return reference_sinpi(x); }
 
-double reference_tanpi( double x)
+double reference_tanpi(double x)
 {
     // set aside the sign  (allows us to preserve sign of -0)
-    double sign = reference_copysignd( 1.0, x);
+    double sign = reference_copysignd(1.0, x);
     double z = reference_fabs(x);
 
     // if big and even  -- caution: only works if x only has single precision
-    if( z >= HEX_DBL( +, 1, 0, +, 24 ) )
+    if (z >= HEX_DBL(+, 1, 0, +, 24))
     {
-        if( z == INFINITY )
-            return x - x;       // nan
+        if (z == INFINITY) return x - x; // nan
 
-        return reference_copysignd( 0.0, x);   // tanpi ( n ) is copysign( 0.0, n)  for even integers n.
+        return reference_copysignd(
+            0.0, x); // tanpi ( n ) is copysign( 0.0, n)  for even integers n.
     }
 
     // reduce to the range [ -0.5, 0.5 ]
-    double nearest = reference_rint( z );     // round to nearest even places n + 0.5 values in the right place for us
-    int i = (int) nearest;          // test above against 0x1.0p24 avoids overflow here
+    double nearest = reference_rint(z); // round to nearest even places n + 0.5
+                                        // values in the right place for us
+    int i = (int)nearest; // test above against 0x1.0p24 avoids overflow here
     z -= nearest;
 
-    //correction for odd integer x for the right sign of zero
-    if( (i&1) && z == 0.0 )
-        sign = -sign;
+    // correction for odd integer x for the right sign of zero
+    if ((i & 1) && z == 0.0) sign = -sign;
 
     // track changes to the sign
-    sign *= reference_copysignd(1.0, z);       // really should just be an xor
-    z = reference_fabs(z);                    // remove the sign again
+    sign *= reference_copysignd(1.0, z); // really should just be an xor
+    z = reference_fabs(z); // remove the sign again
 
     // reduce once more
-    // If we don't do this, rounding error in z * M_PI will cause us not to return infinities properly
-    if( z > 0.25 )
+    // If we don't do this, rounding error in z * M_PI will cause us not to
+    // return infinities properly
+    if (z > 0.25)
     {
         z = 0.5 - z;
-        return sign / reference_tan( z * M_PI );      // use system tan to get the right result
+        return sign
+            / reference_tan(z * M_PI); // use system tan to get the right result
     }
 
     //
-    return sign * reference_tan( z * M_PI );          // use system tan to get the right result
+    return sign
+        * reference_tan(z * M_PI); // use system tan to get the right result
 }
 
-double reference_pown( double x, int i) { return reference_pow( x, (double) i ); }
-double reference_powr( double x, double y )
+double reference_pown(double x, int i) { return reference_pow(x, (double)i); }
+double reference_powr(double x, double y)
 {
-    //powr ( x, y ) returns NaN for x < 0.
-    if( x < 0.0 )
-        return cl_make_nan();
+    // powr ( x, y ) returns NaN for x < 0.
+    if (x < 0.0) return cl_make_nan();
 
-    //powr ( x, NaN ) returns the NaN for x >= 0.
-    //powr ( NaN, y ) returns the NaN.
-    if( isnan(x) || isnan(y) )
-        return x + y;       // Note: behavior different here than for pow(1,NaN), pow(NaN, 0)
+    // powr ( x, NaN ) returns the NaN for x >= 0.
+    // powr ( NaN, y ) returns the NaN.
+    if (isnan(x) || isnan(y))
+        return x + y; // Note: behavior different here than for pow(1,NaN),
+                      // pow(NaN, 0)
 
-    if( x == 1.0 )
+    if (x == 1.0)
     {
-        //powr ( +1, +-inf ) returns NaN.
-        if( reference_fabs(y) == INFINITY )
-            return cl_make_nan();
+        // powr ( +1, +-inf ) returns NaN.
+        if (reference_fabs(y) == INFINITY) return cl_make_nan();
 
-        //powr ( +1, y ) is 1 for finite y.    (NaN handled above)
+        // powr ( +1, y ) is 1 for finite y.    (NaN handled above)
         return 1.0;
     }
 
-    if( y == 0.0 )
+    if (y == 0.0)
     {
-        //powr ( +inf, +-0 ) returns NaN.
-        //powr ( +-0, +-0 ) returns NaN.
-        if( x == 0.0 || x == INFINITY )
-            return cl_make_nan();
+        // powr ( +inf, +-0 ) returns NaN.
+        // powr ( +-0, +-0 ) returns NaN.
+        if (x == 0.0 || x == INFINITY) return cl_make_nan();
 
-        //powr ( x, +-0 ) is 1 for finite x > 0.  (x <= 0, NaN, INF already handled above)
+        // powr ( x, +-0 ) is 1 for finite x > 0.  (x <= 0, NaN, INF already
+        // handled above)
         return 1.0;
     }
 
-    if( x == 0.0 )
+    if (x == 0.0)
     {
-        //powr ( +-0, -inf) is +inf.
-        //powr ( +-0, y ) is +inf for finite y < 0.
-        if( y < 0.0 )
-            return INFINITY;
+        // powr ( +-0, -inf) is +inf.
+        // powr ( +-0, y ) is +inf for finite y < 0.
+        if (y < 0.0) return INFINITY;
 
-        //powr ( +-0, y ) is +0 for y > 0.    (NaN, y==0 handled above)
+        // powr ( +-0, y ) is +0 for y > 0.    (NaN, y==0 handled above)
         return 0.0;
     }
 
     // x = +inf
-    if( isinf(x) )
+    if (isinf(x))
     {
-        if( y < 0 )
-            return 0;
+        if (y < 0) return 0;
         return INFINITY;
     }
 
     double fabsx = reference_fabs(x);
     double fabsy = reference_fabs(y);
 
-    //y = +-inf cases
-    if( isinf(fabsy) )
+    // y = +-inf cases
+    if (isinf(fabsy))
     {
-        if( y < 0 )
+        if (y < 0)
         {
-            if( fabsx < 1 )
-                return INFINITY;
+            if (fabsx < 1) return INFINITY;
             return 0;
         }
-        if( fabsx < 1 )
-            return 0;
+        if (fabsx < 1) return 0;
         return INFINITY;
     }
 
@@ -840,169 +867,212 @@ double reference_powr( double x, double y )
     return result;
 }
 
-double reference_fract( double x, double *ip )
+double reference_fract(double x, double *ip)
 {
-    if(isnan(x)) {
+    if (isnan(x))
+    {
         *ip = cl_make_nan();
         return cl_make_nan();
     }
 
     float i;
-    float f = modff((float) x, &i );
-    if( f < 0.0 )
+    float f = modff((float)x, &i);
+    if (f < 0.0)
     {
         f = 1.0f + f;
         i -= 1.0f;
-        if( f == 1.0f )
-            f = HEX_FLT( +, 1, fffffe, -, 1 );
+        if (f == 1.0f) f = HEX_FLT(+, 1, fffffe, -, 1);
     }
     *ip = i;
     return f;
 }
 
 
-//double my_fdim( double x, double y){ return fdimf( (float) x, (float) y ); }
-double reference_add( double x, double y )
+// double my_fdim( double x, double y){ return fdimf( (float) x, (float) y ); }
+double reference_add(double x, double y)
 {
-    volatile float a = (float) x;
-    volatile float b = (float) y;
+    volatile float a = (float)x;
+    volatile float b = (float)y;
 
-#if defined( __SSE__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64)))
+#if defined(__SSE__)                                                           \
+    || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
     // defeat x87
-    __m128 va = _mm_set_ss( (float) a );
-    __m128 vb = _mm_set_ss( (float) b );
-    va = _mm_add_ss( va, vb );
-    _mm_store_ss( (float*) &a, va );
+    __m128 va = _mm_set_ss((float)a);
+    __m128 vb = _mm_set_ss((float)b);
+    va = _mm_add_ss(va, vb);
+    _mm_store_ss((float *)&a, va);
 #elif defined(__PPC__)
-    // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes denorm's to zero.
-    // As such, the reference add with FTZ must be emulated in sw.
-    if (fpu_control & _FPU_MASK_NI) {
-      union{ cl_uint u; cl_float d; } ua; ua.d = a;
-      union{ cl_uint u; cl_float d; } ub; ub.d = b;
-      cl_uint mantA, mantB;
-      cl_ulong addendA, addendB, sum;
-      int expA = extractf( a, &mantA );
-      int expB = extractf( b, &mantB );
-      cl_uint signA = ua.u & 0x80000000U;
-      cl_uint signB = ub.u & 0x80000000U;
+    // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes
+    // denorms to zero. As such, the reference add with FTZ must be emulated in
+    // software.
+    if (fpu_control & _FPU_MASK_NI)
+    {
+        union {
+            cl_uint u;
+            cl_float d;
+        } ua;
+        ua.d = a;
+        union {
+            cl_uint u;
+            cl_float d;
+        } ub;
+        ub.d = b;
+        cl_uint mantA, mantB;
+        cl_ulong addendA, addendB, sum;
+        int expA = extractf(a, &mantA);
+        int expB = extractf(b, &mantB);
+        cl_uint signA = ua.u & 0x80000000U;
+        cl_uint signB = ub.u & 0x80000000U;
 
-      // Force matching exponents if an operand is 0
-      if (a == 0.0f) {
-    expA = expB;
-      } else if (b == 0.0f) {
-    expB = expA;
-      }
+        // Force matching exponents if an operand is 0
+        if (a == 0.0f)
+        {
+            expA = expB;
+        }
+        else if (b == 0.0f)
+        {
+            expB = expA;
+        }
 
-      addendA = (cl_ulong)mantA << 32;
-      addendB = (cl_ulong)mantB << 32;
+        addendA = (cl_ulong)mantA << 32;
+        addendB = (cl_ulong)mantB << 32;
 
-      if (expA >= expB) {
-        // Shift B relative to the A so that their exponents match
-        if( expA > expB )
-      shift_right_sticky_64( &addendB, expA - expB );
+        if (expA >= expB)
+        {
+            // Shift B relative to the A so that their exponents match
+            if (expA > expB) shift_right_sticky_64(&addendB, expA - expB);
 
-        // add
-        if( signA ^ signB )
-      sub64( &addendA, addendB, &signA, &expA );
+            // add
+            if (signA ^ signB)
+                sub64(&addendA, addendB, &signA, &expA);
+            else
+                add64(&addendA, addendB, &expA);
+        }
         else
-      add64( &addendA, addendB, &expA );
-      } else  {
-        // Shift the A relative to B so that their exponents match
-        shift_right_sticky_64( &addendA, expB - expA );
+        {
+            // Shift the A relative to B so that their exponents match
+            shift_right_sticky_64(&addendA, expB - expA);
 
-        // add
-        if( signA ^ signB )
-      sub64( &addendB, addendA, &signB, &expB );
+            // add
+            if (signA ^ signB)
+                sub64(&addendB, addendA, &signB, &expB);
+            else
+                add64(&addendB, addendA, &expB);
+
+            addendA = addendB;
+            expA = expB;
+            signA = signB;
+        }
+
+        // round to IEEE result
+        if (gIsInRTZMode)
+        {
+            ua.d = round_toward_zero_float_ftz(addendA, expA);
+        }
         else
-      add64( &addendB, addendA, &expB );
-
-        addendA = addendB;
-        expA = expB;
-        signA = signB;
-      }
-
-      // round to IEEE result
-      if (gIsInRTZMode)    {
-    ua.d = round_toward_zero_float_ftz( addendA, expA );
-      } else {
-    ua.d = round_to_nearest_even_float_ftz( addendA, expA );
-      }
-      // Set the sign
-      ua.u |= signA;
-      a = ua.d;
-    } else {
-      a += b;
+        {
+            ua.d = round_to_nearest_even_float_ftz(addendA, expA);
+        }
+        // Set the sign
+        ua.u |= signA;
+        a = ua.d;
+    }
+    else
+    {
+        a += b;
     }
 #else
     a += b;
 #endif
-    return (double) a;
- }
+    return (double)a;
+}
 
 
-double reference_subtract( double x, double y )
+double reference_subtract(double x, double y)
 {
-    volatile float a = (float) x;
-    volatile float b = (float) y;
-#if defined( __SSE__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64)))
+    volatile float a = (float)x;
+    volatile float b = (float)y;
+#if defined(__SSE__)                                                           \
+    || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
     // defeat x87
-    __m128 va = _mm_set_ss( (float) a );
-    __m128 vb = _mm_set_ss( (float) b );
-    va = _mm_sub_ss( va, vb );
-    _mm_store_ss( (float*) &a, va );
+    __m128 va = _mm_set_ss((float)a);
+    __m128 vb = _mm_set_ss((float)b);
+    va = _mm_sub_ss(va, vb);
+    _mm_store_ss((float *)&a, va);
 #else
     a -= b;
 #endif
     return a;
 }
 
-//double reference_divide( double x, double y ){ return (float) x / (float) y; }
-double reference_multiply( double x, double y)
+// double reference_divide( double x, double y ){ return (float) x / (float) y;
+// }
+double reference_multiply(double x, double y)
 {
-    volatile float a = (float) x;
-    volatile float b = (float) y;
-#if defined( __SSE__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64)))
+    volatile float a = (float)x;
+    volatile float b = (float)y;
+#if defined(__SSE__)                                                           \
+    || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
     // defeat x87
-    __m128 va = _mm_set_ss( (float) a );
-    __m128 vb = _mm_set_ss( (float) b );
-    va = _mm_mul_ss( va, vb );
-    _mm_store_ss( (float*) &a, va );
+    __m128 va = _mm_set_ss((float)a);
+    __m128 vb = _mm_set_ss((float)b);
+    va = _mm_mul_ss(va, vb);
+    _mm_store_ss((float *)&a, va);
 #elif defined(__PPC__)
-    // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes denorm's to zero.
-    // As such, the reference multiply with FTZ must be emulated in sw.
-    if (fpu_control & _FPU_MASK_NI) {
-      // extract exponent and mantissa
-      //   exponent is a standard unbiased signed integer
-      //   mantissa is a cl_uint, with leading non-zero bit positioned at the MSB
-      union{ cl_uint u; cl_float d; } ua; ua.d = a;
-      union{ cl_uint u; cl_float d; } ub; ub.d = b;
-      cl_uint mantA, mantB;
-      int expA = extractf( a, &mantA );
-      int expB = extractf( b, &mantB );
+    // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes
+    // denorms to zero. As such, the reference multiply with FTZ must be
+    // emulated in software.
+    if (fpu_control & _FPU_MASK_NI)
+    {
+        // extract exponent and mantissa
+        //   exponent is a standard unbiased signed integer
+        //   mantissa is a cl_uint, with leading non-zero bit positioned at the
+        //   MSB
+        union {
+            cl_uint u;
+            cl_float d;
+        } ua;
+        ua.d = a;
+        union {
+            cl_uint u;
+            cl_float d;
+        } ub;
+        ub.d = b;
+        cl_uint mantA, mantB;
+        int expA = extractf(a, &mantA);
+        int expB = extractf(b, &mantB);
 
-      // exact product of A and B
-      int exponent = expA + expB;
-      cl_uint sign = (ua.u ^ ub.u) & 0x80000000U;
-      cl_ulong product = (cl_ulong) mantA * (cl_ulong) mantB;
+        // exact product of A and B
+        int exponent = expA + expB;
+        cl_uint sign = (ua.u ^ ub.u) & 0x80000000U;
+        cl_ulong product = (cl_ulong)mantA * (cl_ulong)mantB;
 
-      // renormalize -- 1.m * 1.n yields a number between 1.0 and 3.99999..
-      //  The MSB might not be set. If so, fix that. Otherwise, reflect the fact that we got another power of two from the multiplication
-      if( 0 == (0x8000000000000000ULL & product) )
-        product <<= 1;
-      else
-        exponent++;         // 2**31 * 2**31 gives 2**62. If the MSB was set, then our exponent increased.
+        // renormalize -- 1.m * 1.n yields a number between 1.0 and 3.99999..
+        //  The MSB might not be set. If so, fix that. Otherwise, reflect the
+        //  fact that we got another power of two from the multiplication
+        if (0 == (0x8000000000000000ULL & product))
+            product <<= 1;
+        else
+            exponent++; // 2**31 * 2**31 gives 2**62. If the MSB was set, then
+                        // our exponent increased.
 
-      // round to IEEE result -- we do not do flushing to zero here. That part is handled manually in ternary.c.
-      if (gIsInRTZMode)    {
-    ua.d = round_toward_zero_float_ftz( product, exponent);
-      } else {
-    ua.d = round_to_nearest_even_float_ftz( product, exponent);
-      }
-      // Set the sign
-      ua.u |= sign;
-      a = ua.d;
-    } else {
-      a *= b;
+        // round to IEEE result -- we do not do flushing to zero here. That part
+        // is handled manually in ternary.cpp.
+        if (gIsInRTZMode)
+        {
+            ua.d = round_toward_zero_float_ftz(product, exponent);
+        }
+        else
+        {
+            ua.d = round_to_nearest_even_float_ftz(product, exponent);
+        }
+        // Set the sign
+        ua.u |= sign;
+        a = ua.d;
+    }
+    else
+    {
+        a *= b;
     }
 #else
     a *= b;
@@ -1022,7 +1092,7 @@ double reference_multiply( double x, double y)
 
     return (double) remquof( (float) x, (float) y, iptr );
 }*/
-double reference_lgamma_r( double x, int *signp )
+double reference_lgamma_r(double x, int *signp)
 {
     // This is not currently tested
     *signp = 0;
@@ -1030,81 +1100,93 @@ double reference_lgamma_r( double x, int *signp )
 }
 
 
-int reference_isequal( double x, double y ){ return x == y; }
-int reference_isfinite( double x ){ return 0 != isfinite(x); }
-int reference_isgreater( double x, double y ){ return x > y; }
-int reference_isgreaterequal( double x, double y ){ return x >= y; }
-int reference_isinf( double x ){ return 0 != isinf(x); }
-int reference_isless( double x, double y ){ return x < y; }
-int reference_islessequal( double x, double y ){ return x <= y; }
-int reference_islessgreater( double x, double y ){  return 0 != islessgreater( x, y ); }
-int reference_isnan( double x ){ return 0 != isnan( x ); }
-int reference_isnormal( double x ){ return 0 != isnormal( (float) x ); }
-int reference_isnotequal( double x, double y ){ return x != y; }
-int reference_isordered( double x, double y){ return x == x && y == y; }
-int reference_isunordered( double x, double y ){ return isnan(x) || isnan( y ); }
-int reference_signbit( float x ){ return 0 != signbit( x ); }
+int reference_isequal(double x, double y) { return x == y; }
+int reference_isfinite(double x) { return 0 != isfinite(x); }
+int reference_isgreater(double x, double y) { return x > y; }
+int reference_isgreaterequal(double x, double y) { return x >= y; }
+int reference_isinf(double x) { return 0 != isinf(x); }
+int reference_isless(double x, double y) { return x < y; }
+int reference_islessequal(double x, double y) { return x <= y; }
+int reference_islessgreater(double x, double y)
+{
+    return 0 != islessgreater(x, y);
+}
+int reference_isnan(double x) { return 0 != isnan(x); }
+int reference_isnormal(double x) { return 0 != isnormal((float)x); }
+int reference_isnotequal(double x, double y) { return x != y; }
+int reference_isordered(double x, double y) { return x == x && y == y; }
+int reference_isunordered(double x, double y) { return isnan(x) || isnan(y); }
+int reference_signbit(float x) { return 0 != signbit(x); }
 
 #if 1 // defined( _MSC_VER )
 
-//Missing functions for win32
+// Missing functions for win32
 
 
-float reference_copysign( float x, float y )
+float reference_copysign(float x, float y)
 {
-    union { float f; cl_uint u;} ux, uy;
-    ux.f = x; uy.f = y;
+    union {
+        float f;
+        cl_uint u;
+    } ux, uy;
+    ux.f = x;
+    uy.f = y;
     ux.u &= 0x7fffffffU;
     ux.u |= uy.u & 0x80000000U;
     return ux.f;
 }
 
 
-double reference_copysignd( double x, double y )
+double reference_copysignd(double x, double y)
 {
-    union { double f; cl_ulong u;} ux, uy;
-    ux.f = x; uy.f = y;
+    union {
+        double f;
+        cl_ulong u;
+    } ux, uy;
+    ux.f = x;
+    uy.f = y;
     ux.u &= 0x7fffffffffffffffULL;
     ux.u |= uy.u & 0x8000000000000000ULL;
     return ux.f;
 }
 
 
-double reference_round( double x )
+double reference_round(double x)
 {
     double absx = reference_fabs(x);
-    if( absx < 0.5 )
-        return reference_copysignd( 0.0, x );
+    if (absx < 0.5) return reference_copysignd(0.0, x);
 
-    if( absx < HEX_DBL( +, 1, 0, +, 53 ) )
-        x = reference_trunc( x + reference_copysignd( 0.5, x ) );
+    if (absx < HEX_DBL(+, 1, 0, +, 53))
+        x = reference_trunc(x + reference_copysignd(0.5, x));
 
     return x;
 }
 
-double reference_trunc( double x )
+double reference_trunc(double x)
 {
-    if( fabs(x) < HEX_DBL( +, 1, 0, +, 53 ) )
+    if (fabs(x) < HEX_DBL(+, 1, 0, +, 53))
     {
-        cl_long l = (cl_long) x;
+        cl_long l = (cl_long)x;
 
-        return reference_copysignd( (double) l, x );
+        return reference_copysignd((double)l, x);
     }
 
     return x;
 }
 
 #ifndef FP_ILOGB0
-    #define FP_ILOGB0   INT_MIN
+#define FP_ILOGB0 INT_MIN
 #endif
 
 #ifndef FP_ILOGBNAN
-    #define FP_ILOGBNAN   INT_MAX
+#define FP_ILOGBNAN INT_MAX
 #endif
 
 
-
-double reference_cbrt(double x){ return reference_copysignd( reference_pow( reference_fabs(x), 1.0/3.0 ), x ); }
+double reference_cbrt(double x)
+{
+    return reference_copysignd(reference_pow(reference_fabs(x), 1.0 / 3.0), x);
+}
 
 /*
 double reference_scalbn(double x, int i)
@@ -1122,174 +1204,188 @@ double reference_scalbn(double x, int i)
 }
 */
 
-double reference_rint( double x )
+double reference_rint(double x)
 {
-    if( reference_fabs(x) < HEX_DBL( +, 1, 0, +, 52 )  )
+    if (reference_fabs(x) < HEX_DBL(+, 1, 0, +, 52))
     {
-        double magic = reference_copysignd( HEX_DBL( +, 1, 0, +, 52 ), x );
+        double magic = reference_copysignd(HEX_DBL(+, 1, 0, +, 52), x);
         double rounded = (x + magic) - magic;
-        x = reference_copysignd( rounded, x );
+        x = reference_copysignd(rounded, x);
     }
 
     return x;
 }
 
-double reference_acosh( double x )
+double reference_acosh(double x)
 { // not full precision. Sufficient precision to cover float
-    if( isnan(x) )
-        return x + x;
+    if (isnan(x)) return x + x;
 
-    if( x < 1.0 )
-        return cl_make_nan();
+    if (x < 1.0) return cl_make_nan();
 
-    return reference_log( x + reference_sqrt(x + 1) * reference_sqrt(x-1) );
+    return reference_log(x + reference_sqrt(x + 1) * reference_sqrt(x - 1));
 }
 
-double reference_asinh( double x )
+double reference_asinh(double x)
 {
-/*
- * ====================================================
- * This function is from fdlibm: http://www.netlib.org
- *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-    if( isnan(x) || isinf(x) )
-        return x + x;
+    /*
+     * ====================================================
+     * This function is from fdlibm: http://www.netlib.org
+     *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+     *
+     * Developed at SunSoft, a Sun Microsystems, Inc. business.
+     * Permission to use, copy, modify, and distribute this
+     * software is freely granted, provided that this notice
+     * is preserved.
+     * ====================================================
+     */
+    if (isnan(x) || isinf(x)) return x + x;
 
     double absx = reference_fabs(x);
-    if( absx < HEX_DBL( +, 1, 0, -, 28 ) )
-        return x;
+    if (absx < HEX_DBL(+, 1, 0, -, 28)) return x;
 
     double sign = reference_copysignd(1.0, x);
 
-    if( absx > HEX_DBL( +, 1, 0, +, 28 ) )
-        return sign * (reference_log( absx ) + 0.693147180559945309417232121458176568);    // log(2)
+    if (absx > HEX_DBL(+, 1, 0, +, 28))
+        return sign
+            * (reference_log(absx)
+               + 0.693147180559945309417232121458176568); // log(2)
 
-    if( absx > 2.0 )
-        return sign * reference_log( 2.0 * absx + 1.0 / (reference_sqrt( x * x + 1.0 ) + absx));
+    if (absx > 2.0)
+        return sign
+            * reference_log(2.0 * absx
+                            + 1.0 / (reference_sqrt(x * x + 1.0) + absx));
 
-    return sign * reference_log1p( absx + x*x / (1.0 + reference_sqrt(1.0 + x*x)));
+    return sign
+        * reference_log1p(absx + x * x / (1.0 + reference_sqrt(1.0 + x * x)));
 }
 
 
-double reference_atanh( double x )
+double reference_atanh(double x)
 {
-/*
- * ====================================================
- * This function is from fdlibm: http://www.netlib.org
- *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-    if( isnan(x)  )
-        return x + x;
+    /*
+     * ====================================================
+     * This function is from fdlibm: http://www.netlib.org
+     *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+     *
+     * Developed at SunSoft, a Sun Microsystems, Inc. business.
+     * Permission to use, copy, modify, and distribute this
+     * software is freely granted, provided that this notice
+     * is preserved.
+     * ====================================================
+     */
+    if (isnan(x)) return x + x;
 
-    double signed_half = reference_copysignd( 0.5, x );
+    double signed_half = reference_copysignd(0.5, x);
     x = reference_fabs(x);
-    if( x > 1.0 )
-        return cl_make_nan();
+    if (x > 1.0) return cl_make_nan();
 
-    if( x < 0.5 )
-        return signed_half * reference_log1p( 2.0 * ( x + x*x / (1-x) ) );
+    if (x < 0.5)
+        return signed_half * reference_log1p(2.0 * (x + x * x / (1 - x)));
 
-    return signed_half * reference_log1p(2.0 * x / (1-x));
+    return signed_half * reference_log1p(2.0 * x / (1 - x));
 }
 
 double reference_relaxed_atan(double x) { return reference_atan(x); }
 
-double reference_relaxed_exp2( double x )
-{
-  return reference_exp2(x);
-}
+double reference_relaxed_exp2(double x) { return reference_exp2(x); }
 
-double reference_exp2( double x )
-{ // Note: only suitable for verifying single precision. Doesn't have range of a full double exp2 implementation.
-    if( x == 0.0 )
-        return 1.0;
+double reference_exp2(double x)
+{ // Note: only suitable for verifying single precision. Doesn't have range of a
+  // full double exp2 implementation.
+    if (x == 0.0) return 1.0;
 
     // separate x into fractional and integer parts
-    double i = reference_rint( x );        // round to nearest integer
+    double i = reference_rint(x); // round to nearest integer
 
-    if( i < -150 )
-        return 0.0;
+    if (i < -150) return 0.0;
 
-    if( i > 129 )
-        return INFINITY;
+    if (i > 129) return INFINITY;
 
-    double f = x - i;            // -0.5 <= f <= 0.5
+    double f = x - i; // -0.5 <= f <= 0.5
 
     // find exp2(f)
     // calculate as p(f) = (exp2(f)-1)/f
     //              exp2(f) = f * p(f) + 1
     // p(f) is a minimax polynomial with error within 0x1.c1fd80f0d1ab7p-50
 
-    double p = 0.693147180560184539289 +
-               (0.240226506955902863183 +
-               (0.055504108656833424373 +
-               (0.009618129212846484796 +
-               (0.001333355902958566035 +
-               (0.000154034191902497930 +
-               (0.000015252317761038105 +
-               (0.000001326283129417092 + 0.000000102593187638680 * f)*f)*f)*f)*f)*f)*f)*f;
+    double p = 0.693147180560184539289
+        + (0.240226506955902863183
+           + (0.055504108656833424373
+              + (0.009618129212846484796
+                 + (0.001333355902958566035
+                    + (0.000154034191902497930
+                       + (0.000015252317761038105
+                          + (0.000001326283129417092
+                             + 0.000000102593187638680 * f)
+                              * f)
+                           * f)
+                        * f)
+                     * f)
+                  * f)
+               * f)
+            * f;
     f *= p;
     f += 1.0;
 
     // scale by 2 ** i
-    union{ cl_ulong u; double d; } u;
-    int exponent = (int) i + 1023;
-    u.u = (cl_ulong) exponent << 52;
+    union {
+        cl_ulong u;
+        double d;
+    } u;
+    int exponent = (int)i + 1023;
+    u.u = (cl_ulong)exponent << 52;
 
     return f * u.d;
 }
 
 
-double reference_expm1( double x )
-{ // Note: only suitable for verifying single precision. Doesn't have range of a full double expm1 implementation. It is only accurate to 47 bits or less.
+double reference_expm1(double x)
+{ // Note: only suitable for verifying single precision. Doesn't have range of a
+  // full double expm1 implementation. It is only accurate to 47 bits or less.
 
     // early out for small numbers and NaNs
-    if( ! (reference_fabs(x) > HEX_DBL( +, 1, 0, -, 24 )) )
-        return x;
+    if (!(reference_fabs(x) > HEX_DBL(+, 1, 0, -, 24))) return x;
 
     // early out for large negative numbers
-    if( x < -130.0 )
-        return -1.0;
+    if (x < -130.0) return -1.0;
 
     // early out for large positive numbers
-    if( x > 100.0 )
-        return INFINITY;
+    if (x > 100.0) return INFINITY;
 
     // separate x into fractional and integer parts
-    double i = reference_rint( x );        // round to nearest integer
-    double f = x - i;            // -0.5 <= f <= 0.5
+    double i = reference_rint(x); // round to nearest integer
+    double f = x - i; // -0.5 <= f <= 0.5
 
     // reduce f to the range -0.0625 .. f.. 0.0625
-    int index = (int) (f * 16.0) + 8;       // 0...16
+    int index = (int)(f * 16.0) + 8; // 0...16
 
-    static const double reduction[17] = { -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625,
-                                           0.0,
-                                          +0.0625, +0.125, +0.1875, +0.25, +0.3125, +0.375, +0.4375, +0.5  };
+    static const double reduction[17] = { -0.5,  -0.4375, -0.375, -0.3125,
+                                          -0.25, -0.1875, -0.125, -0.0625,
+                                          0.0,   +0.0625, +0.125, +0.1875,
+                                          +0.25, +0.3125, +0.375, +0.4375,
+                                          +0.5 };
 
 
     // exponentials[i] = expm1(reduction[i])
-    static const double exponentials[17] = {    HEX_DBL( -, 1, 92e9a0720d3ec, -, 2 ),    HEX_DBL( -, 1, 6adb1cd9205ee, -, 2 ),
-                                                HEX_DBL( -, 1, 40373d42ce2e3, -, 2 ),    HEX_DBL( -, 1, 12d35a41ba104, -, 2 ),
-                                                HEX_DBL( -, 1, c5041854df7d4, -, 3 ),    HEX_DBL( -, 1, 5e25fb4fde211, -, 3 ),
-                                                HEX_DBL( -, 1, e14aed893eef4, -, 4 ),    HEX_DBL( -, 1, f0540438fd5c3, -, 5 ),
-                                                HEX_DBL( +, 0, 0,             +, 0 ),
-                                                HEX_DBL( +, 1, 082b577d34ed8, -, 4 ),    HEX_DBL( +, 1, 10b022db7ae68, -, 3 ),
-                                                HEX_DBL( +, 1, a65c0b85ac1a9, -, 3 ),    HEX_DBL( +, 1, 22d78f0fa061a, -, 2 ),
-                                                HEX_DBL( +, 1, 77a45d8117fd5, -, 2 ),    HEX_DBL( +, 1, d1e944f6fbdaa, -, 2 ),
-                                                HEX_DBL( +, 1, 190048ef6002,  -, 1 ),    HEX_DBL( +, 1, 4c2531c3c0d38, -, 1 ),
-                                            };
+    static const double exponentials[17] = {
+        HEX_DBL(-, 1, 92e9a0720d3ec, -, 2),
+        HEX_DBL(-, 1, 6adb1cd9205ee, -, 2),
+        HEX_DBL(-, 1, 40373d42ce2e3, -, 2),
+        HEX_DBL(-, 1, 12d35a41ba104, -, 2),
+        HEX_DBL(-, 1, c5041854df7d4, -, 3),
+        HEX_DBL(-, 1, 5e25fb4fde211, -, 3),
+        HEX_DBL(-, 1, e14aed893eef4, -, 4),
+        HEX_DBL(-, 1, f0540438fd5c3, -, 5),
+        HEX_DBL(+, 0, 0, +, 0),
+        HEX_DBL(+, 1, 082b577d34ed8, -, 4),
+        HEX_DBL(+, 1, 10b022db7ae68, -, 3),
+        HEX_DBL(+, 1, a65c0b85ac1a9, -, 3),
+        HEX_DBL(+, 1, 22d78f0fa061a, -, 2),
+        HEX_DBL(+, 1, 77a45d8117fd5, -, 2),
+        HEX_DBL(+, 1, d1e944f6fbdaa, -, 2),
+        HEX_DBL(+, 1, 190048ef6002, -, 1),
+        HEX_DBL(+, 1, 4c2531c3c0d38, -, 1),
+    };
 
 
     f -= reduction[index];
@@ -1297,223 +1393,368 @@ double reference_expm1( double x )
     // find expm1(f)
     // calculate as p(f) = (exp(f)-1)/f
     //              expm1(f) = f * p(f)
-    // p(f) is a minimax polynomial with error within 0x1.1d7693618d001p-48 over the range +- 0.0625
-    double p = 0.999999999999998001599 +
-               (0.499999999999839628284 +
-               (0.166666666672817459505 +
-               (0.041666666612283048687 +
-               (0.008333330214567431435 +
-               (0.001389005319303770070 + 0.000198833381525156667 * f)*f)*f)*f)*f)*f;
+    // p(f) is a minimax polynomial with error within 0x1.1d7693618d001p-48 over
+    // the range +- 0.0625
+    double p = 0.999999999999998001599
+        + (0.499999999999839628284
+           + (0.166666666672817459505
+              + (0.041666666612283048687
+                 + (0.008333330214567431435
+                    + (0.001389005319303770070 + 0.000198833381525156667 * f)
+                        * f)
+                     * f)
+                  * f)
+               * f)
+            * f;
     f *= p; // expm1( reduced f )
 
     // expm1(f) = (exmp1( reduced_f) + 1.0) * ( exponentials[index] + 1 ) - 1
-    //          =  exmp1( reduced_f) * exponentials[index] + exmp1( reduced_f) + exponentials[index] + 1 -1
-    //          =  exmp1( reduced_f) * exponentials[index] + exmp1( reduced_f) + exponentials[index]
-    f +=  exponentials[index] + f * exponentials[index];
+    //          =  r * E + r + E + 1 - 1
+    //          =  r * E + r + E
+    // where r = expm1(reduced_f) and E = exponentials[index]
+    f += exponentials[index] + f * exponentials[index];
 
     // scale by e ** i
-    int exponent = (int) i;
-    if( 0 == exponent )
-        return f;       // precise answer for x near 1
+    int exponent = (int)i;
+    if (0 == exponent) return f; // precise answer for x near 1
 
     // table of e**(i-150)
-    static const double exp_table[128+150+1] =
-    {
-        HEX_DBL( +, 1, 82e16284f5ec5, -, 217 ),    HEX_DBL( +, 1, 06e9996332ba1, -, 215 ),
-        HEX_DBL( +, 1, 6555cb289e44b, -, 214 ),    HEX_DBL( +, 1, e5ab364643354, -, 213 ),
-        HEX_DBL( +, 1, 4a0bd18e64df7, -, 211 ),    HEX_DBL( +, 1, c094499cc578e, -, 210 ),
-        HEX_DBL( +, 1, 30d759323998c, -, 208 ),    HEX_DBL( +, 1, 9e5278ab1d4cf, -, 207 ),
-        HEX_DBL( +, 1, 198fa3f30be25, -, 205 ),    HEX_DBL( +, 1, 7eae636d6144e, -, 204 ),
-        HEX_DBL( +, 1, 040f1036f4863, -, 202 ),    HEX_DBL( +, 1, 6174e477a895f, -, 201 ),
-        HEX_DBL( +, 1, e065b82dd95a,  -, 200 ),    HEX_DBL( +, 1, 4676be491d129, -, 198 ),
-        HEX_DBL( +, 1, bbb5da5f7c823, -, 197 ),    HEX_DBL( +, 1, 2d884eef5fdcb, -, 195 ),
-        HEX_DBL( +, 1, 99d3397ab8371, -, 194 ),    HEX_DBL( +, 1, 1681497ed15b3, -, 192 ),
-        HEX_DBL( +, 1, 7a870f597fdbd, -, 191 ),    HEX_DBL( +, 1, 013c74edba307, -, 189 ),
-        HEX_DBL( +, 1, 5d9ec4ada7938, -, 188 ),    HEX_DBL( +, 1, db2edfd20fa7c, -, 187 ),
-        HEX_DBL( +, 1, 42eb9f39afb0b, -, 185 ),    HEX_DBL( +, 1, b6e4f282b43f4, -, 184 ),
-        HEX_DBL( +, 1, 2a42764857b19, -, 182 ),    HEX_DBL( +, 1, 9560792d19314, -, 181 ),
-        HEX_DBL( +, 1, 137b6ce8e052c, -, 179 ),    HEX_DBL( +, 1, 766b45dd84f18, -, 178 ),
-        HEX_DBL( +, 1, fce362fe6e7d,  -, 177 ),    HEX_DBL( +, 1, 59d34dd8a5473, -, 175 ),
-        HEX_DBL( +, 1, d606847fc727a, -, 174 ),    HEX_DBL( +, 1, 3f6a58b795de3, -, 172 ),
-        HEX_DBL( +, 1, b2216c6efdac1, -, 171 ),    HEX_DBL( +, 1, 2705b5b153fb8, -, 169 ),
-        HEX_DBL( +, 1, 90fa1509bd50d, -, 168 ),    HEX_DBL( +, 1, 107df698da211, -, 166 ),
-        HEX_DBL( +, 1, 725ae6e7b9d35, -, 165 ),    HEX_DBL( +, 1, f75d6040aeff6, -, 164 ),
-        HEX_DBL( +, 1, 56126259e093c, -, 162 ),    HEX_DBL( +, 1, d0ec7df4f7bd4, -, 161 ),
-        HEX_DBL( +, 1, 3bf2cf6722e46, -, 159 ),    HEX_DBL( +, 1, ad6b22f55db42, -, 158 ),
-        HEX_DBL( +, 1, 23d1f3e5834a,  -, 156 ),    HEX_DBL( +, 1, 8c9feab89b876, -, 155 ),
-        HEX_DBL( +, 1, 0d88cf37f00dd, -, 153 ),    HEX_DBL( +, 1, 6e55d2bf838a7, -, 152 ),
-        HEX_DBL( +, 1, f1e6b68529e33, -, 151 ),    HEX_DBL( +, 1, 525be4e4e601d, -, 149 ),
-        HEX_DBL( +, 1, cbe0a45f75eb1, -, 148 ),    HEX_DBL( +, 1, 3884e838aea68, -, 146 ),
-        HEX_DBL( +, 1, a8c1f14e2af5d, -, 145 ),    HEX_DBL( +, 1, 20a717e64a9bd, -, 143 ),
-        HEX_DBL( +, 1, 8851d84118908, -, 142 ),    HEX_DBL( +, 1, 0a9bdfb02d24,  -, 140 ),
-        HEX_DBL( +, 1, 6a5bea046b42e, -, 139 ),    HEX_DBL( +, 1, ec7f3b269efa8, -, 138 ),
-        HEX_DBL( +, 1, 4eafb87eab0f2, -, 136 ),    HEX_DBL( +, 1, c6e2d05bbc,    -, 135 ),
-        HEX_DBL( +, 1, 35208867c2683, -, 133 ),    HEX_DBL( +, 1, a425b317eeacd, -, 132 ),
-        HEX_DBL( +, 1, 1d8508fa8246a, -, 130 ),    HEX_DBL( +, 1, 840fbc08fdc8a, -, 129 ),
-        HEX_DBL( +, 1, 07b7112bc1ffe, -, 127 ),    HEX_DBL( +, 1, 666d0dad2961d, -, 126 ),
-        HEX_DBL( +, 1, e726c3f64d0fe, -, 125 ),    HEX_DBL( +, 1, 4b0dc07cabf98, -, 123 ),
-        HEX_DBL( +, 1, c1f2daf3b6a46, -, 122 ),    HEX_DBL( +, 1, 31c5957a47de2, -, 120 ),
-        HEX_DBL( +, 1, 9f96445648b9f, -, 119 ),    HEX_DBL( +, 1, 1a6baeadb4fd1, -, 117 ),
-        HEX_DBL( +, 1, 7fd974d372e45, -, 116 ),    HEX_DBL( +, 1, 04da4d1452919, -, 114 ),
-        HEX_DBL( +, 1, 62891f06b345,  -, 113 ),    HEX_DBL( +, 1, e1dd273aa8a4a, -, 112 ),
-        HEX_DBL( +, 1, 4775e0840bfdd, -, 110 ),    HEX_DBL( +, 1, bd109d9d94bda, -, 109 ),
-        HEX_DBL( +, 1, 2e73f53fba844, -, 107 ),    HEX_DBL( +, 1, 9b138170d6bfe, -, 106 ),
-        HEX_DBL( +, 1, 175af0cf60ec5, -, 104 ),    HEX_DBL( +, 1, 7baee1bffa80b, -, 103 ),
-        HEX_DBL( +, 1, 02057d1245ceb, -, 101 ),    HEX_DBL( +, 1, 5eafffb34ba31, -, 100 ),
-        HEX_DBL( +, 1, dca23bae16424, -, 99 ),    HEX_DBL( +, 1, 43e7fc88b8056, -, 97 ),
-        HEX_DBL( +, 1, b83bf23a9a9eb, -, 96 ),    HEX_DBL( +, 1, 2b2b8dd05b318, -, 94 ),
-        HEX_DBL( +, 1, 969d47321e4cc, -, 93 ),    HEX_DBL( +, 1, 1452b7723aed2, -, 91 ),
-        HEX_DBL( +, 1, 778fe2497184c, -, 90 ),    HEX_DBL( +, 1, fe7116182e9cc, -, 89 ),
-        HEX_DBL( +, 1, 5ae191a99585a, -, 87 ),    HEX_DBL( +, 1, d775d87da854d, -, 86 ),
-        HEX_DBL( +, 1, 4063f8cc8bb98, -, 84 ),    HEX_DBL( +, 1, b374b315f87c1, -, 83 ),
-        HEX_DBL( +, 1, 27ec458c65e3c, -, 81 ),    HEX_DBL( +, 1, 923372c67a074, -, 80 ),
-        HEX_DBL( +, 1, 1152eaeb73c08, -, 78 ),    HEX_DBL( +, 1, 737c5645114b5, -, 77 ),
-        HEX_DBL( +, 1, f8e6c24b5592e, -, 76 ),    HEX_DBL( +, 1, 571db733a9d61, -, 74 ),
-        HEX_DBL( +, 1, d257d547e083f, -, 73 ),    HEX_DBL( +, 1, 3ce9b9de78f85, -, 71 ),
-        HEX_DBL( +, 1, aebabae3a41b5, -, 70 ),    HEX_DBL( +, 1, 24b6031b49bda, -, 68 ),
-        HEX_DBL( +, 1, 8dd5e1bb09d7e, -, 67 ),    HEX_DBL( +, 1, 0e5b73d1ff53d, -, 65 ),
-        HEX_DBL( +, 1, 6f741de1748ec, -, 64 ),    HEX_DBL( +, 1, f36bd37f42f3e, -, 63 ),
-        HEX_DBL( +, 1, 536452ee2f75c, -, 61 ),    HEX_DBL( +, 1, cd480a1b7482,  -, 60 ),
-        HEX_DBL( +, 1, 39792499b1a24, -, 58 ),    HEX_DBL( +, 1, aa0de4bf35b38, -, 57 ),
-        HEX_DBL( +, 1, 2188ad6ae3303, -, 55 ),    HEX_DBL( +, 1, 898471fca6055, -, 54 ),
-        HEX_DBL( +, 1, 0b6c3afdde064, -, 52 ),    HEX_DBL( +, 1, 6b7719a59f0e,  -, 51 ),
-        HEX_DBL( +, 1, ee001eed62aa, -, 50 ),    HEX_DBL( +, 1, 4fb547c775da8, -, 48 ),
-        HEX_DBL( +, 1, c8464f7616468, -, 47 ),    HEX_DBL( +, 1, 36121e24d3bba, -, 45 ),
-        HEX_DBL( +, 1, a56e0c2ac7f75, -, 44 ),    HEX_DBL( +, 1, 1e642baeb84a,  -, 42 ),
-        HEX_DBL( +, 1, 853f01d6d53ba, -, 41 ),    HEX_DBL( +, 1, 0885298767e9a, -, 39 ),
-        HEX_DBL( +, 1, 67852a7007e42, -, 38 ),    HEX_DBL( +, 1, e8a37a45fc32e, -, 37 ),
-        HEX_DBL( +, 1, 4c1078fe9228a, -, 35 ),    HEX_DBL( +, 1, c3527e433fab1, -, 34 ),
-        HEX_DBL( +, 1, 32b48bf117da2, -, 32 ),    HEX_DBL( +, 1, a0db0d0ddb3ec, -, 31 ),
-        HEX_DBL( +, 1, 1b48655f37267, -, 29 ),    HEX_DBL( +, 1, 81056ff2c5772, -, 28 ),
-        HEX_DBL( +, 1, 05a628c699fa1, -, 26 ),    HEX_DBL( +, 1, 639e3175a689d, -, 25 ),
-        HEX_DBL( +, 1, e355bbaee85cb, -, 24 ),    HEX_DBL( +, 1, 4875ca227ec38, -, 22 ),
-        HEX_DBL( +, 1, be6c6fdb01612, -, 21 ),    HEX_DBL( +, 1, 2f6053b981d98, -, 19 ),
-        HEX_DBL( +, 1, 9c54c3b43bc8b, -, 18 ),    HEX_DBL( +, 1, 18354238f6764, -, 16 ),
-        HEX_DBL( +, 1, 7cd79b5647c9b, -, 15 ),    HEX_DBL( +, 1, 02cf22526545a, -, 13 ),
-        HEX_DBL( +, 1, 5fc21041027ad, -, 12 ),    HEX_DBL( +, 1, de16b9c24a98f, -, 11 ),
-        HEX_DBL( +, 1, 44e51f113d4d6, -, 9 ),    HEX_DBL( +, 1, b993fe00d5376, -, 8 ),
-        HEX_DBL( +, 1, 2c155b8213cf4, -, 6 ),    HEX_DBL( +, 1, 97db0ccceb0af, -, 5 ),
-        HEX_DBL( +, 1, 152aaa3bf81cc, -, 3 ),    HEX_DBL( +, 1, 78b56362cef38, -, 2 ),
-        HEX_DBL( +, 1, 0, +, 0 ),                HEX_DBL( +, 1, 5bf0a8b145769, +, 1 ),
-        HEX_DBL( +, 1, d8e64b8d4ddae, +, 2 ),    HEX_DBL( +, 1, 415e5bf6fb106, +, 4 ),
-        HEX_DBL( +, 1, b4c902e273a58, +, 5 ),    HEX_DBL( +, 1, 28d389970338f, +, 7 ),
-        HEX_DBL( +, 1, 936dc5690c08f, +, 8 ),    HEX_DBL( +, 1, 122885aaeddaa, +, 10 ),
-        HEX_DBL( +, 1, 749ea7d470c6e, +, 11 ),    HEX_DBL( +, 1, fa7157c470f82, +, 12 ),
-        HEX_DBL( +, 1, 5829dcf95056,  +, 14 ),    HEX_DBL( +, 1, d3c4488ee4f7f, +, 15 ),
-        HEX_DBL( +, 1, 3de1654d37c9a, +, 17 ),    HEX_DBL( +, 1, b00b5916ac955, +, 18 ),
-        HEX_DBL( +, 1, 259ac48bf05d7, +, 20 ),    HEX_DBL( +, 1, 8f0ccafad2a87, +, 21 ),
-        HEX_DBL( +, 1, 0f2ebd0a8002,  +, 23 ),    HEX_DBL( +, 1, 709348c0ea4f9, +, 24 ),
-        HEX_DBL( +, 1, f4f22091940bd, +, 25 ),    HEX_DBL( +, 1, 546d8f9ed26e1, +, 27 ),
-        HEX_DBL( +, 1, ceb088b68e804, +, 28 ),    HEX_DBL( +, 1, 3a6e1fd9eecfd, +, 30 ),
-        HEX_DBL( +, 1, ab5adb9c436,   +, 31 ),    HEX_DBL( +, 1, 226af33b1fdc1, +, 33 ),
-        HEX_DBL( +, 1, 8ab7fb5475fb7, +, 34 ),    HEX_DBL( +, 1, 0c3d3920962c9, +, 36 ),
-        HEX_DBL( +, 1, 6c932696a6b5d, +, 37 ),    HEX_DBL( +, 1, ef822f7f6731d, +, 38 ),
-        HEX_DBL( +, 1, 50bba3796379a, +, 40 ),    HEX_DBL( +, 1, c9aae4631c056, +, 41 ),
-        HEX_DBL( +, 1, 370470aec28ed, +, 43 ),    HEX_DBL( +, 1, a6b765d8cdf6d, +, 44 ),
-        HEX_DBL( +, 1, 1f43fcc4b662c, +, 46 ),    HEX_DBL( +, 1, 866f34a725782, +, 47 ),
-        HEX_DBL( +, 1, 0953e2f3a1ef7, +, 49 ),    HEX_DBL( +, 1, 689e221bc8d5b, +, 50 ),
-        HEX_DBL( +, 1, ea215a1d20d76, +, 51 ),    HEX_DBL( +, 1, 4d13fbb1a001a, +, 53 ),
-        HEX_DBL( +, 1, c4b334617cc67, +, 54 ),    HEX_DBL( +, 1, 33a43d282a519, +, 56 ),
-        HEX_DBL( +, 1, a220d397972eb, +, 57 ),    HEX_DBL( +, 1, 1c25c88df6862, +, 59 ),
-        HEX_DBL( +, 1, 8232558201159, +, 60 ),    HEX_DBL( +, 1, 0672a3c9eb871, +, 62 ),
-        HEX_DBL( +, 1, 64b41c6d37832, +, 63 ),    HEX_DBL( +, 1, e4cf766fe49be, +, 64 ),
-        HEX_DBL( +, 1, 49767bc0483e3, +, 66 ),    HEX_DBL( +, 1, bfc951eb8bb76, +, 67 ),
-        HEX_DBL( +, 1, 304d6aeca254b, +, 69 ),    HEX_DBL( +, 1, 9d97010884251, +, 70 ),
-        HEX_DBL( +, 1, 19103e4080b45, +, 72 ),    HEX_DBL( +, 1, 7e013cd114461, +, 73 ),
-        HEX_DBL( +, 1, 03996528e074c, +, 75 ),    HEX_DBL( +, 1, 60d4f6fdac731, +, 76 ),
-        HEX_DBL( +, 1, df8c5af17ba3b, +, 77 ),    HEX_DBL( +, 1, 45e3076d61699, +, 79 ),
-        HEX_DBL( +, 1, baed16a6e0da7, +, 80 ),    HEX_DBL( +, 1, 2cffdfebde1a1, +, 82 ),
-        HEX_DBL( +, 1, 9919cabefcb69, +, 83 ),    HEX_DBL( +, 1, 160345c9953e3, +, 85 ),
-        HEX_DBL( +, 1, 79dbc9dc53c66, +, 86 ),    HEX_DBL( +, 1, 00c810d464097, +, 88 ),
-        HEX_DBL( +, 1, 5d009394c5c27, +, 89 ),    HEX_DBL( +, 1, da57de8f107a8, +, 90 ),
-        HEX_DBL( +, 1, 425982cf597cd, +, 92 ),    HEX_DBL( +, 1, b61e5ca3a5e31, +, 93 ),
-        HEX_DBL( +, 1, 29bb825dfcf87, +, 95 ),    HEX_DBL( +, 1, 94a90db0d6fe2, +, 96 ),
-        HEX_DBL( +, 1, 12fec759586fd, +, 98 ),    HEX_DBL( +, 1, 75c1dc469e3af, +, 99 ),
-        HEX_DBL( +, 1, fbfd219c43b04, +, 100 ),    HEX_DBL( +, 1, 5936d44e1a146, +, 102 ),
-        HEX_DBL( +, 1, d531d8a7ee79c, +, 103 ),    HEX_DBL( +, 1, 3ed9d24a2d51b, +, 105 ),
-        HEX_DBL( +, 1, b15cfe5b6e17b, +, 106 ),    HEX_DBL( +, 1, 268038c2c0e,   +, 108 ),
-        HEX_DBL( +, 1, 9044a73545d48, +, 109 ),    HEX_DBL( +, 1, 1002ab6218b38, +, 111 ),
-        HEX_DBL( +, 1, 71b3540cbf921, +, 112 ),    HEX_DBL( +, 1, f6799ea9c414a, +, 113 ),
-        HEX_DBL( +, 1, 55779b984f3eb, +, 115 ),    HEX_DBL( +, 1, d01a210c44aa4, +, 116 ),
-        HEX_DBL( +, 1, 3b63da8e9121,  +, 118 ),    HEX_DBL( +, 1, aca8d6b0116b8, +, 119 ),
-        HEX_DBL( +, 1, 234de9e0c74e9, +, 121 ),    HEX_DBL( +, 1, 8bec7503ca477, +, 122 ),
-        HEX_DBL( +, 1, 0d0eda9796b9,  +, 124 ),    HEX_DBL( +, 1, 6db0118477245, +, 125 ),
-        HEX_DBL( +, 1, f1056dc7bf22d, +, 126 ),    HEX_DBL( +, 1, 51c2cc3433801, +, 128 ),
-        HEX_DBL( +, 1, cb108ffbec164, +, 129 ),    HEX_DBL( +, 1, 37f780991b584, +, 131 ),
-        HEX_DBL( +, 1, a801c0ea8ac4d, +, 132 ),    HEX_DBL( +, 1, 20247cc4c46c1, +, 134 ),
-        HEX_DBL( +, 1, 87a0553328015, +, 135 ),    HEX_DBL( +, 1, 0a233dee4f9bb, +, 137 ),
-        HEX_DBL( +, 1, 69b7f55b808ba, +, 138 ),    HEX_DBL( +, 1, eba064644060a, +, 139 ),
-        HEX_DBL( +, 1, 4e184933d9364, +, 141 ),    HEX_DBL( +, 1, c614fe2531841, +, 142 ),
-        HEX_DBL( +, 1, 3494a9b171bf5, +, 144 ),    HEX_DBL( +, 1, a36798b9d969b, +, 145 ),
-        HEX_DBL( +, 1, 1d03d8c0c04af, +, 147 ),    HEX_DBL( +, 1, 836026385c974, +, 148 ),
-        HEX_DBL( +, 1, 073fbe9ac901d, +, 150 ),    HEX_DBL( +, 1, 65cae0969f286, +, 151 ),
-        HEX_DBL( +, 1, e64a58639cae8, +, 152 ),    HEX_DBL( +, 1, 4a77f5f9b50f9, +, 154 ),
-        HEX_DBL( +, 1, c12744a3a28e3, +, 155 ),    HEX_DBL( +, 1, 313b3b6978e85, +, 157 ),
-        HEX_DBL( +, 1, 9eda3a31e587e, +, 158 ),    HEX_DBL( +, 1, 19ebe56b56453, +, 160 ),
-        HEX_DBL( +, 1, 7f2bc6e599b7e, +, 161 ),    HEX_DBL( +, 1, 04644610df2ff, +, 163 ),
-        HEX_DBL( +, 1, 61e8b490ac4e6, +, 164 ),    HEX_DBL( +, 1, e103201f299b3, +, 165 ),
-        HEX_DBL( +, 1, 46e1b637beaf5, +, 167 ),    HEX_DBL( +, 1, bc473cfede104, +, 168 ),
-        HEX_DBL( +, 1, 2deb1b9c85e2d, +, 170 ),    HEX_DBL( +, 1, 9a5981ca67d1,  +, 171 ),
-        HEX_DBL( +, 1, 16dc8a9ef670b, +, 173 ),    HEX_DBL( +, 1, 7b03166942309, +, 174 ),
-        HEX_DBL( +, 1, 0190be03150a7, +, 176 ),    HEX_DBL( +, 1, 5e1152f9a8119, +, 177 ),
-        HEX_DBL( +, 1, dbca9263f8487, +, 178 ),    HEX_DBL( +, 1, 43556dee93bee, +, 180 ),
-        HEX_DBL( +, 1, b774c12967dfa, +, 181 ),    HEX_DBL( +, 1, 2aa4306e922c2, +, 183 ),
-        HEX_DBL( +, 1, 95e54c5dd4217, +, 184 )    };
+    static const double exp_table[128 + 150 + 1] = {
+        HEX_DBL(+, 1, 82e16284f5ec5, -, 217),
+        HEX_DBL(+, 1, 06e9996332ba1, -, 215),
+        HEX_DBL(+, 1, 6555cb289e44b, -, 214),
+        HEX_DBL(+, 1, e5ab364643354, -, 213),
+        HEX_DBL(+, 1, 4a0bd18e64df7, -, 211),
+        HEX_DBL(+, 1, c094499cc578e, -, 210),
+        HEX_DBL(+, 1, 30d759323998c, -, 208),
+        HEX_DBL(+, 1, 9e5278ab1d4cf, -, 207),
+        HEX_DBL(+, 1, 198fa3f30be25, -, 205),
+        HEX_DBL(+, 1, 7eae636d6144e, -, 204),
+        HEX_DBL(+, 1, 040f1036f4863, -, 202),
+        HEX_DBL(+, 1, 6174e477a895f, -, 201),
+        HEX_DBL(+, 1, e065b82dd95a, -, 200),
+        HEX_DBL(+, 1, 4676be491d129, -, 198),
+        HEX_DBL(+, 1, bbb5da5f7c823, -, 197),
+        HEX_DBL(+, 1, 2d884eef5fdcb, -, 195),
+        HEX_DBL(+, 1, 99d3397ab8371, -, 194),
+        HEX_DBL(+, 1, 1681497ed15b3, -, 192),
+        HEX_DBL(+, 1, 7a870f597fdbd, -, 191),
+        HEX_DBL(+, 1, 013c74edba307, -, 189),
+        HEX_DBL(+, 1, 5d9ec4ada7938, -, 188),
+        HEX_DBL(+, 1, db2edfd20fa7c, -, 187),
+        HEX_DBL(+, 1, 42eb9f39afb0b, -, 185),
+        HEX_DBL(+, 1, b6e4f282b43f4, -, 184),
+        HEX_DBL(+, 1, 2a42764857b19, -, 182),
+        HEX_DBL(+, 1, 9560792d19314, -, 181),
+        HEX_DBL(+, 1, 137b6ce8e052c, -, 179),
+        HEX_DBL(+, 1, 766b45dd84f18, -, 178),
+        HEX_DBL(+, 1, fce362fe6e7d, -, 177),
+        HEX_DBL(+, 1, 59d34dd8a5473, -, 175),
+        HEX_DBL(+, 1, d606847fc727a, -, 174),
+        HEX_DBL(+, 1, 3f6a58b795de3, -, 172),
+        HEX_DBL(+, 1, b2216c6efdac1, -, 171),
+        HEX_DBL(+, 1, 2705b5b153fb8, -, 169),
+        HEX_DBL(+, 1, 90fa1509bd50d, -, 168),
+        HEX_DBL(+, 1, 107df698da211, -, 166),
+        HEX_DBL(+, 1, 725ae6e7b9d35, -, 165),
+        HEX_DBL(+, 1, f75d6040aeff6, -, 164),
+        HEX_DBL(+, 1, 56126259e093c, -, 162),
+        HEX_DBL(+, 1, d0ec7df4f7bd4, -, 161),
+        HEX_DBL(+, 1, 3bf2cf6722e46, -, 159),
+        HEX_DBL(+, 1, ad6b22f55db42, -, 158),
+        HEX_DBL(+, 1, 23d1f3e5834a, -, 156),
+        HEX_DBL(+, 1, 8c9feab89b876, -, 155),
+        HEX_DBL(+, 1, 0d88cf37f00dd, -, 153),
+        HEX_DBL(+, 1, 6e55d2bf838a7, -, 152),
+        HEX_DBL(+, 1, f1e6b68529e33, -, 151),
+        HEX_DBL(+, 1, 525be4e4e601d, -, 149),
+        HEX_DBL(+, 1, cbe0a45f75eb1, -, 148),
+        HEX_DBL(+, 1, 3884e838aea68, -, 146),
+        HEX_DBL(+, 1, a8c1f14e2af5d, -, 145),
+        HEX_DBL(+, 1, 20a717e64a9bd, -, 143),
+        HEX_DBL(+, 1, 8851d84118908, -, 142),
+        HEX_DBL(+, 1, 0a9bdfb02d24, -, 140),
+        HEX_DBL(+, 1, 6a5bea046b42e, -, 139),
+        HEX_DBL(+, 1, ec7f3b269efa8, -, 138),
+        HEX_DBL(+, 1, 4eafb87eab0f2, -, 136),
+        HEX_DBL(+, 1, c6e2d05bbc, -, 135),
+        HEX_DBL(+, 1, 35208867c2683, -, 133),
+        HEX_DBL(+, 1, a425b317eeacd, -, 132),
+        HEX_DBL(+, 1, 1d8508fa8246a, -, 130),
+        HEX_DBL(+, 1, 840fbc08fdc8a, -, 129),
+        HEX_DBL(+, 1, 07b7112bc1ffe, -, 127),
+        HEX_DBL(+, 1, 666d0dad2961d, -, 126),
+        HEX_DBL(+, 1, e726c3f64d0fe, -, 125),
+        HEX_DBL(+, 1, 4b0dc07cabf98, -, 123),
+        HEX_DBL(+, 1, c1f2daf3b6a46, -, 122),
+        HEX_DBL(+, 1, 31c5957a47de2, -, 120),
+        HEX_DBL(+, 1, 9f96445648b9f, -, 119),
+        HEX_DBL(+, 1, 1a6baeadb4fd1, -, 117),
+        HEX_DBL(+, 1, 7fd974d372e45, -, 116),
+        HEX_DBL(+, 1, 04da4d1452919, -, 114),
+        HEX_DBL(+, 1, 62891f06b345, -, 113),
+        HEX_DBL(+, 1, e1dd273aa8a4a, -, 112),
+        HEX_DBL(+, 1, 4775e0840bfdd, -, 110),
+        HEX_DBL(+, 1, bd109d9d94bda, -, 109),
+        HEX_DBL(+, 1, 2e73f53fba844, -, 107),
+        HEX_DBL(+, 1, 9b138170d6bfe, -, 106),
+        HEX_DBL(+, 1, 175af0cf60ec5, -, 104),
+        HEX_DBL(+, 1, 7baee1bffa80b, -, 103),
+        HEX_DBL(+, 1, 02057d1245ceb, -, 101),
+        HEX_DBL(+, 1, 5eafffb34ba31, -, 100),
+        HEX_DBL(+, 1, dca23bae16424, -, 99),
+        HEX_DBL(+, 1, 43e7fc88b8056, -, 97),
+        HEX_DBL(+, 1, b83bf23a9a9eb, -, 96),
+        HEX_DBL(+, 1, 2b2b8dd05b318, -, 94),
+        HEX_DBL(+, 1, 969d47321e4cc, -, 93),
+        HEX_DBL(+, 1, 1452b7723aed2, -, 91),
+        HEX_DBL(+, 1, 778fe2497184c, -, 90),
+        HEX_DBL(+, 1, fe7116182e9cc, -, 89),
+        HEX_DBL(+, 1, 5ae191a99585a, -, 87),
+        HEX_DBL(+, 1, d775d87da854d, -, 86),
+        HEX_DBL(+, 1, 4063f8cc8bb98, -, 84),
+        HEX_DBL(+, 1, b374b315f87c1, -, 83),
+        HEX_DBL(+, 1, 27ec458c65e3c, -, 81),
+        HEX_DBL(+, 1, 923372c67a074, -, 80),
+        HEX_DBL(+, 1, 1152eaeb73c08, -, 78),
+        HEX_DBL(+, 1, 737c5645114b5, -, 77),
+        HEX_DBL(+, 1, f8e6c24b5592e, -, 76),
+        HEX_DBL(+, 1, 571db733a9d61, -, 74),
+        HEX_DBL(+, 1, d257d547e083f, -, 73),
+        HEX_DBL(+, 1, 3ce9b9de78f85, -, 71),
+        HEX_DBL(+, 1, aebabae3a41b5, -, 70),
+        HEX_DBL(+, 1, 24b6031b49bda, -, 68),
+        HEX_DBL(+, 1, 8dd5e1bb09d7e, -, 67),
+        HEX_DBL(+, 1, 0e5b73d1ff53d, -, 65),
+        HEX_DBL(+, 1, 6f741de1748ec, -, 64),
+        HEX_DBL(+, 1, f36bd37f42f3e, -, 63),
+        HEX_DBL(+, 1, 536452ee2f75c, -, 61),
+        HEX_DBL(+, 1, cd480a1b7482, -, 60),
+        HEX_DBL(+, 1, 39792499b1a24, -, 58),
+        HEX_DBL(+, 1, aa0de4bf35b38, -, 57),
+        HEX_DBL(+, 1, 2188ad6ae3303, -, 55),
+        HEX_DBL(+, 1, 898471fca6055, -, 54),
+        HEX_DBL(+, 1, 0b6c3afdde064, -, 52),
+        HEX_DBL(+, 1, 6b7719a59f0e, -, 51),
+        HEX_DBL(+, 1, ee001eed62aa, -, 50),
+        HEX_DBL(+, 1, 4fb547c775da8, -, 48),
+        HEX_DBL(+, 1, c8464f7616468, -, 47),
+        HEX_DBL(+, 1, 36121e24d3bba, -, 45),
+        HEX_DBL(+, 1, a56e0c2ac7f75, -, 44),
+        HEX_DBL(+, 1, 1e642baeb84a, -, 42),
+        HEX_DBL(+, 1, 853f01d6d53ba, -, 41),
+        HEX_DBL(+, 1, 0885298767e9a, -, 39),
+        HEX_DBL(+, 1, 67852a7007e42, -, 38),
+        HEX_DBL(+, 1, e8a37a45fc32e, -, 37),
+        HEX_DBL(+, 1, 4c1078fe9228a, -, 35),
+        HEX_DBL(+, 1, c3527e433fab1, -, 34),
+        HEX_DBL(+, 1, 32b48bf117da2, -, 32),
+        HEX_DBL(+, 1, a0db0d0ddb3ec, -, 31),
+        HEX_DBL(+, 1, 1b48655f37267, -, 29),
+        HEX_DBL(+, 1, 81056ff2c5772, -, 28),
+        HEX_DBL(+, 1, 05a628c699fa1, -, 26),
+        HEX_DBL(+, 1, 639e3175a689d, -, 25),
+        HEX_DBL(+, 1, e355bbaee85cb, -, 24),
+        HEX_DBL(+, 1, 4875ca227ec38, -, 22),
+        HEX_DBL(+, 1, be6c6fdb01612, -, 21),
+        HEX_DBL(+, 1, 2f6053b981d98, -, 19),
+        HEX_DBL(+, 1, 9c54c3b43bc8b, -, 18),
+        HEX_DBL(+, 1, 18354238f6764, -, 16),
+        HEX_DBL(+, 1, 7cd79b5647c9b, -, 15),
+        HEX_DBL(+, 1, 02cf22526545a, -, 13),
+        HEX_DBL(+, 1, 5fc21041027ad, -, 12),
+        HEX_DBL(+, 1, de16b9c24a98f, -, 11),
+        HEX_DBL(+, 1, 44e51f113d4d6, -, 9),
+        HEX_DBL(+, 1, b993fe00d5376, -, 8),
+        HEX_DBL(+, 1, 2c155b8213cf4, -, 6),
+        HEX_DBL(+, 1, 97db0ccceb0af, -, 5),
+        HEX_DBL(+, 1, 152aaa3bf81cc, -, 3),
+        HEX_DBL(+, 1, 78b56362cef38, -, 2),
+        HEX_DBL(+, 1, 0, +, 0),
+        HEX_DBL(+, 1, 5bf0a8b145769, +, 1),
+        HEX_DBL(+, 1, d8e64b8d4ddae, +, 2),
+        HEX_DBL(+, 1, 415e5bf6fb106, +, 4),
+        HEX_DBL(+, 1, b4c902e273a58, +, 5),
+        HEX_DBL(+, 1, 28d389970338f, +, 7),
+        HEX_DBL(+, 1, 936dc5690c08f, +, 8),
+        HEX_DBL(+, 1, 122885aaeddaa, +, 10),
+        HEX_DBL(+, 1, 749ea7d470c6e, +, 11),
+        HEX_DBL(+, 1, fa7157c470f82, +, 12),
+        HEX_DBL(+, 1, 5829dcf95056, +, 14),
+        HEX_DBL(+, 1, d3c4488ee4f7f, +, 15),
+        HEX_DBL(+, 1, 3de1654d37c9a, +, 17),
+        HEX_DBL(+, 1, b00b5916ac955, +, 18),
+        HEX_DBL(+, 1, 259ac48bf05d7, +, 20),
+        HEX_DBL(+, 1, 8f0ccafad2a87, +, 21),
+        HEX_DBL(+, 1, 0f2ebd0a8002, +, 23),
+        HEX_DBL(+, 1, 709348c0ea4f9, +, 24),
+        HEX_DBL(+, 1, f4f22091940bd, +, 25),
+        HEX_DBL(+, 1, 546d8f9ed26e1, +, 27),
+        HEX_DBL(+, 1, ceb088b68e804, +, 28),
+        HEX_DBL(+, 1, 3a6e1fd9eecfd, +, 30),
+        HEX_DBL(+, 1, ab5adb9c436, +, 31),
+        HEX_DBL(+, 1, 226af33b1fdc1, +, 33),
+        HEX_DBL(+, 1, 8ab7fb5475fb7, +, 34),
+        HEX_DBL(+, 1, 0c3d3920962c9, +, 36),
+        HEX_DBL(+, 1, 6c932696a6b5d, +, 37),
+        HEX_DBL(+, 1, ef822f7f6731d, +, 38),
+        HEX_DBL(+, 1, 50bba3796379a, +, 40),
+        HEX_DBL(+, 1, c9aae4631c056, +, 41),
+        HEX_DBL(+, 1, 370470aec28ed, +, 43),
+        HEX_DBL(+, 1, a6b765d8cdf6d, +, 44),
+        HEX_DBL(+, 1, 1f43fcc4b662c, +, 46),
+        HEX_DBL(+, 1, 866f34a725782, +, 47),
+        HEX_DBL(+, 1, 0953e2f3a1ef7, +, 49),
+        HEX_DBL(+, 1, 689e221bc8d5b, +, 50),
+        HEX_DBL(+, 1, ea215a1d20d76, +, 51),
+        HEX_DBL(+, 1, 4d13fbb1a001a, +, 53),
+        HEX_DBL(+, 1, c4b334617cc67, +, 54),
+        HEX_DBL(+, 1, 33a43d282a519, +, 56),
+        HEX_DBL(+, 1, a220d397972eb, +, 57),
+        HEX_DBL(+, 1, 1c25c88df6862, +, 59),
+        HEX_DBL(+, 1, 8232558201159, +, 60),
+        HEX_DBL(+, 1, 0672a3c9eb871, +, 62),
+        HEX_DBL(+, 1, 64b41c6d37832, +, 63),
+        HEX_DBL(+, 1, e4cf766fe49be, +, 64),
+        HEX_DBL(+, 1, 49767bc0483e3, +, 66),
+        HEX_DBL(+, 1, bfc951eb8bb76, +, 67),
+        HEX_DBL(+, 1, 304d6aeca254b, +, 69),
+        HEX_DBL(+, 1, 9d97010884251, +, 70),
+        HEX_DBL(+, 1, 19103e4080b45, +, 72),
+        HEX_DBL(+, 1, 7e013cd114461, +, 73),
+        HEX_DBL(+, 1, 03996528e074c, +, 75),
+        HEX_DBL(+, 1, 60d4f6fdac731, +, 76),
+        HEX_DBL(+, 1, df8c5af17ba3b, +, 77),
+        HEX_DBL(+, 1, 45e3076d61699, +, 79),
+        HEX_DBL(+, 1, baed16a6e0da7, +, 80),
+        HEX_DBL(+, 1, 2cffdfebde1a1, +, 82),
+        HEX_DBL(+, 1, 9919cabefcb69, +, 83),
+        HEX_DBL(+, 1, 160345c9953e3, +, 85),
+        HEX_DBL(+, 1, 79dbc9dc53c66, +, 86),
+        HEX_DBL(+, 1, 00c810d464097, +, 88),
+        HEX_DBL(+, 1, 5d009394c5c27, +, 89),
+        HEX_DBL(+, 1, da57de8f107a8, +, 90),
+        HEX_DBL(+, 1, 425982cf597cd, +, 92),
+        HEX_DBL(+, 1, b61e5ca3a5e31, +, 93),
+        HEX_DBL(+, 1, 29bb825dfcf87, +, 95),
+        HEX_DBL(+, 1, 94a90db0d6fe2, +, 96),
+        HEX_DBL(+, 1, 12fec759586fd, +, 98),
+        HEX_DBL(+, 1, 75c1dc469e3af, +, 99),
+        HEX_DBL(+, 1, fbfd219c43b04, +, 100),
+        HEX_DBL(+, 1, 5936d44e1a146, +, 102),
+        HEX_DBL(+, 1, d531d8a7ee79c, +, 103),
+        HEX_DBL(+, 1, 3ed9d24a2d51b, +, 105),
+        HEX_DBL(+, 1, b15cfe5b6e17b, +, 106),
+        HEX_DBL(+, 1, 268038c2c0e, +, 108),
+        HEX_DBL(+, 1, 9044a73545d48, +, 109),
+        HEX_DBL(+, 1, 1002ab6218b38, +, 111),
+        HEX_DBL(+, 1, 71b3540cbf921, +, 112),
+        HEX_DBL(+, 1, f6799ea9c414a, +, 113),
+        HEX_DBL(+, 1, 55779b984f3eb, +, 115),
+        HEX_DBL(+, 1, d01a210c44aa4, +, 116),
+        HEX_DBL(+, 1, 3b63da8e9121, +, 118),
+        HEX_DBL(+, 1, aca8d6b0116b8, +, 119),
+        HEX_DBL(+, 1, 234de9e0c74e9, +, 121),
+        HEX_DBL(+, 1, 8bec7503ca477, +, 122),
+        HEX_DBL(+, 1, 0d0eda9796b9, +, 124),
+        HEX_DBL(+, 1, 6db0118477245, +, 125),
+        HEX_DBL(+, 1, f1056dc7bf22d, +, 126),
+        HEX_DBL(+, 1, 51c2cc3433801, +, 128),
+        HEX_DBL(+, 1, cb108ffbec164, +, 129),
+        HEX_DBL(+, 1, 37f780991b584, +, 131),
+        HEX_DBL(+, 1, a801c0ea8ac4d, +, 132),
+        HEX_DBL(+, 1, 20247cc4c46c1, +, 134),
+        HEX_DBL(+, 1, 87a0553328015, +, 135),
+        HEX_DBL(+, 1, 0a233dee4f9bb, +, 137),
+        HEX_DBL(+, 1, 69b7f55b808ba, +, 138),
+        HEX_DBL(+, 1, eba064644060a, +, 139),
+        HEX_DBL(+, 1, 4e184933d9364, +, 141),
+        HEX_DBL(+, 1, c614fe2531841, +, 142),
+        HEX_DBL(+, 1, 3494a9b171bf5, +, 144),
+        HEX_DBL(+, 1, a36798b9d969b, +, 145),
+        HEX_DBL(+, 1, 1d03d8c0c04af, +, 147),
+        HEX_DBL(+, 1, 836026385c974, +, 148),
+        HEX_DBL(+, 1, 073fbe9ac901d, +, 150),
+        HEX_DBL(+, 1, 65cae0969f286, +, 151),
+        HEX_DBL(+, 1, e64a58639cae8, +, 152),
+        HEX_DBL(+, 1, 4a77f5f9b50f9, +, 154),
+        HEX_DBL(+, 1, c12744a3a28e3, +, 155),
+        HEX_DBL(+, 1, 313b3b6978e85, +, 157),
+        HEX_DBL(+, 1, 9eda3a31e587e, +, 158),
+        HEX_DBL(+, 1, 19ebe56b56453, +, 160),
+        HEX_DBL(+, 1, 7f2bc6e599b7e, +, 161),
+        HEX_DBL(+, 1, 04644610df2ff, +, 163),
+        HEX_DBL(+, 1, 61e8b490ac4e6, +, 164),
+        HEX_DBL(+, 1, e103201f299b3, +, 165),
+        HEX_DBL(+, 1, 46e1b637beaf5, +, 167),
+        HEX_DBL(+, 1, bc473cfede104, +, 168),
+        HEX_DBL(+, 1, 2deb1b9c85e2d, +, 170),
+        HEX_DBL(+, 1, 9a5981ca67d1, +, 171),
+        HEX_DBL(+, 1, 16dc8a9ef670b, +, 173),
+        HEX_DBL(+, 1, 7b03166942309, +, 174),
+        HEX_DBL(+, 1, 0190be03150a7, +, 176),
+        HEX_DBL(+, 1, 5e1152f9a8119, +, 177),
+        HEX_DBL(+, 1, dbca9263f8487, +, 178),
+        HEX_DBL(+, 1, 43556dee93bee, +, 180),
+        HEX_DBL(+, 1, b774c12967dfa, +, 181),
+        HEX_DBL(+, 1, 2aa4306e922c2, +, 183),
+        HEX_DBL(+, 1, 95e54c5dd4217, +, 184)
+    };
 
-    // scale by e**i --  (expm1(f) + 1)*e**i - 1  = expm1(f) * e**i + e**i - 1 = e**i
-    return exp_table[exponent+150] + (f * exp_table[exponent+150] - 1.0);
+    // scale by e**i --  (expm1(f) + 1)*e**i - 1  = expm1(f) * e**i + e**i - 1 =
+    // e**i
+    return exp_table[exponent + 150] + (f * exp_table[exponent + 150] - 1.0);
 }
 
 
-double reference_fmax( double x, double y )
+double reference_fmax(double x, double y)
 {
-    if( isnan(y) )
-        return x;
+    if (isnan(y)) return x;
 
     return x >= y ? x : y;
 }
 
-double reference_fmin( double x, double y )
+double reference_fmin(double x, double y)
 {
-    if( isnan(y) )
-        return x;
+    if (isnan(y)) return x;
 
     return x <= y ? x : y;
 }
 
-double reference_hypot( double x, double y )
+double reference_hypot(double x, double y)
 {
-    // Since the inputs are actually floats, we don't have to worry about range here
-    if( isinf(x) || isinf(y) )
-        return INFINITY;
+    // Since the inputs are actually floats, we don't have to worry about range
+    // here
+    if (isinf(x) || isinf(y)) return INFINITY;
 
-    return sqrt( x * x + y * y );
+    return sqrt(x * x + y * y);
 }
 
-int    reference_ilogbl( long double x)
+int reference_ilogbl(long double x)
 {
     extern int gDeviceILogb0, gDeviceILogbNaN;
 
     // Since we are just using this to verify double precision, we can
     // use the double precision ilogb here
-    union { double f; cl_ulong u;} u;
-    u.f = (double) x;
+    union {
+        double f;
+        cl_ulong u;
+    } u;
+    u.f = (double)x;
 
     int exponent = (int)(u.u >> 52) & 0x7ff;
-    if( exponent == 0x7ff )
+    if (exponent == 0x7ff)
     {
-        if( u.u & 0x000fffffffffffffULL )
-            return gDeviceILogbNaN;
+        if (u.u & 0x000fffffffffffffULL) return gDeviceILogbNaN;
 
         return CL_INT_MAX;
     }
 
-    if( exponent == 0 )
-    {   // deal with denormals
-        u.f =  x * HEX_DBL( +, 1, 0, +, 64 );
+    if (exponent == 0)
+    { // deal with denormals
+        u.f = x * HEX_DBL(+, 1, 0, +, 64);
         exponent = (cl_uint)(u.u >> 52) & 0x7ff;
-        if( exponent == 0 )
-            return gDeviceILogb0;
+        if (exponent == 0) return gDeviceILogb0;
 
         exponent -= 1023 + 64;
         return exponent;
@@ -1522,84 +1763,111 @@ int    reference_ilogbl( long double x)
     return exponent - 1023;
 }
 
-//double reference_log2( double x )
+// double reference_log2( double x )
 //{
 //    return log( x ) * 1.44269504088896340735992468100189214;
 //}
 
 
-double reference_relaxed_log2( double x )
+double reference_relaxed_log2(double x) { return reference_log2(x); }
+
+double reference_log2(double x)
 {
-  return reference_log2(x);
-}
+    if (isnan(x) || x < 0.0 || x == -INFINITY) return cl_make_nan();
 
-double reference_log2( double x )
-{
-    if( isnan(x) || x < 0.0 || x == -INFINITY)
-        return cl_make_nan();
+    if (x == 0.0f) return -INFINITY;
 
-    if( x == 0.0f)
-        return -INFINITY;
-
-    if( x == INFINITY )
-        return INFINITY;
+    if (x == INFINITY) return INFINITY;
 
     double hi, lo;
-    __log2_ep( &hi, &lo, x );
+    __log2_ep(&hi, &lo, x);
     return hi;
 }
 
-double reference_log1p( double x )
-{   // This function is suitable only for verifying log1pf(). It produces several double precision ulps of error.
+double reference_log1p(double x)
+{ // This function is suitable only for verifying log1pf(). It produces several
+  // double precision ulps of error.
 
     // Handle small and NaN
-    if( ! ( reference_fabs(x) > HEX_DBL( +, 1, 0, -, 53 ) ) )
-        return x;
+    if (!(reference_fabs(x) > HEX_DBL(+, 1, 0, -, 53))) return x;
 
     // deal with special values
-    if( x <= -1.0 )
+    if (x <= -1.0)
     {
-        if( x < -1.0 )
-            return cl_make_nan();
+        if (x < -1.0) return cl_make_nan();
         return -INFINITY;
     }
 
     // infinity
-    if( x == INFINITY )
-        return INFINITY;
+    if (x == INFINITY) return INFINITY;
 
-    // High precision result for when near 0, to avoid problems with the reference result falling in the wrong binade.
-    if( reference_fabs(x) < HEX_DBL( +, 1, 0, -, 28 ) )
-        return (1.0 - 0.5 * x) * x;
+    // High precision result for when near 0, to avoid problems with the
+    // reference result falling in the wrong binade.
+    if (reference_fabs(x) < HEX_DBL(+, 1, 0, -, 28)) return (1.0 - 0.5 * x) * x;
 
     // Our polynomial is only good in the region +-2**-4.
     // If we aren't in that range then we need to reduce to be in that range
-    double correctionLo = -0.0;           // correction down stream to compensate for the reduction, if any
-    double correctionHi = -0.0;           // correction down stream to compensate for the exponent, if any
-    if( reference_fabs(x) > HEX_DBL( +, 1, 0, -, 4 ) )
+    double correctionLo =
+        -0.0; // correction down stream to compensate for the reduction, if any
+    double correctionHi =
+        -0.0; // correction down stream to compensate for the exponent, if any
+    if (reference_fabs(x) > HEX_DBL(+, 1, 0, -, 4))
     {
-        x += 1.0;   // double should cover any loss of precision here
+        x += 1.0; // double should cover any loss of precision here
 
         // separate x into (1+f) * 2**i
-        union{ double d; cl_ulong u;} u;        u.d = x;
-        int i = (int) ((u.u >> 52) & 0x7ff) - 1023;
+        union {
+            double d;
+            cl_ulong u;
+        } u;
+        u.d = x;
+        int i = (int)((u.u >> 52) & 0x7ff) - 1023;
         u.u &= 0x000fffffffffffffULL;
-        int index = (int) (u.u >> 48 );
+        int index = (int)(u.u >> 48);
         u.u |= 0x3ff0000000000000ULL;
         double f = u.d;
 
         // further reduce f to be within 1/16 of 1.0
-        static const double scale_table[16] = {                  1.0, HEX_DBL( +, 1, d2d2d2d6e3f79, -, 1 ), HEX_DBL( +, 1, b8e38e42737a1, -, 1 ), HEX_DBL( +, 1, a1af28711adf3, -, 1 ),
-                                                HEX_DBL( +, 1, 8cccccd88dd65, -, 1 ), HEX_DBL( +, 1, 79e79e810ec8f, -, 1 ), HEX_DBL( +, 1, 68ba2e94df404, -, 1 ), HEX_DBL( +, 1, 590b216defb29, -, 1 ),
-                                                HEX_DBL( +, 1, 4aaaaab1500ed, -, 1 ), HEX_DBL( +, 1, 3d70a3e0d6f73, -, 1 ), HEX_DBL( +, 1, 313b13bb39f4f, -, 1 ), HEX_DBL( +, 1, 25ed09823f1cc, -, 1 ),
-                                                HEX_DBL( +, 1, 1b6db6e77457b, -, 1 ), HEX_DBL( +, 1, 11a7b96a3a34f, -, 1 ), HEX_DBL( +, 1, 0888888e46fea, -, 1 ), HEX_DBL( +, 1, 00000038e9862, -, 1 ) };
+        static const double scale_table[16] = {
+            1.0,
+            HEX_DBL(+, 1, d2d2d2d6e3f79, -, 1),
+            HEX_DBL(+, 1, b8e38e42737a1, -, 1),
+            HEX_DBL(+, 1, a1af28711adf3, -, 1),
+            HEX_DBL(+, 1, 8cccccd88dd65, -, 1),
+            HEX_DBL(+, 1, 79e79e810ec8f, -, 1),
+            HEX_DBL(+, 1, 68ba2e94df404, -, 1),
+            HEX_DBL(+, 1, 590b216defb29, -, 1),
+            HEX_DBL(+, 1, 4aaaaab1500ed, -, 1),
+            HEX_DBL(+, 1, 3d70a3e0d6f73, -, 1),
+            HEX_DBL(+, 1, 313b13bb39f4f, -, 1),
+            HEX_DBL(+, 1, 25ed09823f1cc, -, 1),
+            HEX_DBL(+, 1, 1b6db6e77457b, -, 1),
+            HEX_DBL(+, 1, 11a7b96a3a34f, -, 1),
+            HEX_DBL(+, 1, 0888888e46fea, -, 1),
+            HEX_DBL(+, 1, 00000038e9862, -, 1)
+        };
 
         // correction_table[i] = -log( scale_table[i] )
-        // All entries have >= 64 bits of precision (rather than the expected 53)
-        static const double correction_table[16] = {                   -0.0, HEX_DBL( +, 1, 7a5c722c16058, -, 4 ), HEX_DBL( +, 1, 323db16c89ab1, -, 3 ), HEX_DBL( +, 1, a0f87d180629, -, 3 ),
-                                                       HEX_DBL( +, 1, 050279324e17c, -, 2 ), HEX_DBL( +, 1, 36f885bb270b0, -, 2 ), HEX_DBL( +, 1, 669b771b5cc69, -, 2 ), HEX_DBL( +, 1, 94203a6292a05, -, 2 ),
-                                                       HEX_DBL( +, 1, bfb4f9cb333a4, -, 2 ), HEX_DBL( +, 1, e982376ddb80e, -, 2 ), HEX_DBL( +, 1, 08d5d8769b2b2, -, 1 ), HEX_DBL( +, 1, 1c288bc00e0cf, -, 1 ),
-                                                       HEX_DBL( +, 1, 2ec7535b31ecb, -, 1 ), HEX_DBL( +, 1, 40bed0adc63fb, -, 1 ), HEX_DBL( +, 1, 521a5c0330615, -, 1 ), HEX_DBL( +, 1, 62e42f7dd092c, -, 1 ) };
+        // All entries have >= 64 bits of precision (rather than the expected
+        // 53)
+        static const double correction_table[16] = {
+            -0.0,
+            HEX_DBL(+, 1, 7a5c722c16058, -, 4),
+            HEX_DBL(+, 1, 323db16c89ab1, -, 3),
+            HEX_DBL(+, 1, a0f87d180629, -, 3),
+            HEX_DBL(+, 1, 050279324e17c, -, 2),
+            HEX_DBL(+, 1, 36f885bb270b0, -, 2),
+            HEX_DBL(+, 1, 669b771b5cc69, -, 2),
+            HEX_DBL(+, 1, 94203a6292a05, -, 2),
+            HEX_DBL(+, 1, bfb4f9cb333a4, -, 2),
+            HEX_DBL(+, 1, e982376ddb80e, -, 2),
+            HEX_DBL(+, 1, 08d5d8769b2b2, -, 1),
+            HEX_DBL(+, 1, 1c288bc00e0cf, -, 1),
+            HEX_DBL(+, 1, 2ec7535b31ecb, -, 1),
+            HEX_DBL(+, 1, 40bed0adc63fb, -, 1),
+            HEX_DBL(+, 1, 521a5c0330615, -, 1),
+            HEX_DBL(+, 1, 62e42f7dd092c, -, 1)
+        };
 
         f *= scale_table[index];
         correctionLo = correction_table[index];
@@ -1611,17 +1879,25 @@ double reference_log1p( double x )
     }
 
 
-    // minmax polynomial for p(x) = (log(x+1) - x)/x valid over the range x = [-1/16, 1/16]
+    // minmax polynomial for p(x) = (log(x+1) - x)/x valid over the range x =
+    // [-1/16, 1/16]
     //          max error HEX_DBL( +, 1, 048f61f9a5eca, -, 52 )
-    double p = HEX_DBL( -, 1, cc33de97a9d7b,  -, 46 ) +
-               (HEX_DBL( -, 1, fffffffff3eb7, -, 2 ) +
-               (HEX_DBL( +, 1, 5555555633ef7, -, 2 ) +
-               (HEX_DBL( -, 1, 00000062c78,   -, 2 ) +
-               (HEX_DBL( +, 1, 9999958a3321,  -, 3 ) +
-               (HEX_DBL( -, 1, 55534ce65c347, -, 3 ) +
-               (HEX_DBL( +, 1, 24957208391a5, -, 3 ) +
-               (HEX_DBL( -, 1, 02287b9a5b4a1, -, 3 ) +
-                HEX_DBL( +, 1, c757d922180ed, -, 4 ) * x)*x)*x)*x)*x)*x)*x)*x;
+    double p = HEX_DBL(-, 1, cc33de97a9d7b, -, 46)
+        + (HEX_DBL(-, 1, fffffffff3eb7, -, 2)
+           + (HEX_DBL(+, 1, 5555555633ef7, -, 2)
+              + (HEX_DBL(-, 1, 00000062c78, -, 2)
+                 + (HEX_DBL(+, 1, 9999958a3321, -, 3)
+                    + (HEX_DBL(-, 1, 55534ce65c347, -, 3)
+                       + (HEX_DBL(+, 1, 24957208391a5, -, 3)
+                          + (HEX_DBL(-, 1, 02287b9a5b4a1, -, 3)
+                             + HEX_DBL(+, 1, c757d922180ed, -, 4) * x)
+                              * x)
+                           * x)
+                        * x)
+                     * x)
+                  * x)
+               * x)
+            * x;
 
     // log(x+1) = x * p(x) + x
     x += x * p;
@@ -1629,22 +1905,23 @@ double reference_log1p( double x )
     return correctionHi + (correctionLo + x);
 }
 
-double reference_logb( double x )
+double reference_logb(double x)
 {
-    union { float f; cl_uint u;} u;
-    u.f = (float) x;
+    union {
+        float f;
+        cl_uint u;
+    } u;
+    u.f = (float)x;
 
     cl_int exponent = (u.u >> 23) & 0xff;
-    if( exponent == 0xff )
-        return x * x;
+    if (exponent == 0xff) return x * x;
 
-    if( exponent == 0 )
-    {   // deal with denormals
+    if (exponent == 0)
+    { // deal with denormals
         u.u = (u.u & 0x007fffff) | 0x3f800000;
         u.f -= 1.0f;
         exponent = (u.u >> 23) & 0xff;
-        if( exponent == 0 )
-            return -INFINITY;
+        if (exponent == 0) return -INFINITY;
 
         return exponent - (127 + 126);
     }
@@ -1652,219 +1929,271 @@ double reference_logb( double x )
     return exponent - 127;
 }
 
-double reference_relaxed_reciprocal(double x)
-{
-  return 1.0f / ((float) x);
-}
+double reference_relaxed_reciprocal(double x) { return 1.0f / ((float)x); }
 
-double reference_reciprocal( double x )
-{
-  return 1.0 / x;
-}
+double reference_reciprocal(double x) { return 1.0 / x; }
 
-double reference_remainder( double x, double y )
+double reference_remainder(double x, double y)
 {
     int i;
-    return reference_remquo( x, y, &i );
+    return reference_remquo(x, y, &i);
 }
 
-double reference_lgamma( double x)
+double reference_lgamma(double x)
 {
-/*
- * ====================================================
- * This function is from fdlibm. http://www.netlib.org
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- *
- */
+    /*
+     * ====================================================
+     * This function is from fdlibm. http://www.netlib.org
+     * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+     *
+     * Developed at SunSoft, a Sun Microsystems, Inc. business.
+     * Permission to use, copy, modify, and distribute this
+     * software is freely granted, provided that this notice
+     * is preserved.
+     * ====================================================
+     *
+     */
 
-static const double //two52 = 4.50359962737049600000e+15, /* 0x43300000, 0x00000000 */
-                    half=  5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
-                    one =  1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */
-                    pi  =  3.14159265358979311600e+00, /* 0x400921FB, 0x54442D18 */
-                    a0  =  7.72156649015328655494e-02, /* 0x3FB3C467, 0xE37DB0C8 */
-                    a1  =  3.22467033424113591611e-01, /* 0x3FD4A34C, 0xC4A60FAD */
-                    a2  =  6.73523010531292681824e-02, /* 0x3FB13E00, 0x1A5562A7 */
-                    a3  =  2.05808084325167332806e-02, /* 0x3F951322, 0xAC92547B */
-                    a4  =  7.38555086081402883957e-03, /* 0x3F7E404F, 0xB68FEFE8 */
-                    a5  =  2.89051383673415629091e-03, /* 0x3F67ADD8, 0xCCB7926B */
-                    a6  =  1.19270763183362067845e-03, /* 0x3F538A94, 0x116F3F5D */
-                    a7  =  5.10069792153511336608e-04, /* 0x3F40B6C6, 0x89B99C00 */
-                    a8  =  2.20862790713908385557e-04, /* 0x3F2CF2EC, 0xED10E54D */
-                    a9  =  1.08011567247583939954e-04, /* 0x3F1C5088, 0x987DFB07 */
-                    a10 =  2.52144565451257326939e-05, /* 0x3EFA7074, 0x428CFA52 */
-                    a11 =  4.48640949618915160150e-05, /* 0x3F07858E, 0x90A45837 */
-                    tc  =  1.46163214496836224576e+00, /* 0x3FF762D8, 0x6356BE3F */
-                    tf  = -1.21486290535849611461e-01, /* 0xBFBF19B9, 0xBCC38A42 */
-                    /* tt = -(tail of tf) */
-                    tt  = -3.63867699703950536541e-18, /* 0xBC50C7CA, 0xA48A971F */
-                    t0  =  4.83836122723810047042e-01, /* 0x3FDEF72B, 0xC8EE38A2 */
-                    t1  = -1.47587722994593911752e-01, /* 0xBFC2E427, 0x8DC6C509 */
-                    t2  =  6.46249402391333854778e-02, /* 0x3FB08B42, 0x94D5419B */
-                    t3  = -3.27885410759859649565e-02, /* 0xBFA0C9A8, 0xDF35B713 */
-                    t4  =  1.79706750811820387126e-02, /* 0x3F9266E7, 0x970AF9EC */
-                    t5  = -1.03142241298341437450e-02, /* 0xBF851F9F, 0xBA91EC6A */
-                    t6  =  6.10053870246291332635e-03, /* 0x3F78FCE0, 0xE370E344 */
-                    t7  = -3.68452016781138256760e-03, /* 0xBF6E2EFF, 0xB3E914D7 */
-                    t8  =  2.25964780900612472250e-03, /* 0x3F6282D3, 0x2E15C915 */
-                    t9  = -1.40346469989232843813e-03, /* 0xBF56FE8E, 0xBF2D1AF1 */
-                    t10 =  8.81081882437654011382e-04, /* 0x3F4CDF0C, 0xEF61A8E9 */
-                    t11 = -5.38595305356740546715e-04, /* 0xBF41A610, 0x9C73E0EC */
-                    t12 =  3.15632070903625950361e-04, /* 0x3F34AF6D, 0x6C0EBBF7 */
-                    t13 = -3.12754168375120860518e-04, /* 0xBF347F24, 0xECC38C38 */
-                    t14 =  3.35529192635519073543e-04, /* 0x3F35FD3E, 0xE8C2D3F4 */
-                    u0  = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
-                    u1  =  6.32827064025093366517e-01, /* 0x3FE4401E, 0x8B005DFF */
-                    u2  =  1.45492250137234768737e+00, /* 0x3FF7475C, 0xD119BD6F */
-                    u3  =  9.77717527963372745603e-01, /* 0x3FEF4976, 0x44EA8450 */
-                    u4  =  2.28963728064692451092e-01, /* 0x3FCD4EAE, 0xF6010924 */
-                    u5  =  1.33810918536787660377e-02, /* 0x3F8B678B, 0xBF2BAB09 */
-                    v1  =  2.45597793713041134822e+00, /* 0x4003A5D7, 0xC2BD619C */
-                    v2  =  2.12848976379893395361e+00, /* 0x40010725, 0xA42B18F5 */
-                    v3  =  7.69285150456672783825e-01, /* 0x3FE89DFB, 0xE45050AF */
-                    v4  =  1.04222645593369134254e-01, /* 0x3FBAAE55, 0xD6537C88 */
-                    v5  =  3.21709242282423911810e-03, /* 0x3F6A5ABB, 0x57D0CF61 */
-                    s0  = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
-                    s1  =  2.14982415960608852501e-01, /* 0x3FCB848B, 0x36E20878 */
-                    s2  =  3.25778796408930981787e-01, /* 0x3FD4D98F, 0x4F139F59 */
-                    s3  =  1.46350472652464452805e-01, /* 0x3FC2BB9C, 0xBEE5F2F7 */
-                    s4  =  2.66422703033638609560e-02, /* 0x3F9B481C, 0x7E939961 */
-                    s5  =  1.84028451407337715652e-03, /* 0x3F5E26B6, 0x7368F239 */
-                    s6  =  3.19475326584100867617e-05, /* 0x3F00BFEC, 0xDD17E945 */
-                    r1  =  1.39200533467621045958e+00, /* 0x3FF645A7, 0x62C4AB74 */
-                    r2  =  7.21935547567138069525e-01, /* 0x3FE71A18, 0x93D3DCDC */
-                    r3  =  1.71933865632803078993e-01, /* 0x3FC601ED, 0xCCFBDF27 */
-                    r4  =  1.86459191715652901344e-02, /* 0x3F9317EA, 0x742ED475 */
-                    r5  =  7.77942496381893596434e-04, /* 0x3F497DDA, 0xCA41A95B */
-                    r6  =  7.32668430744625636189e-06, /* 0x3EDEBAF7, 0xA5B38140 */
-                    w0  =  4.18938533204672725052e-01, /* 0x3FDACFE3, 0x90C97D69 */
-                    w1  =  8.33333333333329678849e-02, /* 0x3FB55555, 0x5555553B */
-                    w2  = -2.77777777728775536470e-03, /* 0xBF66C16C, 0x16B02E5C */
-                    w3  =  7.93650558643019558500e-04, /* 0x3F4A019F, 0x98CF38B6 */
-                    w4  = -5.95187557450339963135e-04, /* 0xBF4380CB, 0x8C0FE741 */
-                    w5  =  8.36339918996282139126e-04, /* 0x3F4B67BA, 0x4CDAD5D1 */
-                    w6  = -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */
+    static const double // two52 = 4.50359962737049600000e+15, /* 0x43300000,
+                        // 0x00000000 */
+        half = 5.00000000000000000000e-01, /* 0x3FE00000,
+                                              0x00000000 */
+        one = 1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */
+        pi = 3.14159265358979311600e+00, /* 0x400921FB, 0x54442D18 */
+        a0 = 7.72156649015328655494e-02, /* 0x3FB3C467, 0xE37DB0C8 */
+        a1 = 3.22467033424113591611e-01, /* 0x3FD4A34C, 0xC4A60FAD */
+        a2 = 6.73523010531292681824e-02, /* 0x3FB13E00, 0x1A5562A7 */
+        a3 = 2.05808084325167332806e-02, /* 0x3F951322, 0xAC92547B */
+        a4 = 7.38555086081402883957e-03, /* 0x3F7E404F, 0xB68FEFE8 */
+        a5 = 2.89051383673415629091e-03, /* 0x3F67ADD8, 0xCCB7926B */
+        a6 = 1.19270763183362067845e-03, /* 0x3F538A94, 0x116F3F5D */
+        a7 = 5.10069792153511336608e-04, /* 0x3F40B6C6, 0x89B99C00 */
+        a8 = 2.20862790713908385557e-04, /* 0x3F2CF2EC, 0xED10E54D */
+        a9 = 1.08011567247583939954e-04, /* 0x3F1C5088, 0x987DFB07 */
+        a10 = 2.52144565451257326939e-05, /* 0x3EFA7074, 0x428CFA52 */
+        a11 = 4.48640949618915160150e-05, /* 0x3F07858E, 0x90A45837 */
+        tc = 1.46163214496836224576e+00, /* 0x3FF762D8, 0x6356BE3F */
+        tf = -1.21486290535849611461e-01, /* 0xBFBF19B9, 0xBCC38A42 */
+        /* tt = -(tail of tf) */
+        tt = -3.63867699703950536541e-18, /* 0xBC50C7CA, 0xA48A971F */
+        t0 = 4.83836122723810047042e-01, /* 0x3FDEF72B, 0xC8EE38A2 */
+        t1 = -1.47587722994593911752e-01, /* 0xBFC2E427, 0x8DC6C509 */
+        t2 = 6.46249402391333854778e-02, /* 0x3FB08B42, 0x94D5419B */
+        t3 = -3.27885410759859649565e-02, /* 0xBFA0C9A8, 0xDF35B713 */
+        t4 = 1.79706750811820387126e-02, /* 0x3F9266E7, 0x970AF9EC */
+        t5 = -1.03142241298341437450e-02, /* 0xBF851F9F, 0xBA91EC6A */
+        t6 = 6.10053870246291332635e-03, /* 0x3F78FCE0, 0xE370E344 */
+        t7 = -3.68452016781138256760e-03, /* 0xBF6E2EFF, 0xB3E914D7 */
+        t8 = 2.25964780900612472250e-03, /* 0x3F6282D3, 0x2E15C915 */
+        t9 = -1.40346469989232843813e-03, /* 0xBF56FE8E, 0xBF2D1AF1 */
+        t10 = 8.81081882437654011382e-04, /* 0x3F4CDF0C, 0xEF61A8E9 */
+        t11 = -5.38595305356740546715e-04, /* 0xBF41A610, 0x9C73E0EC */
+        t12 = 3.15632070903625950361e-04, /* 0x3F34AF6D, 0x6C0EBBF7 */
+        t13 = -3.12754168375120860518e-04, /* 0xBF347F24, 0xECC38C38 */
+        t14 = 3.35529192635519073543e-04, /* 0x3F35FD3E, 0xE8C2D3F4 */
+        u0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
+        u1 = 6.32827064025093366517e-01, /* 0x3FE4401E, 0x8B005DFF */
+        u2 = 1.45492250137234768737e+00, /* 0x3FF7475C, 0xD119BD6F */
+        u3 = 9.77717527963372745603e-01, /* 0x3FEF4976, 0x44EA8450 */
+        u4 = 2.28963728064692451092e-01, /* 0x3FCD4EAE, 0xF6010924 */
+        u5 = 1.33810918536787660377e-02, /* 0x3F8B678B, 0xBF2BAB09 */
+        v1 = 2.45597793713041134822e+00, /* 0x4003A5D7, 0xC2BD619C */
+        v2 = 2.12848976379893395361e+00, /* 0x40010725, 0xA42B18F5 */
+        v3 = 7.69285150456672783825e-01, /* 0x3FE89DFB, 0xE45050AF */
+        v4 = 1.04222645593369134254e-01, /* 0x3FBAAE55, 0xD6537C88 */
+        v5 = 3.21709242282423911810e-03, /* 0x3F6A5ABB, 0x57D0CF61 */
+        s0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
+        s1 = 2.14982415960608852501e-01, /* 0x3FCB848B, 0x36E20878 */
+        s2 = 3.25778796408930981787e-01, /* 0x3FD4D98F, 0x4F139F59 */
+        s3 = 1.46350472652464452805e-01, /* 0x3FC2BB9C, 0xBEE5F2F7 */
+        s4 = 2.66422703033638609560e-02, /* 0x3F9B481C, 0x7E939961 */
+        s5 = 1.84028451407337715652e-03, /* 0x3F5E26B6, 0x7368F239 */
+        s6 = 3.19475326584100867617e-05, /* 0x3F00BFEC, 0xDD17E945 */
+        r1 = 1.39200533467621045958e+00, /* 0x3FF645A7, 0x62C4AB74 */
+        r2 = 7.21935547567138069525e-01, /* 0x3FE71A18, 0x93D3DCDC */
+        r3 = 1.71933865632803078993e-01, /* 0x3FC601ED, 0xCCFBDF27 */
+        r4 = 1.86459191715652901344e-02, /* 0x3F9317EA, 0x742ED475 */
+        r5 = 7.77942496381893596434e-04, /* 0x3F497DDA, 0xCA41A95B */
+        r6 = 7.32668430744625636189e-06, /* 0x3EDEBAF7, 0xA5B38140 */
+        w0 = 4.18938533204672725052e-01, /* 0x3FDACFE3, 0x90C97D69 */
+        w1 = 8.33333333333329678849e-02, /* 0x3FB55555, 0x5555553B */
+        w2 = -2.77777777728775536470e-03, /* 0xBF66C16C, 0x16B02E5C */
+        w3 = 7.93650558643019558500e-04, /* 0x3F4A019F, 0x98CF38B6 */
+        w4 = -5.95187557450339963135e-04, /* 0xBF4380CB, 0x8C0FE741 */
+        w5 = 8.36339918996282139126e-04, /* 0x3F4B67BA, 0x4CDAD5D1 */
+        w6 = -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */
 
-    static const double zero=  0.00000000000000000000e+00;
-    double t,y,z,nadj,p,p1,p2,p3,q,r,w;
-    cl_int i,hx,lx,ix;
+    static const double zero = 0.00000000000000000000e+00;
+    double t, y, z, nadj, p, p1, p2, p3, q, r, w;
+    cl_int i, hx, lx, ix;
 
-    union{ double d; cl_ulong u;}u; u.d = x;
+    union {
+        double d;
+        cl_ulong u;
+    } u;
+    u.d = x;
 
-    hx = (cl_int) (u.u >> 32);
-    lx = (cl_int) (u.u & 0xffffffffULL);
+    hx = (cl_int)(u.u >> 32);
+    lx = (cl_int)(u.u & 0xffffffffULL);
 
     /* purge off +-inf, NaN, +-0, and negative arguments */
-//    *signgamp = 1;
-    ix = hx&0x7fffffff;
-    if(ix>=0x7ff00000) return x*x;
-    if((ix|lx)==0) return INFINITY;
-    if(ix<0x3b900000) {    /* |x|<2**-70, return -log(|x|) */
-        if(hx<0) {
-//            *signgamp = -1;
+    //    *signgamp = 1;
+    ix = hx & 0x7fffffff;
+    if (ix >= 0x7ff00000) return x * x;
+    if ((ix | lx) == 0) return INFINITY;
+    if (ix < 0x3b900000)
+    { /* |x|<2**-70, return -log(|x|) */
+        if (hx < 0)
+        {
+            //            *signgamp = -1;
             return -reference_log(-x);
-        } else return -reference_log(x);
+        }
+        else
+            return -reference_log(x);
     }
-    if(hx<0) {
-        if(ix>=0x43300000)     /* |x|>=2**52, must be -integer */
-        return INFINITY;
+    if (hx < 0)
+    {
+        if (ix >= 0x43300000) /* |x|>=2**52, must be -integer */
+            return INFINITY;
         t = reference_sinpi(x);
-        if(t==zero) return INFINITY; /* -integer */
-        nadj = reference_log(pi/reference_fabs(t*x));
-//        if(t<zero) *signgamp = -1;
+        if (t == zero) return INFINITY; /* -integer */
+        nadj = reference_log(pi / reference_fabs(t * x));
+        //        if(t<zero) *signgamp = -1;
         x = -x;
     }
 
     /* purge off 1 and 2 */
-    if((((ix-0x3ff00000)|lx)==0)||(((ix-0x40000000)|lx)==0)) r = 0;
+    if ((((ix - 0x3ff00000) | lx) == 0) || (((ix - 0x40000000) | lx) == 0))
+        r = 0;
     /* for x < 2.0 */
-    else if(ix<0x40000000) {
-        if(ix<=0x3feccccc) {     /* lgamma(x) = lgamma(x+1)-log(x) */
-        r = -reference_log(x);
-        if(ix>=0x3FE76944) {y = 1.0-x; i= 0;}
-        else if(ix>=0x3FCDA661) {y= x-(tc-one); i=1;}
-          else {y = x; i=2;}
-        } else {
-          r = zero;
-            if(ix>=0x3FFBB4C3) {y=2.0-x;i=0;} /* [1.7316,2] */
-            else if(ix>=0x3FF3B4C4) {y=x-tc;i=1;} /* [1.23,1.73] */
-        else {y=x-one;i=2;}
+    else if (ix < 0x40000000)
+    {
+        if (ix <= 0x3feccccc)
+        { /* lgamma(x) = lgamma(x+1)-log(x) */
+            r = -reference_log(x);
+            if (ix >= 0x3FE76944)
+            {
+                y = 1.0 - x;
+                i = 0;
+            }
+            else if (ix >= 0x3FCDA661)
+            {
+                y = x - (tc - one);
+                i = 1;
+            }
+            else
+            {
+                y = x;
+                i = 2;
+            }
         }
-        switch(i) {
-          case 0:
-        z = y*y;
-        p1 = a0+z*(a2+z*(a4+z*(a6+z*(a8+z*a10))));
-        p2 = z*(a1+z*(a3+z*(a5+z*(a7+z*(a9+z*a11)))));
-        p  = y*p1+p2;
-        r  += (p-0.5*y); break;
-          case 1:
-        z = y*y;
-        w = z*y;
-        p1 = t0+w*(t3+w*(t6+w*(t9 +w*t12)));    /* parallel comp */
-        p2 = t1+w*(t4+w*(t7+w*(t10+w*t13)));
-        p3 = t2+w*(t5+w*(t8+w*(t11+w*t14)));
-        p  = z*p1-(tt-w*(p2+y*p3));
-        r += (tf + p); break;
-          case 2:
-        p1 = y*(u0+y*(u1+y*(u2+y*(u3+y*(u4+y*u5)))));
-        p2 = one+y*(v1+y*(v2+y*(v3+y*(v4+y*v5))));
-        r += (-0.5*y + p1/p2);
+        else
+        {
+            r = zero;
+            if (ix >= 0x3FFBB4C3)
+            {
+                y = 2.0 - x;
+                i = 0;
+            } /* [1.7316,2] */
+            else if (ix >= 0x3FF3B4C4)
+            {
+                y = x - tc;
+                i = 1;
+            } /* [1.23,1.73] */
+            else
+            {
+                y = x - one;
+                i = 2;
+            }
+        }
+        switch (i)
+        {
+            case 0:
+                z = y * y;
+                p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
+                p2 = z
+                    * (a1
+                       + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
+                p = y * p1 + p2;
+                r += (p - 0.5 * y);
+                break;
+            case 1:
+                z = y * y;
+                w = z * y;
+                p1 = t0
+                    + w
+                        * (t3
+                           + w * (t6 + w * (t9 + w * t12))); /* parallel comp */
+                p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
+                p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
+                p = z * p1 - (tt - w * (p2 + y * p3));
+                r += (tf + p);
+                break;
+            case 2:
+                p1 = y
+                    * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
+                p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
+                r += (-0.5 * y + p1 / p2);
         }
     }
-    else if(ix<0x40200000) {             /* x < 8.0 */
+    else if (ix < 0x40200000)
+    { /* x < 8.0 */
         i = (int)x;
         t = zero;
-        y = x-(double)i;
-        p = y*(s0+y*(s1+y*(s2+y*(s3+y*(s4+y*(s5+y*s6))))));
-        q = one+y*(r1+y*(r2+y*(r3+y*(r4+y*(r5+y*r6)))));
-        r = half*y+p/q;
-        z = one;    /* lgamma(1+s) = log(s) + lgamma(s) */
-        switch(i) {
-        case 7: z *= (y+6.0);    /* FALLTHRU */
-        case 6: z *= (y+5.0);    /* FALLTHRU */
-        case 5: z *= (y+4.0);    /* FALLTHRU */
-        case 4: z *= (y+3.0);    /* FALLTHRU */
-        case 3: z *= (y+2.0);    /* FALLTHRU */
-            r += reference_log(z); break;
+        y = x - (double)i;
+        p = y
+            * (s0
+               + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6))))));
+        q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));
+        r = half * y + p / q;
+        z = one; /* lgamma(1+s) = log(s) + lgamma(s) */
+        switch (i)
+        {
+            case 7: z *= (y + 6.0); /* FALLTHRU */
+            case 6: z *= (y + 5.0); /* FALLTHRU */
+            case 5: z *= (y + 4.0); /* FALLTHRU */
+            case 4: z *= (y + 3.0); /* FALLTHRU */
+            case 3:
+                z *= (y + 2.0); /* FALLTHRU */
+                r += reference_log(z);
+                break;
         }
-    /* 8.0 <= x < 2**58 */
-    } else if (ix < 0x43900000) {
+        /* 8.0 <= x < 2**58 */
+    }
+    else if (ix < 0x43900000)
+    {
         t = reference_log(x);
-        z = one/x;
-        y = z*z;
-        w = w0+z*(w1+y*(w2+y*(w3+y*(w4+y*(w5+y*w6)))));
-        r = (x-half)*(t-one)+w;
-    } else
-    /* 2**58 <= x <= inf */
-        r =  x*(reference_log(x)-one);
-    if(hx<0) r = nadj - r;
+        z = one / x;
+        y = z * z;
+        w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));
+        r = (x - half) * (t - one) + w;
+    }
+    else
+        /* 2**58 <= x <= inf */
+        r = x * (reference_log(x) - one);
+    if (hx < 0) r = nadj - r;
     return r;
-
 }
 
 #endif // _MSC_VER
 
-double reference_assignment( double x ){ return x; }
+double reference_assignment(double x) { return x; }
 
-int reference_not( double x )
+int reference_not(double x)
 {
-  int r = !x;
-  return r;
+    int r = !x;
+    return r;
 }
 
 #pragma mark -
 #pragma mark Double testing
 
 #ifndef M_PIL
-    #define M_PIL        3.14159265358979323846264338327950288419716939937510582097494459230781640628620899L
+#define M_PIL                                                                  \
+    3.14159265358979323846264338327950288419716939937510582097494459230781640628620899L
 #endif
 
-static long double reduce1l( long double x );
+static long double reduce1l(long double x);
 
 #ifdef __PPC__
 // Since long double on PPC is really extended precision double arithmetic
@@ -1873,36 +2202,35 @@ static long double reduce1l( long double x );
 // such that reduction algorithm used for other architectures will not work.
 // Instead and alternate reduction method is used.
 
-static long double reduce1l( long double x )
+static long double reduce1l(long double x)
 {
-  union {
-    long double ld;
-    double d[2];
-  } u;
+    union {
+        long double ld;
+        double d[2];
+    } u;
 
-  // Reduce the high and low halfs separately.
-  u.ld = x;
-  return ((long double)reduce1(u.d[0]) + reduce1(u.d[1]));
+    // Reduce the high and low halves separately.
+    u.ld = x;
+    return ((long double)reduce1(u.d[0]) + reduce1(u.d[1]));
 }
 
 #else // !__PPC__
 
-static long double reduce1l( long double x )
+static long double reduce1l(long double x)
 {
     static long double unit_exp = 0;
-    if( 0.0L == unit_exp )
-        unit_exp = scalbnl( 1.0L, LDBL_MANT_DIG);
+    if (0.0L == unit_exp) unit_exp = scalbnl(1.0L, LDBL_MANT_DIG);
 
-    if( reference_fabsl(x) >= unit_exp )
+    if (reference_fabsl(x) >= unit_exp)
     {
-        if( reference_fabsl(x) == INFINITY )
-            return cl_make_nan();
+        if (reference_fabsl(x) == INFINITY) return cl_make_nan();
 
-        return 0.0L; //we patch up the sign for sinPi and cosPi later, since they need different signs
+        return 0.0L; // we patch up the sign for sinPi and cosPi later, since
+                     // they need different signs
     }
 
     // Find the nearest multiple of 2
-    const long double r = reference_copysignl( unit_exp, x );
+    const long double r = reference_copysignl(unit_exp, x);
     long double z = x + r;
     z -= r;
 
@@ -1911,19 +2239,31 @@ static long double reduce1l( long double x )
 }
 #endif // __PPC__
 
-long double reference_acospil( long double x){  return reference_acosl( x ) / M_PIL;    }
-long double reference_asinpil( long double x){  return reference_asinl( x ) / M_PIL;    }
-long double reference_atanpil( long double x){  return reference_atanl( x ) / M_PIL;    }
-long double reference_atan2pil( long double y, long double x){ return reference_atan2l( y, x) / M_PIL; }
-long double reference_cospil( long double x)
+long double reference_acospil(long double x)
 {
-    if( reference_fabsl(x) >= HEX_LDBL( +, 1, 0, +, 54 ) )
+    return reference_acosl(x) / M_PIL;
+}
+long double reference_asinpil(long double x)
+{
+    return reference_asinl(x) / M_PIL;
+}
+long double reference_atanpil(long double x)
+{
+    return reference_atanl(x) / M_PIL;
+}
+long double reference_atan2pil(long double y, long double x)
+{
+    return reference_atan2l(y, x) / M_PIL;
+}
+long double reference_cospil(long double x)
+{
+    if (reference_fabsl(x) >= HEX_LDBL(+, 1, 0, +, 54))
     {
-        if( reference_fabsl(x) == INFINITY )
-            return cl_make_nan();
+        if (reference_fabsl(x) == INFINITY) return cl_make_nan();
 
-        //Note this probably fails for odd values between 0x1.0p52 and 0x1.0p53.
-        //However, when starting with single precision inputs, there will be no odd values.
+        // Note this probably fails for odd values between 0x1.0p52 and
+        // 0x1.0p53. However, when starting with single precision inputs, there
+        // will be no odd values.
 
         return 1.0L;
     }
@@ -1935,9 +2275,9 @@ long double reference_cospil( long double x)
     // phase adjust
     double xhi = 0.0;
     double xlo = 0.0;
-    xhi = (double) x + 0.5;
+    xhi = (double)x + 0.5;
 
-    if(reference_fabsl(x) > 0.5L)
+    if (reference_fabsl(x) > 0.5L)
     {
         xlo = xhi - x;
         xlo = 0.5 - xlo;
@@ -1949,61 +2289,69 @@ long double reference_cospil( long double x)
     }
 
     // reduce to [-0.5, 0.5]
-    if( xhi < -0.5 )
+    if (xhi < -0.5)
     {
         xhi = -1.0 - xhi;
         xlo = -xlo;
     }
-    else if ( xhi > 0.5 )
+    else if (xhi > 0.5)
     {
         xhi = 1.0 - xhi;
         xlo = -xlo;
     }
 
     // cosPi zeros are all +0
-    if( xhi == 0.0 && xlo == 0.0 )
-        return 0.0;
+    if (xhi == 0.0 && xlo == 0.0) return 0.0;
 
     xhi *= M_PI;
     xlo *= M_PI;
 
     xhi += xlo;
 
-    return reference_sinl( xhi );
+    return reference_sinl(xhi);
 
 #else
     // phase adjust
     x += 0.5L;
 
     // reduce to [-0.5, 0.5]
-    if( x < -0.5L )
+    if (x < -0.5L)
         x = -1.0L - x;
-    else if ( x > 0.5L )
+    else if (x > 0.5L)
         x = 1.0L - x;
 
     // cosPi zeros are all +0
-    if( x == 0.0L )
-        return 0.0L;
+    if (x == 0.0L) return 0.0L;
 
-    return reference_sinl( x * M_PIL );
+    return reference_sinl(x * M_PIL);
 #endif
 }
 
-long double reference_dividel( long double x, long double y)
+long double reference_dividel(long double x, long double y)
 {
     double dx = x;
     double dy = y;
-    return dx/dy;
+    return dx / dy;
 }
 
-typedef struct{ double hi, lo; } double_double;
-
-// Split doubles_double into a series of consecutive 26-bit precise doubles and a remainder.
-// Note for later -- for multiplication, it might be better to split each double into a power of two and two 26 bit portions
-//                      multiplication of a double double by a known power of two is cheap. The current approach causes some inexact arithmetic in mul_dd.
-static inline void split_dd( double_double x, double_double *hi, double_double *lo )
+typedef struct
 {
-    union{ double d; cl_ulong u;}u;
+    double hi, lo;
+} double_double;
+
+// Split a double_double into a series of consecutive 26-bit precise doubles
+// and a remainder.
+// Note for later -- for multiplication, it might be better to split each
+// double into a power of two and two 26-bit portions, since multiplication of
+// a double_double by a known power of two is cheap. The current approach
+// causes some inexact arithmetic in mul_dd.
+static inline void split_dd(double_double x, double_double *hi,
+                            double_double *lo)
+{
+    union {
+        double d;
+        cl_ulong u;
+    } u;
     u.d = x.hi;
     u.u &= 0xFFFFFFFFF8000000ULL;
     hi->hi = u.d;
@@ -2025,10 +2373,10 @@ static inline void split_dd( double_double x, double_double *hi, double_double *
     lo->lo = x.hi + x.lo;
 }
 
-static inline double_double accum_d( double_double a, double b )
+static inline double_double accum_d(double_double a, double b)
 {
     double temp;
-    if( fabs(b) > fabs(a.hi) )
+    if (fabs(b) > fabs(a.hi))
     {
         temp = a.hi;
         a.hi += b;
@@ -2041,47 +2389,45 @@ static inline double_double accum_d( double_double a, double b )
         a.lo += b - (a.hi - temp);
     }
 
-    if( isnan( a.lo ) )
-        a.lo = 0.0;
+    if (isnan(a.lo)) a.lo = 0.0;
 
     return a;
 }
 
-static inline double_double add_dd( double_double a, double_double b )
+static inline double_double add_dd(double_double a, double_double b)
 {
-    double_double r = {-0.0 -0.0 };
+    double_double r = { -0.0 - 0.0 };
 
-    if( isinf(a.hi) || isinf( b.hi )  ||
-       isnan(a.hi) || isnan( b.hi )  ||
-       0.0 == a.hi || 0.0 == b.hi )
+    if (isinf(a.hi) || isinf(b.hi) || isnan(a.hi) || isnan(b.hi) || 0.0 == a.hi
+        || 0.0 == b.hi)
     {
         r.hi = a.hi + b.hi;
         r.lo = a.lo + b.lo;
-        if( isnan( r.lo ) )
-            r.lo = 0.0;
+        if (isnan(r.lo)) r.lo = 0.0;
         return r;
     }
 
-    //merge sort terms by magnitude -- here we assume that |a.hi| > |a.lo|, |b.hi| > |b.lo|, so we don't have to do the first merge pass
+    // merge sort terms by magnitude -- here we assume that |a.hi| > |a.lo|,
+    // |b.hi| > |b.lo|, so we don't have to do the first merge pass
     double terms[4] = { a.hi, b.hi, a.lo, b.lo };
     double temp;
 
-    //Sort hi terms
-    if( fabs(terms[0]) < fabs(terms[1]) )
+    // Sort hi terms
+    if (fabs(terms[0]) < fabs(terms[1]))
     {
         temp = terms[0];
         terms[0] = terms[1];
         terms[1] = temp;
     }
-    //sort lo terms
-    if( fabs(terms[2]) < fabs(terms[3]) )
+    // sort lo terms
+    if (fabs(terms[2]) < fabs(terms[3]))
     {
         temp = terms[2];
         terms[2] = terms[3];
         terms[3] = temp;
     }
     // Fix case where small high term is less than large low term
-    if( fabs(terms[1]) < fabs(terms[2]) )
+    if (fabs(terms[1]) < fabs(terms[2]))
     {
         temp = terms[1];
         terms[1] = terms[2];
@@ -2104,42 +2450,40 @@ static inline double_double add_dd( double_double a, double_double b )
     temp = r.hi;
     r.hi += r.lo;
     r.lo = r.lo - (r.hi - temp);
-    if( isnan( r.lo ) )
-        r.lo = 0.0;
+    if (isnan(r.lo)) r.lo = 0.0;
 
     return r;
 }
 
-static inline double_double mul_dd( double_double a, double_double b )
+static inline double_double mul_dd(double_double a, double_double b)
 {
-    double_double result = {-0.0,-0.0};
+    double_double result = { -0.0, -0.0 };
 
     // Inf, nan and 0
-    if( isnan( a.hi ) || isnan( b.hi ) ||
-       isinf( a.hi ) || isinf( b.hi ) ||
-       0.0 == a.hi || 0.0 == b.hi )
+    if (isnan(a.hi) || isnan(b.hi) || isinf(a.hi) || isinf(b.hi) || 0.0 == a.hi
+        || 0.0 == b.hi)
     {
         result.hi = a.hi * b.hi;
         return result;
     }
 
     double_double ah, al, bh, bl;
-    split_dd( a, &ah, &al );
-    split_dd( b, &bh, &bl );
+    split_dd(a, &ah, &al);
+    split_dd(b, &bh, &bl);
 
-    double p0 = ah.hi * bh.hi;        // exact    (52 bits in product) 0
-    double p1 = ah.hi * bh.lo;        // exact    (52 bits in product) 26
-    double p2 = ah.lo * bh.hi;        // exact    (52 bits in product) 26
-    double p3 = ah.lo * bh.lo;        // exact    (52 bits in product) 52
-    double p4 = al.hi * bh.hi;        // exact    (52 bits in product) 52
-    double p5 = al.hi * bh.lo;        // exact    (52 bits in product) 78
-    double p6 = al.lo * bh.hi;        // inexact  (54 bits in product) 78
-    double p7 = al.lo * bh.lo;        // inexact  (54 bits in product) 104
-    double p8 = ah.hi * bl.hi;        // exact    (52 bits in product) 52
-    double p9 = ah.hi * bl.lo;        // inexact  (54 bits in product) 78
-    double pA = ah.lo * bl.hi;        // exact    (52 bits in product) 78
-    double pB = ah.lo * bl.lo;        // inexact  (54 bits in product) 104
-    double pC = al.hi * bl.hi;        // exact    (52 bits in product) 104
+    double p0 = ah.hi * bh.hi; // exact    (52 bits in product) 0
+    double p1 = ah.hi * bh.lo; // exact    (52 bits in product) 26
+    double p2 = ah.lo * bh.hi; // exact    (52 bits in product) 26
+    double p3 = ah.lo * bh.lo; // exact    (52 bits in product) 52
+    double p4 = al.hi * bh.hi; // exact    (52 bits in product) 52
+    double p5 = al.hi * bh.lo; // exact    (52 bits in product) 78
+    double p6 = al.lo * bh.hi; // inexact  (54 bits in product) 78
+    double p7 = al.lo * bh.lo; // inexact  (54 bits in product) 104
+    double p8 = ah.hi * bl.hi; // exact    (52 bits in product) 52
+    double p9 = ah.hi * bl.lo; // inexact  (54 bits in product) 78
+    double pA = ah.lo * bl.hi; // exact    (52 bits in product) 78
+    double pB = ah.lo * bl.lo; // inexact  (54 bits in product) 104
+    double pC = al.hi * bl.hi; // exact    (52 bits in product) 104
     // the last 3 terms are two low to appear in the result
 
 
@@ -2169,46 +2513,60 @@ static inline double_double mul_dd( double_double a, double_double b )
 
     return result;
 #else
-    // take advantage of the known relative magnitudes of the partial products to avoid some sorting
-    // Combine 2**-78 and 2**-104 terms. Here we are a bit sloppy about canonicalizing the double_doubles
+    // Take advantage of the known relative magnitudes of the partial products
+    // to avoid some sorting. Combine 2**-78 and 2**-104 terms. Here we are a
+    // bit sloppy about canonicalizing the double_doubles.
     double_double t0 = { pA, pC };
     double_double t1 = { p9, pB };
     double_double t2 = { p6, p7 };
     double temp0, temp1, temp2;
 
-    t0 = accum_d( t0, p5 );  // there is an extra 2**-78 term to deal with
+    t0 = accum_d(t0, p5); // there is an extra 2**-78 term to deal with
 
-    // Add in 2**-52 terms. Here we are a bit sloppy about canonicalizing the double_doubles
-    temp0 = t0.hi;      temp1 = t1.hi;      temp2 = t2.hi;
-    t0.hi += p3;        t1.hi += p4;        t2.hi += p8;
-    temp0 -= t0.hi-p3;  temp1 -= t1.hi-p4;  temp2 -= t2.hi - p8;
-    t0.lo += temp0;     t1.lo += temp1;     t2.lo += temp2;
+    // Add in 2**-52 terms. Here we are a bit sloppy about canonicalizing the
+    // double_doubles
+    temp0 = t0.hi;
+    temp1 = t1.hi;
+    temp2 = t2.hi;
+    t0.hi += p3;
+    t1.hi += p4;
+    t2.hi += p8;
+    temp0 -= t0.hi - p3;
+    temp1 -= t1.hi - p4;
+    temp2 -= t2.hi - p8;
+    t0.lo += temp0;
+    t1.lo += temp1;
+    t2.lo += temp2;
 
-    // Add in 2**-26 terms. Here we are a bit sloppy about canonicalizing the double_doubles
-    temp1 = t1.hi;      temp2 = t2.hi;
-    t1.hi += p1;        t2.hi += p2;
-    temp1 -= t1.hi-p1;  temp2 -= t2.hi - p2;
-    t1.lo += temp1;     t2.lo += temp2;
+    // Add in 2**-26 terms. Here we are a bit sloppy about canonicalizing the
+    // double_doubles
+    temp1 = t1.hi;
+    temp2 = t2.hi;
+    t1.hi += p1;
+    t2.hi += p2;
+    temp1 -= t1.hi - p1;
+    temp2 -= t2.hi - p2;
+    t1.lo += temp1;
+    t2.lo += temp2;
 
     // Combine accumulators to get the low bits of result
-    t1 = add_dd( t1, add_dd( t2, t0 ) );
+    t1 = add_dd(t1, add_dd(t2, t0));
 
     // Add in MSB's, and round to precision
-    return accum_d( t1, p0 );  // canonicalizes
+    return accum_d(t1, p0); // canonicalizes
 #endif
-
 }
 
 
-long double reference_exp10l( long double z )
+long double reference_exp10l(long double z)
 {
-    const double_double log2_10 = { HEX_DBL( +, 1, a934f0979a371, +, 1 ), HEX_DBL( +, 1, 7f2495fb7fa6d, -, 53 ) };
+    const double_double log2_10 = { HEX_DBL(+, 1, a934f0979a371, +, 1),
+                                    HEX_DBL(+, 1, 7f2495fb7fa6d, -, 53) };
     double_double x;
     int j;
 
     // Handle NaNs
-    if( isnan(z) )
-        return z;
+    if (isnan(z)) return z;
 
     // init x
     x.hi = z;
@@ -2217,172 +2575,195 @@ long double reference_exp10l( long double z )
 
     // 10**x = exp2( x * log2(10) )
 
-    x = mul_dd( x, log2_10);    // x * log2(10)
+    x = mul_dd(x, log2_10); // x * log2(10)
 
-    //Deal with overflow and underflow for exp2(x) stage next
-    if( x.hi >= 1025 )
-        return INFINITY;
+    // Deal with overflow and underflow for exp2(x) stage next
+    if (x.hi >= 1025) return INFINITY;
 
-    if( x.hi < -1075-24 )
-        return +0.0;
+    if (x.hi < -1075 - 24) return +0.0;
 
     // find nearest integer to x
-    int i = (int) rint(x.hi);
+    int i = (int)rint(x.hi);
 
     // x now holds fractional part.  The result would be then 2**i  * exp2( x )
     x.hi -= i;
 
-    // We could attempt to find a minimax polynomial for exp2(x) over the range x = [-0.5, 0.5].
-    // However, this would converge very slowly near the extrema, where 0.5**n is not a lot different
-    // from 0.5**(n+1), thereby requiring something like a 20th order polynomial to get 53 + 24 bits
-    // of precision. Instead we further reduce the range to [-1/32, 1/32] by observing that
+    // We could attempt to find a minimax polynomial for exp2(x) over the range
+    // x = [-0.5, 0.5]. However, this would converge very slowly near the
+    // extrema, where 0.5**n is not a lot different from 0.5**(n+1), thereby
+    // requiring something like a 20th order polynomial to get 53 + 24 bits of
+    // precision. Instead we further reduce the range to [-1/32, 1/32] by
+    // observing that
     //
     //  2**(a+b) = 2**a * 2**b
     //
-    // We can thus build a table of 2**a values for a = n/16, n = [-8, 8], and reduce the range
-    // of x to [-1/32, 1/32] by subtracting away the nearest value of n/16 from x.
-    const double_double corrections[17] =
-    {
-        { HEX_DBL( +, 1, 6a09e667f3bcd, -, 1 ), HEX_DBL( -, 1, bdd3413b26456, -, 55 ) },
-        { HEX_DBL( +, 1, 7a11473eb0187, -, 1 ), HEX_DBL( -, 1, 41577ee04992f, -, 56 ) },
-        { HEX_DBL( +, 1, 8ace5422aa0db, -, 1 ), HEX_DBL( +, 1, 6e9f156864b27, -, 55 ) },
-        { HEX_DBL( +, 1, 9c49182a3f09,  -, 1 ), HEX_DBL( +, 1, c7c46b071f2be, -, 57 ) },
-        { HEX_DBL( +, 1, ae89f995ad3ad, -, 1 ), HEX_DBL( +, 1, 7a1cd345dcc81, -, 55 ) },
-        { HEX_DBL( +, 1, c199bdd85529c, -, 1 ), HEX_DBL( +, 1, 11065895048dd, -, 56 ) },
-        { HEX_DBL( +, 1, d5818dcfba487, -, 1 ), HEX_DBL( +, 1, 2ed02d75b3707, -, 56 ) },
-        { HEX_DBL( +, 1, ea4afa2a490da, -, 1 ), HEX_DBL( -, 1, e9c23179c2893, -, 55 ) },
-        { HEX_DBL( +, 1, 0,             +, 0 ), HEX_DBL( +, 0, 0,             +,  0 ) },
-        { HEX_DBL( +, 1, 0b5586cf9890f, +, 0 ), HEX_DBL( +, 1, 8a62e4adc610b, -, 54 ) },
-        { HEX_DBL( +, 1, 172b83c7d517b, +, 0 ), HEX_DBL( -, 1, 19041b9d78a76, -, 55 ) },
-        { HEX_DBL( +, 1, 2387a6e756238, +, 0 ), HEX_DBL( +, 1, 9b07eb6c70573, -, 54 ) },
-        { HEX_DBL( +, 1, 306fe0a31b715, +, 0 ), HEX_DBL( +, 1, 6f46ad23182e4, -, 55 ) },
-        { HEX_DBL( +, 1, 3dea64c123422, +, 0 ), HEX_DBL( +, 1, ada0911f09ebc, -, 55 ) },
-        { HEX_DBL( +, 1, 4bfdad5362a27, +, 0 ), HEX_DBL( +, 1, d4397afec42e2, -, 56 ) },
-        { HEX_DBL( +, 1, 5ab07dd485429, +, 0 ), HEX_DBL( +, 1, 6324c054647ad, -, 54 ) },
-        { HEX_DBL( +, 1, 6a09e667f3bcd, +, 0 ), HEX_DBL( -, 1, bdd3413b26456, -, 54 ) }
+    // We can thus build a table of 2**a values for a = n/16, n = [-8, 8], and
+    // reduce the range of x to [-1/32, 1/32] by subtracting away the nearest
+    // value of n/16 from x.
+    const double_double corrections[17] = {
+        { HEX_DBL(+, 1, 6a09e667f3bcd, -, 1),
+          HEX_DBL(-, 1, bdd3413b26456, -, 55) },
+        { HEX_DBL(+, 1, 7a11473eb0187, -, 1),
+          HEX_DBL(-, 1, 41577ee04992f, -, 56) },
+        { HEX_DBL(+, 1, 8ace5422aa0db, -, 1),
+          HEX_DBL(+, 1, 6e9f156864b27, -, 55) },
+        { HEX_DBL(+, 1, 9c49182a3f09, -, 1),
+          HEX_DBL(+, 1, c7c46b071f2be, -, 57) },
+        { HEX_DBL(+, 1, ae89f995ad3ad, -, 1),
+          HEX_DBL(+, 1, 7a1cd345dcc81, -, 55) },
+        { HEX_DBL(+, 1, c199bdd85529c, -, 1),
+          HEX_DBL(+, 1, 11065895048dd, -, 56) },
+        { HEX_DBL(+, 1, d5818dcfba487, -, 1),
+          HEX_DBL(+, 1, 2ed02d75b3707, -, 56) },
+        { HEX_DBL(+, 1, ea4afa2a490da, -, 1),
+          HEX_DBL(-, 1, e9c23179c2893, -, 55) },
+        { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
+        { HEX_DBL(+, 1, 0b5586cf9890f, +, 0),
+          HEX_DBL(+, 1, 8a62e4adc610b, -, 54) },
+        { HEX_DBL(+, 1, 172b83c7d517b, +, 0),
+          HEX_DBL(-, 1, 19041b9d78a76, -, 55) },
+        { HEX_DBL(+, 1, 2387a6e756238, +, 0),
+          HEX_DBL(+, 1, 9b07eb6c70573, -, 54) },
+        { HEX_DBL(+, 1, 306fe0a31b715, +, 0),
+          HEX_DBL(+, 1, 6f46ad23182e4, -, 55) },
+        { HEX_DBL(+, 1, 3dea64c123422, +, 0),
+          HEX_DBL(+, 1, ada0911f09ebc, -, 55) },
+        { HEX_DBL(+, 1, 4bfdad5362a27, +, 0),
+          HEX_DBL(+, 1, d4397afec42e2, -, 56) },
+        { HEX_DBL(+, 1, 5ab07dd485429, +, 0),
+          HEX_DBL(+, 1, 6324c054647ad, -, 54) },
+        { HEX_DBL(+, 1, 6a09e667f3bcd, +, 0),
+          HEX_DBL(-, 1, bdd3413b26456, -, 54) }
 
     };
-    int index = (int) rint( x.hi * 16.0 );
-    x.hi -= (double) index * 0.0625;
+    int index = (int)rint(x.hi * 16.0);
+    x.hi -= (double)index * 0.0625;
 
     // canonicalize x
     double temp = x.hi;
     x.hi += x.lo;
     x.lo -= x.hi - temp;
 
-    // Minimax polynomial for (exp2(x)-1)/x, over the range [-1/32, 1/32].  Max Error: 2 * 0x1.e112p-87
-    const double_double c[] = {
-        {HEX_DBL( +, 1, 62e42fefa39ef, -,  1 ), HEX_DBL( +, 1, abc9e3ac1d244, -, 56 )},
-        {HEX_DBL( +, 1, ebfbdff82c58f, -,  3 ), HEX_DBL( -, 1, 5e4987a631846, -, 57 )},
-        {HEX_DBL( +, 1, c6b08d704a0c,  -,  5 ), HEX_DBL( -, 1, d323200a05713, -, 59 )},
-        {HEX_DBL( +, 1, 3b2ab6fba4e7a, -,  7 ), HEX_DBL( +, 1, c5ee8f8b9f0c1, -, 63 )},
-        {HEX_DBL( +, 1, 5d87fe78a672a, -, 10 ), HEX_DBL( +, 1, 884e5e5cc7ecc, -, 64 )},
-        {HEX_DBL( +, 1, 430912f7e8373, -, 13 ), HEX_DBL( +, 1, 4f1b59514a326, -, 67 )},
-        {HEX_DBL( +, 1, ffcbfc5985e71, -, 17 ), HEX_DBL( -, 1, db7d6a0953b78, -, 71 )},
-        {HEX_DBL( +, 1, 62c150eb16465, -, 20 ), HEX_DBL( +, 1, e0767c2d7abf5, -, 80 )},
-        {HEX_DBL( +, 1, b52502b5e953,  -, 24 ), HEX_DBL( +, 1, 6797523f944bc, -, 78 )}
-    };
-    size_t count = sizeof( c ) / sizeof( c[0] );
+    // Minimax polynomial for (exp2(x)-1)/x, over the range [-1/32, 1/32].  Max
+    // Error: 2 * 0x1.e112p-87
+    const double_double c[] = { { HEX_DBL(+, 1, 62e42fefa39ef, -, 1),
+                                  HEX_DBL(+, 1, abc9e3ac1d244, -, 56) },
+                                { HEX_DBL(+, 1, ebfbdff82c58f, -, 3),
+                                  HEX_DBL(-, 1, 5e4987a631846, -, 57) },
+                                { HEX_DBL(+, 1, c6b08d704a0c, -, 5),
+                                  HEX_DBL(-, 1, d323200a05713, -, 59) },
+                                { HEX_DBL(+, 1, 3b2ab6fba4e7a, -, 7),
+                                  HEX_DBL(+, 1, c5ee8f8b9f0c1, -, 63) },
+                                { HEX_DBL(+, 1, 5d87fe78a672a, -, 10),
+                                  HEX_DBL(+, 1, 884e5e5cc7ecc, -, 64) },
+                                { HEX_DBL(+, 1, 430912f7e8373, -, 13),
+                                  HEX_DBL(+, 1, 4f1b59514a326, -, 67) },
+                                { HEX_DBL(+, 1, ffcbfc5985e71, -, 17),
+                                  HEX_DBL(-, 1, db7d6a0953b78, -, 71) },
+                                { HEX_DBL(+, 1, 62c150eb16465, -, 20),
+                                  HEX_DBL(+, 1, e0767c2d7abf5, -, 80) },
+                                { HEX_DBL(+, 1, b52502b5e953, -, 24),
+                                  HEX_DBL(+, 1, 6797523f944bc, -, 78) } };
+    size_t count = sizeof(c) / sizeof(c[0]);
 
     // Do polynomial
-    double_double r = c[count-1];
-    for( j = (int) count-2; j >= 0; j-- )
-        r = add_dd( c[j], mul_dd( r, x ) );
+    double_double r = c[count - 1];
+    for (j = (int)count - 2; j >= 0; j--) r = add_dd(c[j], mul_dd(r, x));
 
     // unwind approximation
-    r = mul_dd( r, x );     // before: r =(exp2(x)-1)/x;   after: r = exp2(x) - 1
+    r = mul_dd(r, x); // before: r =(exp2(x)-1)/x;   after: r = exp2(x) - 1
 
     // correct for [-0.5, 0.5] -> [-1/32, 1/32] reduction above
     //  exp2(x) = (r + 1) * correction = r * correction + correction
-    r = mul_dd( r, corrections[index+8] );
-    r = add_dd( r, corrections[index+8] );
+    r = mul_dd(r, corrections[index + 8]);
+    r = add_dd(r, corrections[index + 8]);
 
-// Format result for output:
+    // Format result for output:
 
     // Get mantissa
-    long double m = ((long double) r.hi + (long double) r.lo );
+    long double m = ((long double)r.hi + (long double)r.lo);
 
     // Handle a pesky overflow cases when long double = double
-    if( i > 512 )
+    if (i > 512)
     {
-        m *=  HEX_DBL( +, 1, 0, +, 512 );
+        m *= HEX_DBL(+, 1, 0, +, 512);
         i -= 512;
     }
-    else if( i < -512 )
+    else if (i < -512)
     {
-        m *= HEX_DBL( +, 1, 0, -, 512 );
+        m *= HEX_DBL(+, 1, 0, -, 512);
         i += 512;
     }
 
-    return m * ldexpl( 1.0L, i );
+    return m * ldexpl(1.0L, i);
 }
 
 
-static double fallback_frexp( double x, int *iptr )
+static double fallback_frexp(double x, int *iptr)
 {
     cl_ulong u, v;
     double fu, fv;
 
-    memcpy( &u, &x, sizeof(u));
+    memcpy(&u, &x, sizeof(u));
 
-    cl_ulong exponent = u &  0x7ff0000000000000ULL;
+    cl_ulong exponent = u & 0x7ff0000000000000ULL;
     cl_ulong mantissa = u & ~0x7ff0000000000000ULL;
 
     // add 1 to the exponent
     exponent += 0x0010000000000000ULL;
 
-    if( (cl_long) exponent < (cl_long) 0x0020000000000000LL )
+    if ((cl_long)exponent < (cl_long)0x0020000000000000LL)
     { // subnormal, NaN, Inf
         mantissa |= 0x3fe0000000000000ULL;
 
         v = mantissa & 0xfff0000000000000ULL;
         u = mantissa;
-        memcpy( &fv, &v, sizeof(v));
-        memcpy( &fu, &u, sizeof(u));
+        memcpy(&fv, &v, sizeof(v));
+        memcpy(&fu, &u, sizeof(u));
 
         fu -= fv;
 
-        memcpy( &v, &fv, sizeof(v));
-        memcpy( &u, &fu, sizeof(u));
+        memcpy(&v, &fv, sizeof(v));
+        memcpy(&u, &fu, sizeof(u));
 
-        exponent = u &  0x7ff0000000000000ULL;
+        exponent = u & 0x7ff0000000000000ULL;
         mantissa = u & ~0x7ff0000000000000ULL;
 
-        *iptr = (exponent >> 52) + (-1022 + 1 -1022);
+        *iptr = (exponent >> 52) + (-1022 + 1 - 1022);
         u = mantissa | 0x3fe0000000000000ULL;
-        memcpy( &fu, &u, sizeof(u));
+        memcpy(&fu, &u, sizeof(u));
         return fu;
     }
 
     *iptr = (exponent >> 52) - 1023;
     u = mantissa | 0x3fe0000000000000ULL;
-    memcpy( &fu, &u, sizeof(u));
+    memcpy(&fu, &u, sizeof(u));
     return fu;
 }
 
 // Assumes zeros, infinities and NaNs handed elsewhere
-static inline int extract( double x, cl_ulong *mant );
-static inline int extract( double x, cl_ulong *mant )
+static inline int extract(double x, cl_ulong *mant);
+static inline int extract(double x, cl_ulong *mant)
 {
-    static double (*frexpp)(double, int*) = NULL;
+    static double (*frexpp)(double, int *) = NULL;
     int e;
 
     // verify that frexp works properly
-    if( NULL == frexpp )
+    if (NULL == frexpp)
     {
-        if( 0.5 == frexp( HEX_DBL( +, 1, 0, -, 1030 ), &e ) && e == -1029 )
+        if (0.5 == frexp(HEX_DBL(+, 1, 0, -, 1030), &e) && e == -1029)
             frexpp = frexp;
         else
             frexpp = fallback_frexp;
     }
 
-    *mant = (cl_ulong) (HEX_DBL( +, 1, 0, +, 64 ) * fabs( frexpp( x, &e )));
+    *mant = (cl_ulong)(HEX_DBL(+, 1, 0, +, 64) * fabs(frexpp(x, &e)));
     return e - 1;
 }
 
 // Return 128-bit product of a*b  as (hi << 64) + lo
-static inline void mul128( cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo );
-static inline void mul128( cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo )
+static inline void mul128(cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo);
+static inline void mul128(cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo)
 {
     cl_ulong alo = a & 0xffffffffULL;
     cl_ulong ahi = a >> 32;
@@ -2393,16 +2774,22 @@ static inline void mul128( cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo )
     cl_ulong ahiblo = ahi * blo;
     cl_ulong ahibhi = ahi * bhi;
 
-    alobhi += (aloblo >> 32) + (ahiblo & 0xffffffffULL);  // cannot overflow: (2^32-1)^2 + 2 * (2^32-1)   = (2^64 - 2^33 + 1) + (2^33 - 2) = 2^64 - 1
-    *hi = ahibhi + (alobhi >> 32) + (ahiblo >> 32);       // cannot overflow: (2^32-1)^2 + 2 * (2^32-1)   = (2^64 - 2^33 + 1) + (2^33 - 2) = 2^64 - 1
+    alobhi += (aloblo >> 32)
+        + (ahiblo
+           & 0xffffffffULL); // cannot overflow: (2^32-1)^2 + 2 * (2^32-1)   =
+                             // (2^64 - 2^33 + 1) + (2^33 - 2) = 2^64 - 1
+    *hi = ahibhi + (alobhi >> 32)
+        + (ahiblo >> 32); // cannot overflow: (2^32-1)^2 + 2 * (2^32-1)   =
+                          // (2^64 - 2^33 + 1) + (2^33 - 2) = 2^64 - 1
     *lo = (aloblo & 0xffffffffULL) | (alobhi << 32);
 }
 
 // Move the most significant non-zero bit to the MSB
-// Note: not general. Only works if the most significant non-zero bit is at MSB-1
-static inline void renormalize( cl_ulong *hi, cl_ulong *lo, int *exponent )
+// Note: not general. Only works if the most significant non-zero bit is at
+// MSB-1
+static inline void renormalize(cl_ulong *hi, cl_ulong *lo, int *exponent)
 {
-    if( 0 == (0x8000000000000000ULL & *hi ))
+    if (0 == (0x8000000000000000ULL & *hi))
     {
         *hi <<= 1;
         *hi |= *lo >> 63;
@@ -2411,74 +2798,84 @@ static inline void renormalize( cl_ulong *hi, cl_ulong *lo, int *exponent )
     }
 }
 
-static double round_to_nearest_even_double( cl_ulong hi, cl_ulong lo, int exponent );
-static double round_to_nearest_even_double( cl_ulong hi, cl_ulong lo, int exponent )
+static double round_to_nearest_even_double(cl_ulong hi, cl_ulong lo,
+                                           int exponent);
+static double round_to_nearest_even_double(cl_ulong hi, cl_ulong lo,
+                                           int exponent)
 {
-    union{ cl_ulong u; cl_double d;} u;
+    union {
+        cl_ulong u;
+        cl_double d;
+    } u;
 
     // edges
-    if( exponent > 1023 )        return INFINITY;
-    if( exponent == -1075 && (hi | (lo!=0)) > 0x8000000000000000ULL )
-        return HEX_DBL( +, 1, 0, -, 1074 );
-    if( exponent <= -1075 )       return 0.0;
+    if (exponent > 1023) return INFINITY;
+    if (exponent == -1075 && (hi | (lo != 0)) > 0x8000000000000000ULL)
+        return HEX_DBL(+, 1, 0, -, 1074);
+    if (exponent <= -1075) return 0.0;
 
-    //Figure out which bits go where
+    // Figure out which bits go where
     int shift = 11;
-    if( exponent < -1022 )
+    if (exponent < -1022)
     {
-        shift -= 1022 + exponent;               // subnormal: shift is not 52
-        exponent = -1023;                       //              set exponent to 0
+        shift -= 1022 + exponent; // subnormal: shift is not 52
+        exponent = -1023; // subnormal: biased exponent field becomes 0
     }
     else
-        hi &= 0x7fffffffffffffffULL;           // normal: leading bit is implicit. Remove it.
+        hi &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove
+                                     // it.
 
     // Assemble the double (round toward zero)
-    u.u = (hi >> shift) | ((cl_ulong) (exponent + 1023) << 52);
+    u.u = (hi >> shift) | ((cl_ulong)(exponent + 1023) << 52);
 
     // put a representation of the residual bits into hi
-    hi <<= (64-shift);
+    hi <<= (64 - shift);
     hi |= lo >> shift;
-    lo <<= (64-shift );
+    lo <<= (64 - shift);
     hi |= lo != 0;
 
-    //round to nearest, ties to even
-    if( hi < 0x8000000000000000ULL )    return u.d;
-    if( hi == 0x8000000000000000ULL )   u.u += u.u & 1ULL;
-    else                                u.u++;
+    // round to nearest, ties to even
+    if (hi < 0x8000000000000000ULL) return u.d;
+    if (hi == 0x8000000000000000ULL)
+        u.u += u.u & 1ULL;
+    else
+        u.u++;
 
     return u.d;
 }
 
-// Shift right.  Bits lost on the right will be OR'd together and OR'd with the LSB
-static inline void shift_right_sticky_128( cl_ulong *hi, cl_ulong *lo, int shift );
-static inline void shift_right_sticky_128( cl_ulong *hi, cl_ulong *lo, int shift )
+// Shift right.  Bits lost on the right will be OR'd together and OR'd with the
+// LSB
+static inline void shift_right_sticky_128(cl_ulong *hi, cl_ulong *lo,
+                                          int shift);
+static inline void shift_right_sticky_128(cl_ulong *hi, cl_ulong *lo, int shift)
 {
     cl_ulong sticky = 0;
     cl_ulong h = *hi;
     cl_ulong l = *lo;
 
-    if( shift >= 64 )
+    if (shift >= 64)
     {
         shift -= 64;
         sticky = 0 != lo;
         l = h;
         h = 0;
-        if( shift >= 64 )
+        if (shift >= 64)
         {
             sticky |= (0 != l);
             l = 0;
         }
         else
         {
-            sticky |= (0 != (l << (64-shift)));
+            sticky |= (0 != (l << (64 - shift)));
             l >>= shift;
         }
     }
     else
     {
-        sticky |= (0 != (l << (64-shift)));
+        sticky |= (0 != (l << (64 - shift)));
         l >>= shift;
-        l |=  h << (64-shift);
+        l |= h << (64 - shift);
         h >>= shift;
     }
 
@@ -2487,9 +2884,12 @@ static inline void shift_right_sticky_128( cl_ulong *hi, cl_ulong *lo, int shift
 }
 
 // 128-bit add  of ((*hi << 64) + *lo) + ((chi << 64) + clo)
-// If the 129 bit result doesn't fit, bits lost off the right end will be OR'd with the LSB
-static inline void add128( cl_ulong *hi, cl_ulong *lo, cl_ulong chi, cl_ulong clo, int *exp );
-static inline void add128( cl_ulong *hi, cl_ulong *lo, cl_ulong chi, cl_ulong clo, int *exponent )
+// If the 129 bit result doesn't fit, bits lost off the right end will be OR'd
+// with the LSB
+static inline void add128(cl_ulong *hi, cl_ulong *lo, cl_ulong chi,
+                          cl_ulong clo, int *exp);
+static inline void add128(cl_ulong *hi, cl_ulong *lo, cl_ulong chi,
+                          cl_ulong clo, int *exponent)
 {
     cl_ulong carry, carry2;
     // extended precision add
@@ -2497,15 +2897,16 @@ static inline void add128( cl_ulong *hi, cl_ulong *lo, cl_ulong chi, cl_ulong cl
     chi = add_carry(*hi, chi, &carry2);
     chi = add_carry(chi, carry, &carry);
 
-    //If we overflowed the 128 bit result
-    if( carry || carry2 )
+    // If we overflowed the 128 bit result
+    if (carry || carry2)
     {
-        carry = clo & 1;                        // set aside low bit
-        clo >>= 1;                              // right shift low 1
-        clo |= carry;                           // or back in the low bit, so we don't come to believe this is an exact half way case for rounding
-        clo |= chi << 63;                       // move lowest high bit into highest bit of lo
-        chi >>= 1;                              // right shift hi
-        chi |= 0x8000000000000000ULL;           // move the carry bit into hi.
+        carry = clo & 1; // set aside low bit
+        clo >>= 1; // right shift low 1
+        clo |= carry; // or back in the low bit, so we don't come to believe
+                      // this is an exact half way case for rounding
+        clo |= chi << 63; // move lowest high bit into highest bit of lo
+        chi >>= 1; // right shift hi
+        chi |= 0x8000000000000000ULL; // move the carry bit into hi.
         *exponent = *exponent + 1;
     }
 
@@ -2514,48 +2915,51 @@ static inline void add128( cl_ulong *hi, cl_ulong *lo, cl_ulong chi, cl_ulong cl
 }
 
 // 128-bit subtract  of ((chi << 64) + clo)  - ((*hi << 64) + *lo)
-static inline void sub128( cl_ulong *chi, cl_ulong *clo, cl_ulong hi, cl_ulong lo, cl_ulong *signC, int *expC );
-static inline void sub128( cl_ulong *chi, cl_ulong *clo, cl_ulong hi, cl_ulong lo, cl_ulong *signC, int *expC )
+static inline void sub128(cl_ulong *chi, cl_ulong *clo, cl_ulong hi,
+                          cl_ulong lo, cl_ulong *signC, int *expC);
+static inline void sub128(cl_ulong *chi, cl_ulong *clo, cl_ulong hi,
+                          cl_ulong lo, cl_ulong *signC, int *expC)
 {
     cl_ulong rHi = *chi;
     cl_ulong rLo = *clo;
     cl_ulong carry, carry2;
 
-    //extended precision subtract
+    // extended precision subtract
     rLo = sub_carry(rLo, lo, &carry);
     rHi = sub_carry(rHi, hi, &carry2);
     rHi = sub_carry(rHi, carry, &carry);
 
     // Check for sign flip
-    if( carry || carry2 )
+    if (carry || carry2)
     {
         *signC ^= 0x8000000000000000ULL;
 
-        //negate rLo, rHi:   -x = (x ^ -1) + 1
+        // negate rLo, rHi:   -x = (x ^ -1) + 1
         rLo ^= -1ULL;
         rHi ^= -1ULL;
         rLo++;
         rHi += 0 == rLo;
     }
 
-    // normalize -- move the most significant non-zero bit to the MSB, and adjust exponent accordingly
-    if( rHi == 0 )
+    // normalize -- move the most significant non-zero bit to the MSB, and
+    // adjust exponent accordingly
+    if (rHi == 0)
     {
         rHi = rLo;
         *expC = *expC - 64;
         rLo = 0;
     }
 
-    if( rHi )
+    if (rHi)
     {
         int shift = 32;
         cl_ulong test = 1ULL << 32;
-        while( 0 == (rHi & 0x8000000000000000ULL))
+        while (0 == (rHi & 0x8000000000000000ULL))
         {
-            if( rHi < test )
+            if (rHi < test)
             {
                 rHi <<= shift;
-                rHi |= rLo >> (64-shift);
+                rHi |= rLo >> (64 - shift);
                 rLo <<= shift;
                 *expC = *expC - shift;
             }
@@ -2565,7 +2969,7 @@ static inline void sub128( cl_ulong *chi, cl_ulong *clo, cl_ulong hi, cl_ulong l
     }
     else
     {
-        //zero
+        // zero
         *expC = INT_MIN;
         *signC = 0;
     }
@@ -2575,7 +2979,7 @@ static inline void sub128( cl_ulong *chi, cl_ulong *clo, cl_ulong hi, cl_ulong l
     *clo = rLo;
 }
 
-long double reference_fmal( long double x, long double y, long double z)
+long double reference_fmal(long double x, long double y, long double z)
 {
     static const cl_ulong kMSB = 0x8000000000000000ULL;
 
@@ -2585,75 +2989,91 @@ long double reference_fmal( long double x, long double y, long double z)
     double c = z;
 
     // Make bits accessible
-    union{ cl_ulong u; cl_double d; } ua; ua.d = a;
-    union{ cl_ulong u; cl_double d; } ub; ub.d = b;
-    union{ cl_ulong u; cl_double d; } uc; uc.d = c;
+    union {
+        cl_ulong u;
+        cl_double d;
+    } ua;
+    ua.d = a;
+    union {
+        cl_ulong u;
+        cl_double d;
+    } ub;
+    ub.d = b;
+    union {
+        cl_ulong u;
+        cl_double d;
+    } uc;
+    uc.d = c;
 
     // deal with Nans, infinities and zeros
-    if( isnan( a ) || isnan( b ) || isnan(c)    ||
-        isinf( a ) || isinf( b ) || isinf(c)    ||
-        0 == ( ua.u & ~kMSB)                ||  // a == 0, defeat host FTZ behavior
-        0 == ( ub.u & ~kMSB)                ||  // b == 0, defeat host FTZ behavior
-        0 == ( uc.u & ~kMSB)                )   // c == 0, defeat host FTZ behavior
+    if (isnan(a) || isnan(b) || isnan(c) || isinf(a) || isinf(b) || isinf(c)
+        || 0 == (ua.u & ~kMSB) || // a == 0, defeat host FTZ behavior
+        0 == (ub.u & ~kMSB) || // b == 0, defeat host FTZ behavior
+        0 == (uc.u & ~kMSB)) // c == 0, defeat host FTZ behavior
     {
-        if( isinf( c ) && !isinf(a) && !isinf(b) )
-            return (c + a) + b;
+        if (isinf(c) && !isinf(a) && !isinf(b)) return (c + a) + b;
 
-        a = (double) reference_multiplyl( a, b );   // some risk that the compiler will insert a non-compliant fma here on some platforms.
-        return reference_addl(a, c);                // We use STDC FP_CONTRACT OFF above to attempt to defeat that.
+        a = (double)reference_multiplyl(
+            a, b); // some risk that the compiler will insert a non-compliant
+                   // fma here on some platforms.
+        return reference_addl(
+            a,
+            c); // We use STDC FP_CONTRACT OFF above to attempt to defeat that.
     }
 
     // extract exponent and mantissa
     //   exponent is a standard unbiased signed integer
     //   mantissa is a cl_uint, with leading non-zero bit positioned at the MSB
     cl_ulong mantA, mantB, mantC;
-    int expA = extract( a, &mantA );
-    int expB = extract( b, &mantB );
-    int expC = extract( c, &mantC );
-    cl_ulong signC = uc.u & kMSB;               // We'll need the sign bit of C later to decide if we are adding or subtracting
+    int expA = extract(a, &mantA);
+    int expB = extract(b, &mantB);
+    int expC = extract(c, &mantC);
+    cl_ulong signC = uc.u & kMSB; // We'll need the sign bit of C later to
+                                  // decide if we are adding or subtracting
 
-// exact product of A and B
+    // exact product of A and B
     int exponent = expA + expB;
     cl_ulong sign = (ua.u ^ ub.u) & kMSB;
     cl_ulong hi, lo;
-    mul128( mantA, mantB, &hi, &lo );
+    mul128(mantA, mantB, &hi, &lo);
 
     // renormalize
-    if( 0 == (kMSB & hi) )
+    if (0 == (kMSB & hi))
     {
         hi <<= 1;
         hi |= lo >> 63;
         lo <<= 1;
     }
     else
-        exponent++;         // 2**63 * 2**63 gives 2**126. If the MSB was set, then our exponent increased.
+        exponent++; // 2**63 * 2**63 gives 2**126. If the MSB was set, then our
+                    // exponent increased.
 
-//infinite precision add
+    // infinite precision add
     cl_ulong chi = mantC;
     cl_ulong clo = 0;
 
-    if( exponent >= expC )
+    if (exponent >= expC)
     {
         // Normalize C relative to the product
-        if( exponent > expC )
-            shift_right_sticky_128( &chi, &clo, exponent - expC );
+        if (exponent > expC)
+            shift_right_sticky_128(&chi, &clo, exponent - expC);
 
         // Add
-        if( sign ^ signC )
-            sub128( &hi, &lo, chi, clo, &sign, &exponent );
+        if (sign ^ signC)
+            sub128(&hi, &lo, chi, clo, &sign, &exponent);
         else
-            add128( &hi, &lo, chi, clo, &exponent );
+            add128(&hi, &lo, chi, clo, &exponent);
     }
     else
     {
         // Shift the product relative to C so that their exponents match
-        shift_right_sticky_128( &hi, &lo, expC - exponent );
+        shift_right_sticky_128(&hi, &lo, expC - exponent);
 
         // add
-        if( sign ^ signC )
-            sub128( &chi, &clo, hi, lo, &signC, &expC );
+        if (sign ^ signC)
+            sub128(&chi, &clo, hi, lo, &signC, &expC);
         else
-            add128( &chi, &clo, hi, lo, &expC );
+            add128(&chi, &clo, hi, lo, &expC);
 
         hi = chi;
         lo = clo;
@@ -2671,61 +3091,54 @@ long double reference_fmal( long double x, long double y, long double z)
 }
 
 
-
-
-long double reference_madl( long double a, long double b, long double c) { return a * b + c; }
-
-//long double my_nextafterl(long double x, long double y){  return (long double) nextafter( (double) x, (double) y ); }
-
-long double reference_recipl( long double x){ return 1.0L / x; }
-
-long double reference_rootnl( long double x, int i)
+long double reference_madl(long double a, long double b, long double c)
 {
-    double hi,  lo;
+    return a * b + c;
+}
+
+// long double my_nextafterl(long double x, long double y){  return (long
+// double) nextafter( (double) x, (double) y ); }
+
+long double reference_recipl(long double x) { return 1.0L / x; }
+
+long double reference_rootnl(long double x, int i)
+{
+    double hi, lo;
     long double l;
-    //rootn ( x, 0 )  returns a NaN.
-    if( 0 == i )
-        return cl_make_nan();
+    // rootn ( x, 0 )  returns a NaN.
+    if (0 == i) return cl_make_nan();
 
-    //rootn ( x, n )  returns a NaN for x < 0 and n is even.
-    if( x < 0.0L && 0 == (i&1) )
-        return cl_make_nan();
+    // rootn ( x, n )  returns a NaN for x < 0 and even n.
+    if (x < 0.0L && 0 == (i & 1)) return cl_make_nan();
 
-    if( isinf(x) )
+    if (isinf(x))
     {
-        if( i < 0 )
-            return reference_copysignl(0.0L, x);
+        if (i < 0) return reference_copysignl(0.0L, x);
 
         return x;
     }
 
-    if( x == 0.0 )
+    if (x == 0.0)
     {
-        switch( i & 0x80000001 )
+        switch (i & 0x80000001)
         {
-            //rootn ( +-0,  n ) is +0 for even n > 0.
-            case 0:
-                return 0.0L;
+            // rootn ( +-0,  n ) is +0 for even n > 0.
+            case 0: return 0.0L;
 
-            //rootn ( +-0,  n ) is +-0 for odd n > 0.
-            case 1:
-                return x;
+            // rootn ( +-0,  n ) is +-0 for odd n > 0.
+            case 1: return x;
 
-            //rootn ( +-0,  n ) is +inf for even n < 0.
-            case 0x80000000:
-                return INFINITY;
+            // rootn ( +-0,  n ) is +inf for even n < 0.
+            case 0x80000000: return INFINITY;
 
-            //rootn ( +-0,  n ) is +-inf for odd n < 0.
-            case 0x80000001:
-                return copysign(INFINITY, x);
+            // rootn ( +-0,  n ) is +-inf for odd n < 0.
+            case 0x80000001: return copysign(INFINITY, x);
         }
     }
 
-    if( i == 1 )
-        return x;
+    if (i == 1) return x;
 
-    if( i == -1 )
-        return 1.0 / x;
+    if (i == -1) return 1.0 / x;
 
     long double sign = x;
     x = reference_fabsl(x);
@@ -2733,167 +3146,174 @@ long double reference_rootnl( long double x, int i)
     DivideDD(&iHi, &iLo, 1.0, i);
     x = reference_powl(x, iHi) * reference_powl(x, iLo);
 
-    return reference_copysignl( x, sign );
-
+    return reference_copysignl(x, sign);
 }
 
-long double reference_rsqrtl( long double x){ return 1.0L / sqrtl(x); }
-//long double reference_sincosl( long double x, long double *c ){ *c = reference_cosl(x); return reference_sinl(x); }
-long double reference_sinpil( long double x)
+long double reference_rsqrtl(long double x) { return 1.0L / sqrtl(x); }
+// long double reference_sincosl( long double x, long double *c ){ *c =
+// reference_cosl(x); return reference_sinl(x); }
+long double reference_sinpil(long double x)
 {
     double r = reduce1l(x);
 
     // reduce to [-0.5, 0.5]
-    if( r < -0.5L )
+    if (r < -0.5L)
         r = -1.0L - r;
-    else if ( r > 0.5L )
+    else if (r > 0.5L)
         r = 1.0L - r;
 
     // sinPi zeros have the same sign as x
-    if( r == 0.0L )
-        return reference_copysignl(0.0L, x);
+    if (r == 0.0L) return reference_copysignl(0.0L, x);
 
-    return reference_sinl( r * M_PIL );
+    return reference_sinl(r * M_PIL);
 }
 
-long double reference_tanpil( long double x)
+long double reference_tanpil(long double x)
 {
     // set aside the sign  (allows us to preserve sign of -0)
-    long double sign = reference_copysignl( 1.0L, x);
+    long double sign = reference_copysignl(1.0L, x);
     long double z = reference_fabsl(x);
 
     // if big and even  -- caution: only works if x only has single precision
-    if( z >= HEX_LDBL( +, 1, 0, +, 53 ) )
+    if (z >= HEX_LDBL(+, 1, 0, +, 53))
     {
-        if( z == INFINITY )
-            return x - x;       // nan
+        if (z == INFINITY) return x - x; // nan
 
-        return reference_copysignl( 0.0L, x);   // tanpi ( n ) is copysign( 0.0, n)  for even integers n.
+        return reference_copysignl(
+            0.0L, x); // tanpi ( n ) is copysign( 0.0, n)  for even integers n.
     }
 
     // reduce to the range [ -0.5, 0.5 ]
-    long double nearest = reference_rintl( z );     // round to nearest even places n + 0.5 values in the right place for us
-    int64_t i = (int64_t) nearest;          // test above against 0x1.0p53 avoids overflow here
+    long double nearest =
+        reference_rintl(z); // round to nearest even places n + 0.5 values in
+                            // the right place for us
+    int64_t i =
+        (int64_t)nearest; // test above against 0x1.0p53 avoids overflow here
     z -= nearest;
 
-    //correction for odd integer x for the right sign of zero
-    if( (i&1) && z == 0.0L )
-        sign = -sign;
+    // correction for odd integer x for the right sign of zero
+    if ((i & 1) && z == 0.0L) sign = -sign;
 
     // track changes to the sign
-    sign *= reference_copysignl(1.0L, z);       // really should just be an xor
-    z = reference_fabsl(z);                    // remove the sign again
+    sign *= reference_copysignl(1.0L, z); // really should just be an xor
+    z = reference_fabsl(z); // remove the sign again
 
     // reduce once more
-    // If we don't do this, rounding error in z * M_PI will cause us not to return infinities properly
-    if( z > 0.25L )
+    // If we don't do this, rounding error in z * M_PI will cause us not to
+    // return infinities properly
+    if (z > 0.25L)
     {
         z = 0.5L - z;
-        return sign / reference_tanl( z * M_PIL );      // use system tan to get the right result
+        return sign
+            / reference_tanl(z
+                             * M_PIL); // use system tan to get the right result
     }
 
     //
-    return sign * reference_tanl( z * M_PIL );          // use system tan to get the right result
+    return sign
+        * reference_tanl(z * M_PIL); // use system tan to get the right result
 }
 
-long double reference_pownl( long double x, int i ){ return reference_powl( x, (long double) i ); }
-
-long double reference_powrl( long double x, long double y )
+long double reference_pownl(long double x, int i)
 {
-    //powr ( x, y ) returns NaN for x < 0.
-    if( x < 0.0L )
-        return cl_make_nan();
+    return reference_powl(x, (long double)i);
+}
 
-    //powr ( x, NaN ) returns the NaN for x >= 0.
-    //powr ( NaN, y ) returns the NaN.
-    if( isnan(x) || isnan(y) )
-        return x + y;   // Note: behavior different here than for pow(1,NaN), pow(NaN, 0)
+long double reference_powrl(long double x, long double y)
+{
+    // powr ( x, y ) returns NaN for x < 0.
+    if (x < 0.0L) return cl_make_nan();
 
-    if( x == 1.0L )
+    // powr ( x, NaN ) returns the NaN for x >= 0.
+    // powr ( NaN, y ) returns the NaN.
+    if (isnan(x) || isnan(y))
+        return x + y; // Note: behavior different here than for pow(1,NaN),
+                      // pow(NaN, 0)
+
+    if (x == 1.0L)
     {
-        //powr ( +1, +-inf ) returns NaN.
-        if( reference_fabsl(y) == INFINITY )
-            return cl_make_nan();
+        // powr ( +1, +-inf ) returns NaN.
+        if (reference_fabsl(y) == INFINITY) return cl_make_nan();
 
-        //powr ( +1, y ) is 1 for finite y.    (NaN handled above)
+        // powr ( +1, y ) is 1 for finite y.    (NaN handled above)
         return 1.0L;
     }
 
-    if( y == 0.0L )
+    if (y == 0.0L)
     {
-        //powr ( +inf, +-0 ) returns NaN.
-        //powr ( +-0, +-0 ) returns NaN.
-        if( x == 0.0L || x == INFINITY )
-            return cl_make_nan();
+        // powr ( +inf, +-0 ) returns NaN.
+        // powr ( +-0, +-0 ) returns NaN.
+        if (x == 0.0L || x == INFINITY) return cl_make_nan();
 
-        //powr ( x, +-0 ) is 1 for finite x > 0.  (x <= 0, NaN, INF already handled above)
+        // powr ( x, +-0 ) is 1 for finite x > 0.  (x <= 0, NaN, INF already
+        // handled above)
         return 1.0L;
     }
 
-    if( x == 0.0L )
+    if (x == 0.0L)
     {
-        //powr ( +-0, -inf) is +inf.
-        //powr ( +-0, y ) is +inf for finite y < 0.
-        if( y < 0.0L )
-            return INFINITY;
+        // powr ( +-0, -inf) is +inf.
+        // powr ( +-0, y ) is +inf for finite y < 0.
+        if (y < 0.0L) return INFINITY;
 
-        //powr ( +-0, y ) is +0 for y > 0.    (NaN, y==0 handled above)
+        // powr ( +-0, y ) is +0 for y > 0.    (NaN, y==0 handled above)
         return 0.0L;
     }
 
-    return reference_powl( x, y );
+    return reference_powl(x, y);
 }
 
-//long double my_fdiml( long double x, long double y){ return fdim( (double) x, (double) y ); }
-long double reference_addl( long double x, long double y)
+// long double my_fdiml( long double x, long double y){ return fdim( (double) x,
+// (double) y ); }
+long double reference_addl(long double x, long double y)
 {
-    volatile double a = (double) x;
-    volatile double b = (double) y;
+    volatile double a = (double)x;
+    volatile double b = (double)y;
 
-#if defined( __SSE2__ )
+#if defined(__SSE2__)
     // defeat x87
-    __m128d va = _mm_set_sd( (double) a );
-    __m128d vb = _mm_set_sd( (double) b );
-    va = _mm_add_sd( va, vb );
-    _mm_store_sd( (double*) &a, va );
+    __m128d va = _mm_set_sd((double)a);
+    __m128d vb = _mm_set_sd((double)b);
+    va = _mm_add_sd(va, vb);
+    _mm_store_sd((double *)&a, va);
 #else
     a += b;
 #endif
-    return (long double) a;
+    return (long double)a;
 }
 
-long double reference_subtractl( long double x, long double y)
+long double reference_subtractl(long double x, long double y)
 {
-    volatile double a = (double) x;
-    volatile double b = (double) y;
+    volatile double a = (double)x;
+    volatile double b = (double)y;
 
-#if defined( __SSE2__ )
+#if defined(__SSE2__)
     // defeat x87
-    __m128d va = _mm_set_sd( (double) a );
-    __m128d vb = _mm_set_sd( (double) b );
-    va = _mm_sub_sd( va, vb );
-    _mm_store_sd( (double*) &a, va );
+    __m128d va = _mm_set_sd((double)a);
+    __m128d vb = _mm_set_sd((double)b);
+    va = _mm_sub_sd(va, vb);
+    _mm_store_sd((double *)&a, va);
 #else
     a -= b;
 #endif
-    return (long double) a;
+    return (long double)a;
 }
 
-long double reference_multiplyl( long double x, long double y)
+long double reference_multiplyl(long double x, long double y)
 {
-    volatile double a = (double) x;
-    volatile double b = (double) y;
+    volatile double a = (double)x;
+    volatile double b = (double)y;
 
-#if defined( __SSE2__ )
+#if defined(__SSE2__)
     // defeat x87
-    __m128d va = _mm_set_sd( (double) a );
-    __m128d vb = _mm_set_sd( (double) b );
-    va = _mm_mul_sd( va, vb );
-    _mm_store_sd( (double*) &a, va );
+    __m128d va = _mm_set_sd((double)a);
+    __m128d vb = _mm_set_sd((double)b);
+    va = _mm_mul_sd(va, vb);
+    _mm_store_sd((double *)&a, va);
 #else
     a *= b;
 #endif
-    return (long double) a;
+    return (long double)a;
 }
 
 /*long double my_remquol( long double x, long double y, int *iptr )
@@ -2908,22 +3328,22 @@ long double reference_multiplyl( long double x, long double y)
 
     return remquo( (double) x, (double) y, iptr );
 }*/
-long double reference_lgamma_rl( long double x, int *signp )
+long double reference_lgamma_rl(long double x, int *signp)
 {
-//    long double lgamma_val = (long double)reference_lgamma( (double)x );
-//    *signp = signgam;
+    //    long double lgamma_val = (long double)reference_lgamma( (double)x );
+    //    *signp = signgam;
     *signp = 0;
     return x;
 }
 
 
-int reference_isequall( long double x, long double y){ return x == y; }
-int reference_isfinitel( long double x){ return 0 != isfinite(x); }
-int reference_isgreaterl( long double x, long double y){ return x > y; }
-int reference_isgreaterequall( long double x, long double y){ return x >= y; }
-int reference_isinfl( long double x){ return 0 != isinf(x); }
-int reference_islessl( long double x, long double y){ return x < y; }
-int reference_islessequall( long double x, long double y){ return x <= y; }
+int reference_isequall(long double x, long double y) { return x == y; }
+int reference_isfinitel(long double x) { return 0 != isfinite(x); }
+int reference_isgreaterl(long double x, long double y) { return x > y; }
+int reference_isgreaterequall(long double x, long double y) { return x >= y; }
+int reference_isinfl(long double x) { return 0 != isinf(x); }
+int reference_islessl(long double x, long double y) { return x < y; }
+int reference_islessequall(long double x, long double y) { return x <= y; }
 #if defined(__INTEL_COMPILER)
 int reference_islessgreaterl(long double x, long double y)
 {
@@ -2935,69 +3355,77 @@ int reference_islessgreaterl(long double x, long double y)
     return 0 != islessgreater(x, y);
 }
 #endif
-int reference_isnanl( long double x){ return 0 != isnan( x ); }
-int reference_isnormall( long double x){ return 0 != isnormal( (double) x ); }
-int reference_isnotequall( long double x, long double y){ return x != y; }
-int reference_isorderedl( long double x, long double y){ return x == x && y == y; }
-int reference_isunorderedl( long double x, long double y){ return isnan(x) || isnan( y ); }
-#if defined( __INTEL_COMPILER )
-int reference_signbitl( long double x){ return 0 != signbitl( x ); }
+int reference_isnanl(long double x) { return 0 != isnan(x); }
+int reference_isnormall(long double x) { return 0 != isnormal((double)x); }
+int reference_isnotequall(long double x, long double y) { return x != y; }
+int reference_isorderedl(long double x, long double y)
+{
+    return x == x && y == y;
+}
+int reference_isunorderedl(long double x, long double y)
+{
+    return isnan(x) || isnan(y);
+}
+#if defined(__INTEL_COMPILER)
+int reference_signbitl(long double x) { return 0 != signbitl(x); }
 #else
-int reference_signbitl( long double x){ return 0 != signbit( x ); }
+int reference_signbitl(long double x) { return 0 != signbit(x); }
 #endif
-long double reference_copysignl( long double x, long double y);
-long double reference_roundl( long double x );
+long double reference_copysignl(long double x, long double y);
+long double reference_roundl(long double x);
 long double reference_cbrtl(long double x);
 
-long double reference_copysignl( long double x, long double y )
+long double reference_copysignl(long double x, long double y)
 {
-    // We hope that the long double to double conversion proceeds with sign fidelity,
-    // even for zeros and NaNs
-    union{ double d; cl_ulong u;}u; u.d = (double) y;
+    // We hope that the long double to double conversion proceeds with sign
+    // fidelity, even for zeros and NaNs
+    union {
+        double d;
+        cl_ulong u;
+    } u;
+    u.d = (double)y;
 
     x = reference_fabsl(x);
-    if( u.u >> 63 )
-        x = -x;
+    if (u.u >> 63) x = -x;
 
     return x;
 }
 
-long double reference_roundl( long double x )
+long double reference_roundl(long double x)
 {
     // Since we are just using this to verify double precision, we can
     // use the double precision copysign here
 
 #if defined(__MINGW32__) && defined(__x86_64__)
     long double absx = reference_fabsl(x);
-    if (absx < 0.5L)
-    return reference_copysignl(0.0L, x);
+    if (absx < 0.5L) return reference_copysignl(0.0L, x);
 #endif
-    return round( (double) x );
+    return round((double)x);
 }
 
-long double reference_truncl( long double x )
+long double reference_truncl(long double x)
 {
     // Since we are just using this to verify double precision, we can
     // use the double precision copysign here
-    return trunc( (double) x );
+    return trunc((double)x);
 }
 
 static long double reference_scalblnl(long double x, long n);
 
 long double reference_cbrtl(long double x)
 {
-    double yhi = HEX_DBL( +, 1, 5555555555555, -, 2 );
-    double ylo = HEX_DBL( +, 1, 558, -, 56 );
+    double yhi = HEX_DBL(+, 1, 5555555555555, -, 2);
+    double ylo = HEX_DBL(+, 1, 558, -, 56);
 
-    double fabsx = reference_fabs( x );
+    double fabsx = reference_fabs(x);
 
-    if( isnan(x) || fabsx == 1.0 || fabsx == 0.0 || isinf(x) )
-        return x;
+    if (isnan(x) || fabsx == 1.0 || fabsx == 0.0 || isinf(x)) return x;
 
     double iy = 0.0;
     double log2x_hi, log2x_lo;
 
-    // extended precision log .... accurate to at least 64-bits + couple of guard bits
+    // extended precision log .... accurate to at least 64-bits + couple of
+    // guard bits
     __log2_ep(&log2x_hi, &log2x_lo, fabsx);
 
     double ylog2x_hi, ylog2x_lo;
@@ -3009,20 +3437,24 @@ long double reference_cbrtl(long double x)
     MulDD(&ylog2x_hi, &ylog2x_lo, log2x_hi, log2x_lo, y_hi, y_lo);
 
     long double powxy;
-    if(isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200)) {
-        powxy = reference_signbit(ylog2x_hi) ? HEX_DBL( +, 0, 0, +, 0 ) : INFINITY;
-    } else {
+    if (isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200))
+    {
+        powxy =
+            reference_signbit(ylog2x_hi) ? HEX_DBL(+, 0, 0, +, 0) : INFINITY;
+    }
+    else
+    {
         // separate integer + fractional part
         long int m = lrint(ylog2x_hi);
         AddDD(&ylog2x_hi, &ylog2x_lo, ylog2x_hi, ylog2x_lo, -m, 0.0);
 
         // revert to long double arithemtic
-        long double ylog2x = (long double) ylog2x_hi + (long double) ylog2x_lo;
-        powxy = reference_exp2l( ylog2x );
+        long double ylog2x = (long double)ylog2x_hi + (long double)ylog2x_lo;
+        powxy = reference_exp2l(ylog2x);
         powxy = reference_scalblnl(powxy, m);
     }
 
-    return reference_copysignl( powxy, x );
+    return reference_copysignl(powxy, x);
 }
 
 /*
@@ -3064,24 +3496,24 @@ long double scalbnl( long double x, int i )
 }
 */
 
-long double reference_rintl( long double x )
+long double reference_rintl(long double x)
 {
 #if defined(__PPC__)
-  // On PPC, long doubles are maintained as 2 doubles. Therefore, the combined
-  // mantissa can represent more than LDBL_MANT_DIG binary digits.
-  x = rintl(x);
+    // On PPC, long doubles are maintained as 2 doubles. Therefore, the combined
+    // mantissa can represent more than LDBL_MANT_DIG binary digits.
+    x = rintl(x);
 #else
-    static long double magic[2] = { 0.0L, 0.0L};
+    static long double magic[2] = { 0.0L, 0.0L };
 
-    if( 0.0L == magic[0] )
+    if (0.0L == magic[0])
     {
         magic[0] = scalbnl(0.5L, LDBL_MANT_DIG);
         magic[1] = scalbnl(-0.5L, LDBL_MANT_DIG);
     }
 
-    if( reference_fabsl(x) < magic[0] && x != 0.0L )
+    if (reference_fabsl(x) < magic[0] && x != 0.0L)
     {
-        long double m = magic[ x < 0 ];
+        long double m = magic[x < 0];
         x += m;
         x -= m;
     }
@@ -3094,7 +3526,7 @@ long double reference_rintl( long double x )
 static void __sqrt_ep(double *rhi, double *rlo, double xhi, double xlo)
 {
     // approximate reciprocal sqrt
-    double thi = 1.0 / sqrt( xhi );
+    double thi = 1.0 / sqrt(xhi);
     double tlo = 0.0;
 
     // One newton iteration in double-double
@@ -3108,34 +3540,31 @@ static void __sqrt_ep(double *rhi, double *rlo, double xhi, double xlo)
     MulDD(rhi, rlo, yhi, ylo, xhi, xlo);
 }
 
-long double reference_acoshl( long double x )
+long double reference_acoshl(long double x)
 {
-/*
- * ====================================================
- * This function derived from fdlibm http://www.netlib.org
- * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- *
- */
-    if( isnan(x) || isinf(x))
-        return x + fabsl(x);
+    /*
+     * ====================================================
+     * This function derived from fdlibm http://www.netlib.org
+     * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+     *
+     * Developed at SunSoft, a Sun Microsystems, Inc. business.
+     * Permission to use, copy, modify, and distribute this
+     * software is freely granted, provided that this notice
+     * is preserved.
+     * ====================================================
+     *
+     */
+    if (isnan(x) || isinf(x)) return x + fabsl(x);
 
-    if( x < 1.0L )
-        return cl_make_nan();
+    if (x < 1.0L) return cl_make_nan();
 
-    if( x == 1.0L )
-        return 0.0L;
+    if (x == 1.0L) return 0.0L;
 
-    if( x > HEX_LDBL( +, 1, 0, +, 60 ) )
+    if (x > HEX_LDBL(+, 1, 0, +, 60))
         return reference_logl(x) + 0.693147180559945309417232121458176568L;
 
-    if( x > 2.0L )
-        return reference_logl(2.0L * x - 1.0L / (x + sqrtl(x*x - 1.0L)));
+    if (x > 2.0L)
+        return reference_logl(2.0L * x - 1.0L / (x + sqrtl(x * x - 1.0L)));
 
     double hi, lo;
     MulD(&hi, &lo, x, x);
@@ -3144,286 +3573,301 @@ long double reference_acoshl( long double x )
     AddDD(&hi, &lo, hi, lo, x, 0.0);
     double correction = lo / hi;
     __log2_ep(&hi, &lo, hi);
-    double log2Hi = HEX_DBL( +, 1, 62e42fefa39ef, -, 1 );
-    double log2Lo = HEX_DBL( +, 1, abc9e3b39803f, -, 56 );
+    double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1);
+    double log2Lo = HEX_DBL(+, 1, abc9e3b39803f, -, 56);
     MulDD(&hi, &lo, hi, lo, log2Hi, log2Lo);
     AddDD(&hi, &lo, hi, lo, correction, 0.0);
 
     return hi + lo;
 }
 
-long double reference_asinhl( long double x )
+long double reference_asinhl(long double x)
 {
     long double cutoff = 0.0L;
-    const long double ln2 = HEX_LDBL( +, b, 17217f7d1cf79ab, -, 4 );
+    const long double ln2 = HEX_LDBL(+, b, 17217f7d1cf79ab, -, 4);
 
-    if( cutoff == 0.0L )
-        cutoff = reference_ldexpl(1.0L, -LDBL_MANT_DIG);
+    if (cutoff == 0.0L) cutoff = reference_ldexpl(1.0L, -LDBL_MANT_DIG);
 
-    if( isnan(x) || isinf(x) )
-        return x + x;
+    if (isnan(x) || isinf(x)) return x + x;
 
     long double absx = reference_fabsl(x);
-    if( absx < cutoff )
-        return x;
+    if (absx < cutoff) return x;
 
     long double sign = reference_copysignl(1.0L, x);
 
-    if( absx <= 4.0/3.0 ) {
-        return sign * reference_log1pl( absx + x*x / (1.0 + sqrtl(1.0 + x*x)));
+    if (absx <= 4.0 / 3.0)
+    {
+        return sign
+            * reference_log1pl(absx + x * x / (1.0 + sqrtl(1.0 + x * x)));
     }
-    else if( absx <= HEX_LDBL( +, 1, 0, +, 27 ) ) {
-        return sign * reference_logl( 2.0L * absx + 1.0L / (sqrtl( x * x + 1.0 ) + absx));
+    else if (absx <= HEX_LDBL(+, 1, 0, +, 27))
+    {
+        return sign
+            * reference_logl(2.0L * absx + 1.0L / (sqrtl(x * x + 1.0) + absx));
     }
-    else {
-        return sign * ( reference_logl( absx ) + ln2 );
+    else
+    {
+        return sign * (reference_logl(absx) + ln2);
     }
 }
 
-long double reference_atanhl( long double x )
+long double reference_atanhl(long double x)
 {
-/*
- * ====================================================
- * This function is from fdlibm: http://www.netlib.org
- *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-    if( isnan(x)  )
-        return x + x;
+    /*
+     * ====================================================
+     * This function is from fdlibm: http://www.netlib.org
+     *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+     *
+     * Developed at SunSoft, a Sun Microsystems, Inc. business.
+     * Permission to use, copy, modify, and distribute this
+     * software is freely granted, provided that this notice
+     * is preserved.
+     * ====================================================
+     */
+    if (isnan(x)) return x + x;
 
-    long double signed_half = reference_copysignl( 0.5L, x );
+    long double signed_half = reference_copysignl(0.5L, x);
     x = reference_fabsl(x);
-    if( x > 1.0L )
-        return cl_make_nan();
+    if (x > 1.0L) return cl_make_nan();
 
-    if( x < 0.5L )
-        return signed_half * reference_log1pl( 2.0L * ( x + x*x / (1-x) ) );
+    if (x < 0.5L)
+        return signed_half * reference_log1pl(2.0L * (x + x * x / (1 - x)));
 
-    return signed_half * reference_log1pl(2.0L * x / (1-x));
+    return signed_half * reference_log1pl(2.0L * x / (1 - x));
 }
 
-long double reference_exp2l(  long double z)
+long double reference_exp2l(long double z)
 {
     double_double x;
     int j;
 
     // Handle NaNs
-    if( isnan(z) )
-        return z;
+    if (isnan(z)) return z;
 
     // init x
     x.hi = z;
     x.lo = z - x.hi;
 
-    //Deal with overflow and underflow for exp2(x) stage next
-    if( x.hi >= 1025 )
-        return INFINITY;
+    // Deal with overflow and underflow for exp2(x) stage next
+    if (x.hi >= 1025) return INFINITY;
 
-    if( x.hi < -1075-24 )
-        return +0.0;
+    if (x.hi < -1075 - 24) return +0.0;
 
     // find nearest integer to x
-    int i = (int) rint(x.hi);
+    int i = (int)rint(x.hi);
 
     // x now holds fractional part.  The result would be then 2**i  * exp2( x )
     x.hi -= i;
 
-    // We could attempt to find a minimax polynomial for exp2(x) over the range x = [-0.5, 0.5].
-    // However, this would converge very slowly near the extrema, where 0.5**n is not a lot different
-    // from 0.5**(n+1), thereby requiring something like a 20th order polynomial to get 53 + 24 bits
-    // of precision. Instead we further reduce the range to [-1/32, 1/32] by observing that
+    // We could attempt to find a minimax polynomial for exp2(x) over the range
+    // x = [-0.5, 0.5]. However, this would converge very slowly near the
+    // extrema, where 0.5**n is not a lot different from 0.5**(n+1), thereby
+    // requiring something like a 20th order polynomial to get 53 + 24 bits of
+    // precision. Instead we further reduce the range to [-1/32, 1/32] by
+    // observing that
     //
     //  2**(a+b) = 2**a * 2**b
     //
-    // We can thus build a table of 2**a values for a = n/16, n = [-8, 8], and reduce the range
-    // of x to [-1/32, 1/32] by subtracting away the nearest value of n/16 from x.
-    const double_double corrections[17] =
-    {
-        { HEX_DBL( +, 1, 6a09e667f3bcd, -, 1 ), HEX_DBL( -, 1, bdd3413b26456, -, 55 ) },
-        { HEX_DBL( +, 1, 7a11473eb0187, -, 1 ), HEX_DBL( -, 1, 41577ee04992f, -, 56 ) },
-        { HEX_DBL( +, 1, 8ace5422aa0db, -, 1 ), HEX_DBL( +, 1, 6e9f156864b27, -, 55 ) },
-        { HEX_DBL( +, 1, 9c49182a3f09,  -, 1 ), HEX_DBL( +, 1, c7c46b071f2be, -, 57 ) },
-        { HEX_DBL( +, 1, ae89f995ad3ad, -, 1 ), HEX_DBL( +, 1, 7a1cd345dcc81, -, 55 ) },
-        { HEX_DBL( +, 1, c199bdd85529c, -, 1 ), HEX_DBL( +, 1, 11065895048dd, -, 56 ) },
-        { HEX_DBL( +, 1, d5818dcfba487, -, 1 ), HEX_DBL( +, 1, 2ed02d75b3707, -, 56 ) },
-        { HEX_DBL( +, 1, ea4afa2a490da, -, 1 ), HEX_DBL( -, 1, e9c23179c2893, -, 55 ) },
-        { HEX_DBL( +, 1, 0,             +, 0 ), HEX_DBL( +, 0, 0,             +,  0 ) },
-        { HEX_DBL( +, 1, 0b5586cf9890f, +, 0 ), HEX_DBL( +, 1, 8a62e4adc610b, -, 54 ) },
-        { HEX_DBL( +, 1, 172b83c7d517b, +, 0 ), HEX_DBL( -, 1, 19041b9d78a76, -, 55 ) },
-        { HEX_DBL( +, 1, 2387a6e756238, +, 0 ), HEX_DBL( +, 1, 9b07eb6c70573, -, 54 ) },
-        { HEX_DBL( +, 1, 306fe0a31b715, +, 0 ), HEX_DBL( +, 1, 6f46ad23182e4, -, 55 ) },
-        { HEX_DBL( +, 1, 3dea64c123422, +, 0 ), HEX_DBL( +, 1, ada0911f09ebc, -, 55 ) },
-        { HEX_DBL( +, 1, 4bfdad5362a27, +, 0 ), HEX_DBL( +, 1, d4397afec42e2, -, 56 ) },
-        { HEX_DBL( +, 1, 5ab07dd485429, +, 0 ), HEX_DBL( +, 1, 6324c054647ad, -, 54 ) },
-        { HEX_DBL( +, 1, 6a09e667f3bcd, +, 0 ), HEX_DBL( -, 1, bdd3413b26456, -, 54 ) }
+    // We can thus build a table of 2**a values for a = n/16, n = [-8, 8], and
+    // reduce the range of x to [-1/32, 1/32] by subtracting away the nearest
+    // value of n/16 from x.
+    const double_double corrections[17] = {
+        { HEX_DBL(+, 1, 6a09e667f3bcd, -, 1),
+          HEX_DBL(-, 1, bdd3413b26456, -, 55) },
+        { HEX_DBL(+, 1, 7a11473eb0187, -, 1),
+          HEX_DBL(-, 1, 41577ee04992f, -, 56) },
+        { HEX_DBL(+, 1, 8ace5422aa0db, -, 1),
+          HEX_DBL(+, 1, 6e9f156864b27, -, 55) },
+        { HEX_DBL(+, 1, 9c49182a3f09, -, 1),
+          HEX_DBL(+, 1, c7c46b071f2be, -, 57) },
+        { HEX_DBL(+, 1, ae89f995ad3ad, -, 1),
+          HEX_DBL(+, 1, 7a1cd345dcc81, -, 55) },
+        { HEX_DBL(+, 1, c199bdd85529c, -, 1),
+          HEX_DBL(+, 1, 11065895048dd, -, 56) },
+        { HEX_DBL(+, 1, d5818dcfba487, -, 1),
+          HEX_DBL(+, 1, 2ed02d75b3707, -, 56) },
+        { HEX_DBL(+, 1, ea4afa2a490da, -, 1),
+          HEX_DBL(-, 1, e9c23179c2893, -, 55) },
+        { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
+        { HEX_DBL(+, 1, 0b5586cf9890f, +, 0),
+          HEX_DBL(+, 1, 8a62e4adc610b, -, 54) },
+        { HEX_DBL(+, 1, 172b83c7d517b, +, 0),
+          HEX_DBL(-, 1, 19041b9d78a76, -, 55) },
+        { HEX_DBL(+, 1, 2387a6e756238, +, 0),
+          HEX_DBL(+, 1, 9b07eb6c70573, -, 54) },
+        { HEX_DBL(+, 1, 306fe0a31b715, +, 0),
+          HEX_DBL(+, 1, 6f46ad23182e4, -, 55) },
+        { HEX_DBL(+, 1, 3dea64c123422, +, 0),
+          HEX_DBL(+, 1, ada0911f09ebc, -, 55) },
+        { HEX_DBL(+, 1, 4bfdad5362a27, +, 0),
+          HEX_DBL(+, 1, d4397afec42e2, -, 56) },
+        { HEX_DBL(+, 1, 5ab07dd485429, +, 0),
+          HEX_DBL(+, 1, 6324c054647ad, -, 54) },
+        { HEX_DBL(+, 1, 6a09e667f3bcd, +, 0),
+          HEX_DBL(-, 1, bdd3413b26456, -, 54) }
     };
-    int index = (int) rint( x.hi * 16.0 );
-    x.hi -= (double) index * 0.0625;
+    int index = (int)rint(x.hi * 16.0);
+    x.hi -= (double)index * 0.0625;
 
     // canonicalize x
     double temp = x.hi;
     x.hi += x.lo;
     x.lo -= x.hi - temp;
 
-    // Minimax polynomial for (exp2(x)-1)/x, over the range [-1/32, 1/32].  Max Error: 2 * 0x1.e112p-87
-    const double_double c[] = {
-        {HEX_DBL( +, 1, 62e42fefa39ef, -,  1 ), HEX_DBL( +, 1, abc9e3ac1d244, -, 56 )},
-        {HEX_DBL( +, 1, ebfbdff82c58f, -,  3 ), HEX_DBL( -, 1, 5e4987a631846, -, 57 )},
-        {HEX_DBL( +, 1, c6b08d704a0c,  -,  5 ), HEX_DBL( -, 1, d323200a05713, -, 59 )},
-        {HEX_DBL( +, 1, 3b2ab6fba4e7a, -,  7 ), HEX_DBL( +, 1, c5ee8f8b9f0c1, -, 63 )},
-        {HEX_DBL( +, 1, 5d87fe78a672a, -, 10 ), HEX_DBL( +, 1, 884e5e5cc7ecc, -, 64 )},
-        {HEX_DBL( +, 1, 430912f7e8373, -, 13 ), HEX_DBL( +, 1, 4f1b59514a326, -, 67 )},
-        {HEX_DBL( +, 1, ffcbfc5985e71, -, 17 ), HEX_DBL( -, 1, db7d6a0953b78, -, 71 )},
-        {HEX_DBL( +, 1, 62c150eb16465, -, 20 ), HEX_DBL( +, 1, e0767c2d7abf5, -, 80 )},
-        {HEX_DBL( +, 1, b52502b5e953,  -, 24 ), HEX_DBL( +, 1, 6797523f944bc, -, 78 )}
-    };
-    size_t count = sizeof( c ) / sizeof( c[0] );
+    // Minimax polynomial for (exp2(x)-1)/x, over the range [-1/32, 1/32].  Max
+    // Error: 2 * 0x1.e112p-87
+    const double_double c[] = { { HEX_DBL(+, 1, 62e42fefa39ef, -, 1),
+                                  HEX_DBL(+, 1, abc9e3ac1d244, -, 56) },
+                                { HEX_DBL(+, 1, ebfbdff82c58f, -, 3),
+                                  HEX_DBL(-, 1, 5e4987a631846, -, 57) },
+                                { HEX_DBL(+, 1, c6b08d704a0c, -, 5),
+                                  HEX_DBL(-, 1, d323200a05713, -, 59) },
+                                { HEX_DBL(+, 1, 3b2ab6fba4e7a, -, 7),
+                                  HEX_DBL(+, 1, c5ee8f8b9f0c1, -, 63) },
+                                { HEX_DBL(+, 1, 5d87fe78a672a, -, 10),
+                                  HEX_DBL(+, 1, 884e5e5cc7ecc, -, 64) },
+                                { HEX_DBL(+, 1, 430912f7e8373, -, 13),
+                                  HEX_DBL(+, 1, 4f1b59514a326, -, 67) },
+                                { HEX_DBL(+, 1, ffcbfc5985e71, -, 17),
+                                  HEX_DBL(-, 1, db7d6a0953b78, -, 71) },
+                                { HEX_DBL(+, 1, 62c150eb16465, -, 20),
+                                  HEX_DBL(+, 1, e0767c2d7abf5, -, 80) },
+                                { HEX_DBL(+, 1, b52502b5e953, -, 24),
+                                  HEX_DBL(+, 1, 6797523f944bc, -, 78) } };
+    size_t count = sizeof(c) / sizeof(c[0]);
 
     // Do polynomial
-    double_double r = c[count-1];
-    for( j = (int) count-2; j >= 0; j-- )
-        r = add_dd( c[j], mul_dd( r, x ) );
+    double_double r = c[count - 1];
+    for (j = (int)count - 2; j >= 0; j--) r = add_dd(c[j], mul_dd(r, x));
 
     // unwind approximation
-    r = mul_dd( r, x );     // before: r =(exp2(x)-1)/x;   after: r = exp2(x) - 1
+    r = mul_dd(r, x); // before: r =(exp2(x)-1)/x;   after: r = exp2(x) - 1
 
     // correct for [-0.5, 0.5] -> [-1/32, 1/32] reduction above
     //  exp2(x) = (r + 1) * correction = r * correction + correction
-    r = mul_dd( r, corrections[index+8] );
-    r = add_dd( r, corrections[index+8] );
+    r = mul_dd(r, corrections[index + 8]);
+    r = add_dd(r, corrections[index + 8]);
 
-// Format result for output:
+    // Format result for output:
 
     // Get mantissa
-    long double m = ((long double) r.hi + (long double) r.lo );
+    long double m = ((long double)r.hi + (long double)r.lo);
 
     // Handle a pesky overflow cases when long double = double
-    if( i > 512 )
+    if (i > 512)
     {
-        m *= HEX_DBL( +, 1, 0, +, 512 );
+        m *= HEX_DBL(+, 1, 0, +, 512);
         i -= 512;
     }
-    else if( i < -512 )
+    else if (i < -512)
     {
-        m *= HEX_DBL( +, 1, 0, -, 512 );
+        m *= HEX_DBL(+, 1, 0, -, 512);
         i += 512;
     }
 
-    return m * ldexpl( 1.0L, i );
+    return m * ldexpl(1.0L, i);
 }
 
-long double reference_expm1l(  long double x)
+long double reference_expm1l(long double x)
 {
-#if defined( _MSC_VER ) && ! defined( __INTEL_COMPILER )
-    //unimplemented
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+    // unimplemented
     return x;
 #else
-    union { double f; cl_ulong u;} u;
-    u.f = (double) x;
+    union {
+        double f;
+        cl_ulong u;
+    } u;
+    u.f = (double)x;
 
-    if (reference_isnanl(x))
-        return x;
+    if (reference_isnanl(x)) return x;
 
-    if ( x > 710 )
-        return INFINITY;
+    if (x > 710) return INFINITY;
 
     long double y = expm1l(x);
 
     // Range of expm1l is -1.0L to +inf. Negative inf
     // on a few Linux platforms is clearly the wrong sign.
-    if (reference_isinfl(y))
-        y = INFINITY;
+    if (reference_isinfl(y)) y = INFINITY;
 
     return y;
 #endif
 }
 
-long double reference_fmaxl( long double x, long double y )
+long double reference_fmaxl(long double x, long double y)
 {
-    if( isnan(y) )
-        return x;
+    if (isnan(y)) return x;
 
     return x >= y ? x : y;
 }
 
-long double reference_fminl( long double x, long double y )
+long double reference_fminl(long double x, long double y)
 {
-    if( isnan(y) )
-        return x;
+    if (isnan(y)) return x;
 
     return x <= y ? x : y;
 }
 
-long double reference_hypotl( long double x, long double y )
+long double reference_hypotl(long double x, long double y)
 {
-  static const double tobig = HEX_DBL( +, 1, 0, +, 511 );
-  static const double big = HEX_DBL( +, 1, 0, +, 513 );
-  static const double rbig = HEX_DBL( +, 1, 0, -, 513 );
-  static const double tosmall = HEX_DBL( +, 1, 0, -, 511 );
-  static const double smalll = HEX_DBL( +, 1, 0, -, 607 );
-  static const double rsmall = HEX_DBL( +, 1, 0, +, 607 );
+    static const double tobig = HEX_DBL(+, 1, 0, +, 511);
+    static const double big = HEX_DBL(+, 1, 0, +, 513);
+    static const double rbig = HEX_DBL(+, 1, 0, -, 513);
+    static const double tosmall = HEX_DBL(+, 1, 0, -, 511);
+    static const double smalll = HEX_DBL(+, 1, 0, -, 607);
+    static const double rsmall = HEX_DBL(+, 1, 0, +, 607);
 
     long double max, min;
 
-    if( isinf(x) || isinf(y) )
-        return INFINITY;
+    if (isinf(x) || isinf(y)) return INFINITY;
 
-    if( isnan(x) || isnan(y) )
-        return x + y;
+    if (isnan(x) || isnan(y)) return x + y;
 
     x = reference_fabsl(x);
     y = reference_fabsl(y);
 
-    max = reference_fmaxl( x, y );
-    min = reference_fminl( x, y );
+    max = reference_fmaxl(x, y);
+    min = reference_fminl(x, y);
 
-  if( max > tobig )
+    if (max > tobig)
     {
         max *= rbig;
         min *= rbig;
-        return big * sqrtl( max * max + min * min );
+        return big * sqrtl(max * max + min * min);
     }
 
-  if( max < tosmall )
+    if (max < tosmall)
     {
         max *= rsmall;
         min *= rsmall;
-      return smalll * sqrtl( max * max + min * min );
+        return smalll * sqrtl(max * max + min * min);
     }
-    return sqrtl( x * x + y * y );
+    return sqrtl(x * x + y * y);
 }
 
-//long double reference_log2l( long double x )
+// long double reference_log2l( long double x )
 //{
 //    return log( x ) * 1.44269504088896340735992468100189214L;
 //}
 
-long double reference_log2l( long double x )
+long double reference_log2l(long double x)
 {
-    if( isnan(x) || x < 0.0 || x == -INFINITY)
-        return NAN;
+    if (isnan(x) || x < 0.0 || x == -INFINITY) return NAN;
 
-    if( x == 0.0f)
-        return -INFINITY;
+    if (x == 0.0f) return -INFINITY;
 
-    if( x == INFINITY )
-        return INFINITY;
+    if (x == INFINITY) return INFINITY;
 
     double hi, lo;
-    __log2_ep( &hi, &lo, x);
+    __log2_ep(&hi, &lo, x);
 
-    return (long double) hi + (long double) lo;
+    return (long double)hi + (long double)lo;
 }
 
-long double reference_log1pl(  long double x)
+long double reference_log1pl(long double x)
 {
-#if defined( _MSC_VER ) && ! defined( __INTEL_COMPILER )
-    //unimplemented
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+    // unimplemented
     return x;
 #elif defined(__PPC__)
     // log1pl on PPC inadvertantly returns NaN for very large values. Work
@@ -3434,23 +3878,24 @@ long double reference_log1pl(  long double x)
 #endif
 }
 
-long double reference_logbl( long double x )
+long double reference_logbl(long double x)
 {
     // Since we are just using this to verify double precision, we can
     // use the double precision copysign here
-    union { double f; cl_ulong u;} u;
-    u.f = (double) x;
+    union {
+        double f;
+        cl_ulong u;
+    } u;
+    u.f = (double)x;
 
     cl_int exponent = (cl_uint)(u.u >> 52) & 0x7ff;
-    if( exponent == 0x7ff )
-        return x * x;
+    if (exponent == 0x7ff) return x * x;
 
-    if( exponent == 0 )
-    {   // deal with denormals
-        u.f =  x * HEX_DBL( +, 1, 0, +, 64 );
+    if (exponent == 0)
+    { // deal with denormals
+        u.f = x * HEX_DBL(+, 1, 0, +, 64);
         exponent = (cl_int)(u.u >> 52) & 0x7ff;
-        if( exponent == 0 )
-            return -INFINITY;
+        if (exponent == 0) return -INFINITY;
 
         return exponent - (1023 + 64);
     }
@@ -3458,84 +3903,84 @@ long double reference_logbl( long double x )
     return exponent - 1023;
 }
 
-long double reference_maxmagl( long double x, long double y )
+long double reference_maxmagl(long double x, long double y)
 {
     long double fabsx = fabsl(x);
     long double fabsy = fabsl(y);
 
-    if( fabsx < fabsy )
-        return y;
+    if (fabsx < fabsy) return y;
 
-    if( fabsy < fabsx )
-        return x;
+    if (fabsy < fabsx) return x;
 
     return reference_fmaxl(x, y);
 }
 
-long double reference_minmagl( long double x, long double y )
+long double reference_minmagl(long double x, long double y)
 {
     long double fabsx = fabsl(x);
     long double fabsy = fabsl(y);
 
-    if( fabsx > fabsy )
-        return y;
+    if (fabsx > fabsy) return y;
 
-    if( fabsy > fabsx )
-        return x;
+    if (fabsy > fabsx) return x;
 
     return reference_fminl(x, y);
 }
 
-long double reference_nanl( cl_ulong x )
+long double reference_nanl(cl_ulong x)
 {
-    union{ cl_ulong u; cl_double f; }u;
+    union {
+        cl_ulong u;
+        cl_double f;
+    } u;
     u.u = x | 0x7ff8000000000000ULL;
-    return (long double) u.f;
+    return (long double)u.f;
 }
 
 
-long double reference_reciprocall( long double x )
-{
-    return 1.0L / x;
-}
+long double reference_reciprocall(long double x) { return 1.0L / x; }
 
-long double reference_remainderl( long double x, long double y );
-long double reference_remainderl( long double x, long double y )
+long double reference_remainderl(long double x, long double y);
+long double reference_remainderl(long double x, long double y)
 {
     int i;
-    return reference_remquol( x, y, &i );
+    return reference_remquol(x, y, &i);
 }
 
-long double reference_lgammal( long double x);
-long double reference_lgammal( long double x)
+long double reference_lgammal(long double x);
+long double reference_lgammal(long double x)
 {
     // lgamma is currently not tested
-    return reference_lgamma( x );
+    return reference_lgamma(x);
 }
 
-static uint32_t two_over_pi[] = { 0x0, 0x28be60db, 0x24e44152, 0x27f09d5f, 0x11f534dd, 0x3036d8a5, 0x1993c439, 0x107f945, 0x23abdebb, 0x31586dc9,
-0x6e3a424, 0x374b8019, 0x92eea09, 0x3464873f, 0x21deb1cb, 0x4a69cfb, 0x288235f5, 0xbaed121, 0xe99c702, 0x1ad17df9,
-0x13991d6, 0xe60d4ce, 0x1f49c845, 0x3e2ef7e4, 0x283b1ff8, 0x25fff781, 0x1980fef2, 0x3c462d68, 0xa6d1f6d, 0xd9fb3c9,
-0x3cb09b74, 0x3d18fd9a, 0x1e5fea2d, 0x1d49eeb1, 0x3ebe5f17, 0x2cf41ce7, 0x378a5292, 0x3a9afed7, 0x3b11f8d5, 0x3421580c,
-0x3046fc7b, 0x1aeafc33, 0x3bc209af, 0x10d876a7, 0x2391615e, 0x3986c219, 0x199855f1, 0x1281a102, 0xdffd880, 0x135cc9cc,
-0x10606155
+static uint32_t two_over_pi[] = {
+    0x0,        0x28be60db, 0x24e44152, 0x27f09d5f, 0x11f534dd, 0x3036d8a5,
+    0x1993c439, 0x107f945,  0x23abdebb, 0x31586dc9, 0x6e3a424,  0x374b8019,
+    0x92eea09,  0x3464873f, 0x21deb1cb, 0x4a69cfb,  0x288235f5, 0xbaed121,
+    0xe99c702,  0x1ad17df9, 0x13991d6,  0xe60d4ce,  0x1f49c845, 0x3e2ef7e4,
+    0x283b1ff8, 0x25fff781, 0x1980fef2, 0x3c462d68, 0xa6d1f6d,  0xd9fb3c9,
+    0x3cb09b74, 0x3d18fd9a, 0x1e5fea2d, 0x1d49eeb1, 0x3ebe5f17, 0x2cf41ce7,
+    0x378a5292, 0x3a9afed7, 0x3b11f8d5, 0x3421580c, 0x3046fc7b, 0x1aeafc33,
+    0x3bc209af, 0x10d876a7, 0x2391615e, 0x3986c219, 0x199855f1, 0x1281a102,
+    0xdffd880,  0x135cc9cc, 0x10606155
 };
 
-static uint32_t pi_over_two[] = { 0x1, 0x2487ed51, 0x42d1846, 0x26263314, 0x1701b839, 0x28948127 };
+static uint32_t pi_over_two[] = { 0x1,        0x2487ed51, 0x42d1846,
+                                  0x26263314, 0x1701b839, 0x28948127 };
 
-typedef union
-    {
-        uint64_t u;
-        double   d;
-    }d_ui64_t;
+typedef union {
+    uint64_t u;
+    double d;
+} d_ui64_t;
 
 // radix or base of representation
 #define RADIX (30)
 #define DIGITS 6
 
-d_ui64_t two_pow_pradix = { (uint64_t) (1023 + RADIX) << 52 };
-d_ui64_t two_pow_mradix = { (uint64_t) (1023 - RADIX) << 52 };
-d_ui64_t two_pow_two_mradix = { (uint64_t) (1023-2*RADIX) << 52 };
+d_ui64_t two_pow_pradix = { (uint64_t)(1023 + RADIX) << 52 };
+d_ui64_t two_pow_mradix = { (uint64_t)(1023 - RADIX) << 52 };
+d_ui64_t two_pow_two_mradix = { (uint64_t)(1023 - 2 * RADIX) << 52 };
 
 #define tp_pradix two_pow_pradix.d
 #define tp_mradix two_pow_mradix.d
@@ -3544,11 +3989,12 @@ d_ui64_t two_pow_two_mradix = { (uint64_t) (1023-2*RADIX) << 52 };
 // floating point number.
 // x = sign * [ sum_{i = 0 to 2} ( X[i] * 2^(index - i)*RADIX ) ]
 typedef struct
-    {
-        uint32_t X[3];        // three 32 bit integers are sufficient to represnt double in base_30
-        int index;            // exponent bias
-        int sign;            // sign of double
-    }eprep_t;
+{
+    uint32_t X[3]; // three 32 bit integers are sufficient to represent double
+                   // in base_30
+    int index; // exponent bias
+    int sign; // sign of double
+} eprep_t;
 
 static eprep_t double_to_eprep(double x);
 
@@ -3556,15 +4002,17 @@ static eprep_t double_to_eprep(double x)
 {
     eprep_t result;
 
-    result.sign = (signbit( x ) == 0) ? 1 : -1;
-    x = fabs( x );
+    result.sign = (signbit(x) == 0) ? 1 : -1;
+    x = fabs(x);
 
     int index = 0;
-    while( x > tp_pradix ) {
+    while (x > tp_pradix)
+    {
         index++;
         x *= tp_mradix;
     }
-    while( x < 1 ) {
+    while (x < 1)
+    {
         index--;
         x *= tp_pradix;
     }
@@ -3572,9 +4020,10 @@ static eprep_t double_to_eprep(double x)
     result.index = index;
     int i = 0;
     result.X[0] = result.X[1] = result.X[2] = 0;
-    while( x != 0.0 ) {
-        result.X[i] = (uint32_t) x;
-        x = (x - (double) result.X[i]) * tp_pradix;
+    while (x != 0.0)
+    {
+        result.X[i] = (uint32_t)x;
+        x = (x - (double)result.X[i]) * tp_pradix;
         i++;
     }
     return result;
@@ -3660,102 +4109,120 @@ static eprep_t double_to_eprep(double x)
  return sgn*res;
  }
  */
-static double eprep_to_double( eprep_t epx );
+static double eprep_to_double(eprep_t epx);
 
-static double eprep_to_double( eprep_t epx )
+static double eprep_to_double(eprep_t epx)
 {
     double res = 0.0;
 
-    res += ldexp((double) epx.X[0], (epx.index - 0)*RADIX);
-    res += ldexp((double) epx.X[1], (epx.index - 1)*RADIX);
-    res += ldexp((double) epx.X[2], (epx.index - 2)*RADIX);
+    res += ldexp((double)epx.X[0], (epx.index - 0) * RADIX);
+    res += ldexp((double)epx.X[1], (epx.index - 1) * RADIX);
+    res += ldexp((double)epx.X[2], (epx.index - 2) * RADIX);
 
     return copysign(res, epx.sign);
 }
 
-static int payne_hanek( double *y, int *exception );
+static int payne_hanek(double *y, int *exception);
 
-static int payne_hanek( double *y, int *exception )
+static int payne_hanek(double *y, int *exception)
 {
     double x = *y;
 
     // exception cases .. no reduction required
-    if( isnan( x ) || isinf( x ) || (fabs( x ) <= M_PI_4) ) {
+    if (isnan(x) || isinf(x) || (fabs(x) <= M_PI_4))
+    {
         *exception = 1;
         return 0;
     }
 
     *exception = 0;
 
-    // After computation result[0] contains integer part while result[1]....result[DIGITS-1]
-    // contain fractional part. So we are doing computation with (DIGITS-1)*RADIX precision.
-    // Default DIGITS=6 and RADIX=30 so default precision is 150 bits. Kahan-McDonald algorithm
-    // shows that a double precision x, closest to pi/2 is 6381956970095103 x 2^797 which can
-    // cause 61 digits of cancellation in computation of f = x*2/pi - floor(x*2/pi) ... thus we need
-    // at least 114 bits (61 leading zeros + 53 bits of mentissa of f) of precision to accurately compute
-    // f in double precision. Since we are using 150 bits (still an overkill), we should be safe. Extra
-    // bits can act as guard bits for correct rounding.
-    uint64_t result[DIGITS+2];
+    // After computation result[0] contains integer part while
+    // result[1]....result[DIGITS-1] contain fractional part. So we are doing
+    // computation with (DIGITS-1)*RADIX precision. Default DIGITS=6 and
+    // RADIX=30 so default precision is 150 bits. Kahan-McDonald algorithm shows
+    // that a double precision x, closest to pi/2 is 6381956970095103 x 2^797
+    // which can cause 61 digits of cancellation in computation of f = x*2/pi -
+    // floor(x*2/pi) ... thus we need at least 114 bits (61 leading zeros + 53
+    // bits of mantissa of f) of precision to accurately compute f in double
+    // precision. Since we are using 150 bits (still an overkill), we should be
+    // safe. Extra bits can act as guard bits for correct rounding.
+    uint64_t result[DIGITS + 2];
 
     // compute extended precision representation of x
-    eprep_t epx = double_to_eprep( x );
+    eprep_t epx = double_to_eprep(x);
     int index = epx.index;
     int i, j;
-    // extended precision multiplication of 2/pi*x .... we will loose at max two RADIX=30 bit digits in
-    // the worst case
-    for(i = 0; i < (DIGITS+2); i++) {
+    // extended precision multiplication of 2/pi*x .... we will lose at most two
+    // RADIX=30 bit digits in the worst case
+    for (i = 0; i < (DIGITS + 2); i++)
+    {
         result[i] = 0;
-        result[i] += ((index + i - 0) >= 0) ? ((uint64_t) two_over_pi[index + i - 0] * (uint64_t) epx.X[0]) : 0;
-        result[i] += ((index + i - 1) >= 0) ? ((uint64_t) two_over_pi[index + i - 1] * (uint64_t) epx.X[1]) : 0;
-        result[i] += ((index + i - 2) >= 0) ? ((uint64_t) two_over_pi[index + i - 2] * (uint64_t) epx.X[2]) : 0;
+        result[i] += ((index + i - 0) >= 0)
+            ? ((uint64_t)two_over_pi[index + i - 0] * (uint64_t)epx.X[0])
+            : 0;
+        result[i] += ((index + i - 1) >= 0)
+            ? ((uint64_t)two_over_pi[index + i - 1] * (uint64_t)epx.X[1])
+            : 0;
+        result[i] += ((index + i - 2) >= 0)
+            ? ((uint64_t)two_over_pi[index + i - 2] * (uint64_t)epx.X[2])
+            : 0;
     }
 
     // Carry propagation.
     uint64_t tmp;
-    for(i = DIGITS+2-1; i > 0; i--) {
+    for (i = DIGITS + 2 - 1; i > 0; i--)
+    {
         tmp = result[i] >> RADIX;
         result[i - 1] += tmp;
         result[i] -= (tmp << RADIX);
     }
 
-    // we dont ned to normalize the integer part since only last two bits of this will be used
-    // subsequently algorithm which remain unaltered by this normalization.
-    // tmp = result[0] >> RADIX;
-    // result[0] -= (tmp << RADIX);
-    unsigned int N = (unsigned int) result[0];
+    // We don't need to normalize the integer part (tmp = result[0] >> RADIX;
+    // result[0] -= (tmp << RADIX);) since only its last two bits are used by
+    // the subsequent algorithm, and they are unaltered by this normalization.
+    unsigned int N = (unsigned int)result[0];
 
-    // if the result is > pi/4, bring it to (-pi/4, pi/4] range. Note that testing if the final
-    // x_star = pi/2*(x*2/pi - k) > pi/4 is equivalent to testing, at this stage, if r[1] (the first fractional
-    // digit) is greater than (2^RADIX)/2 and substracting pi/4 from x_star to bring it to mentioned
-    // range is equivalent to substracting fractional part at this stage from one and changing the sign.
+    // if the result is > pi/4, bring it to (-pi/4, pi/4] range. Note that
+    // testing if the final x_star = pi/2*(x*2/pi - k) > pi/4 is equivalent to
+    // testing, at this stage, if r[1] (the first fractional digit) is greater
+    // than (2^RADIX)/2 and subtracting pi/2 from x_star to bring it to the
+    // mentioned range is equivalent to subtracting the fractional part at this
+    // stage from one and changing the sign.
     int sign = 1;
-    if(result[1] > (uint64_t)(1 << (RADIX - 1))) {
-        for(i = 1; i < (DIGITS + 2); i++)
+    if (result[1] > (uint64_t)(1 << (RADIX - 1)))
+    {
+        for (i = 1; i < (DIGITS + 2); i++)
             result[i] = (~((unsigned int)result[i]) & 0x3fffffff);
         N += 1;
         sign = -1;
     }
 
-    // Again as per Kahan-McDonald algorithim there may be 61 leading zeros in the worst case
-    // (when x is multiple of 2/pi very close to an integer) so we need to get rid of these zeros
-    // and adjust the index of final result. So in the worst case, precision of comupted result is
-    // 90 bits (150 bits original bits - 60 lost in cancellation).
+    // Again as per Kahan-McDonald algorithm there may be 61 leading zeros in
+    // the worst case (when x*2/pi is very close to an integer) so we need to
+    // get rid of these zeros and adjust the index of final result. So in the
+    // worst case, precision of computed result is 90 bits (150 original bits -
+    // 60 lost in cancellation).
     int ind = 1;
-    for(i = 1; i < (DIGITS+2); i++) {
-        if(result[i] != 0)
+    for (i = 1; i < (DIGITS + 2); i++)
+    {
+        if (result[i] != 0)
             break;
         else
             ind++;
     }
 
-    uint64_t r[DIGITS-1];
-    for(i = 0; i < (DIGITS-1); i++) {
+    uint64_t r[DIGITS - 1];
+    for (i = 0; i < (DIGITS - 1); i++)
+    {
         r[i] = 0;
-        for(j = 0; j <= i; j++) {
-            r[i] += (result[ind+i-j] * (uint64_t) pi_over_two[j]);
+        for (j = 0; j <= i; j++)
+        {
+            r[i] += (result[ind + i - j] * (uint64_t)pi_over_two[j]);
         }
     }
-    for(i = (DIGITS-2); i > 0; i--) {
+    for (i = (DIGITS - 2); i > 0; i--)
+    {
         tmp = r[i] >> RADIX;
         r[i - 1] += tmp;
         r[i] -= (tmp << RADIX);
@@ -3764,147 +4231,127 @@ static int payne_hanek( double *y, int *exception )
     r[0] -= (tmp << RADIX);
 
     eprep_t epr;
-    epr.sign = epx.sign*sign;
-    if(tmp != 0) {
+    epr.sign = epx.sign * sign;
+    if (tmp != 0)
+    {
         epr.index = -ind + 1;
-        epr.X[0] = (uint32_t) tmp;
-        epr.X[1] = (uint32_t) r[0];
-        epr.X[2] = (uint32_t) r[1];
+        epr.X[0] = (uint32_t)tmp;
+        epr.X[1] = (uint32_t)r[0];
+        epr.X[2] = (uint32_t)r[1];
     }
-    else {
+    else
+    {
         epr.index = -ind;
-        epr.X[0] = (uint32_t) r[0];
-        epr.X[1] = (uint32_t) r[1];
-        epr.X[2] = (uint32_t) r[2];
+        epr.X[0] = (uint32_t)r[0];
+        epr.X[1] = (uint32_t)r[1];
+        epr.X[2] = (uint32_t)r[2];
     }
 
-    *y = eprep_to_double( epr );
-    return epx.sign*N;
+    *y = eprep_to_double(epr);
+    return epx.sign * N;
 }
 
 double reference_relaxed_cos(double x)
 {
-  if(isnan(x))
-    return NAN;
-  return (float)cos((float)x);
+    if (isnan(x)) return NAN;
+    return (float)cos((float)x);
 }
 
 double reference_cos(double x)
 {
     int exception;
-    int N = payne_hanek( &x, &exception );
-    if( exception )
-        return cos( x );
+    int N = payne_hanek(&x, &exception);
+    if (exception) return cos(x);
     unsigned int c = N & 3;
-    switch ( c ) {
-        case 0:
-            return  cos( x );
-        case 1:
-            return -sin( x );
-        case 2:
-            return -cos( x );
-        case 3:
-            return  sin( x );
+    switch (c)
+    {
+        case 0: return cos(x);
+        case 1: return -sin(x);
+        case 2: return -cos(x);
+        case 3: return sin(x);
     }
     return 0.0;
 }
 
-double reference_relaxed_sin(double x){
-  return (float)sin((float)x);
-}
+double reference_relaxed_sin(double x) { return (float)sin((float)x); }
 
 double reference_sin(double x)
 {
     int exception;
-    int N = payne_hanek( &x, &exception );
-    if( exception )
-        return sin( x );
+    int N = payne_hanek(&x, &exception);
+    if (exception) return sin(x);
     int c = N & 3;
-    switch ( c ) {
-        case 0:
-            return  sin( x );
-        case 1:
-            return  cos( x );
-        case 2:
-            return -sin( x );
-        case 3:
-            return -cos( x );
+    switch (c)
+    {
+        case 0: return sin(x);
+        case 1: return cos(x);
+        case 2: return -sin(x);
+        case 3: return -cos(x);
     }
     return 0.0;
 }
 
-double reference_relaxed_sincos(double x, double * y){
-  *y = reference_relaxed_cos(x);
-  return reference_relaxed_sin(x);
+double reference_relaxed_sincos(double x, double *y)
+{
+    *y = reference_relaxed_cos(x);
+    return reference_relaxed_sin(x);
 }
 
 double reference_sincos(double x, double *y)
 {
     int exception;
-    int N = payne_hanek( &x, &exception );
-    if( exception ) {
-        *y = cos( x );
-        return sin( x );
+    int N = payne_hanek(&x, &exception);
+    if (exception)
+    {
+        *y = cos(x);
+        return sin(x);
     }
     int c = N & 3;
-    switch ( c ) {
-        case 0:
-            *y = cos( x );
-            return  sin( x );
-        case 1:
-            *y = -sin( x );
-            return  cos( x );
-        case 2:
-            *y = -cos( x );
-            return -sin( x );
-        case 3:
-            *y = sin( x );
-            return -cos( x );
+    switch (c)
+    {
+        case 0: *y = cos(x); return sin(x);
+        case 1: *y = -sin(x); return cos(x);
+        case 2: *y = -cos(x); return -sin(x);
+        case 3: *y = sin(x); return -cos(x);
     }
     return 0.0;
 }
 
-double reference_relaxed_tan(double x){
-  return ((float) reference_relaxed_sin((float)x))/((float) reference_relaxed_cos((float)x));
+double reference_relaxed_tan(double x)
+{
+    return ((float)reference_relaxed_sin((float)x))
+        / ((float)reference_relaxed_cos((float)x));
 }
 
 double reference_tan(double x)
 {
     int exception;
-    int N = payne_hanek( &x, &exception );
-    if( exception )
-        return tan( x );
+    int N = payne_hanek(&x, &exception);
+    if (exception) return tan(x);
     int c = N & 3;
-    switch ( c ) {
-        case 0:
-            return  tan( x );
-        case 1:
-            return -1.0 / tan( x );
-        case 2:
-            return tan( x );
-        case 3:
-            return -1.0 / tan( x );
+    switch (c)
+    {
+        case 0: return tan(x);
+        case 1: return -1.0 / tan(x);
+        case 2: return tan(x);
+        case 3: return -1.0 / tan(x);
     }
     return 0.0;
 }
 
 long double reference_cosl(long double xx)
 {
-    double x = (double) xx;
+    double x = (double)xx;
     int exception;
-    int N = payne_hanek( &x, &exception );
-    if( exception )
-        return cosl( x );
+    int N = payne_hanek(&x, &exception);
+    if (exception) return cosl(x);
     unsigned int c = N & 3;
-    switch ( c ) {
-        case 0:
-            return  cosl( x );
-        case 1:
-            return -sinl( x );
-        case 2:
-            return -cosl( x );
-        case 3:
-            return  sinl( x );
+    switch (c)
+    {
+        case 0: return cosl(x);
+        case 1: return -sinl(x);
+        case 2: return -cosl(x);
+        case 3: return sinl(x);
     }
     return 0.0;
 }
@@ -3913,25 +4360,20 @@ long double reference_sinl(long double xx)
 {
     // we use system tanl after reduction which
     // can flush denorm input to zero so
-    //take care of it here.
-    if(reference_fabsl(xx) < HEX_DBL( +, 1, 0, -, 1022 ))
-        return xx;
+    // take care of it here.
+    if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022)) return xx;
 
-    double x = (double) xx;
+    double x = (double)xx;
     int exception;
-    int N = payne_hanek( &x, &exception );
-    if( exception )
-        return sinl( x );
+    int N = payne_hanek(&x, &exception);
+    if (exception) return sinl(x);
     int c = N & 3;
-    switch ( c ) {
-        case 0:
-            return  sinl( x );
-        case 1:
-            return  cosl( x );
-        case 2:
-            return -sinl( x );
-        case 3:
-            return -cosl( x );
+    switch (c)
+    {
+        case 0: return sinl(x);
+        case 1: return cosl(x);
+        case 2: return -sinl(x);
+        case 3: return -cosl(x);
     }
     return 0.0;
 }
@@ -3940,34 +4382,28 @@ long double reference_sincosl(long double xx, long double *y)
 {
     // we use system tanl after reduction which
     // can flush denorm input to zero so
-    //take care of it here.
-    if(reference_fabsl(xx) < HEX_DBL( +, 1, 0, -, 1022 ))
+    // take care of it here.
+    if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022))
     {
         *y = cosl(xx);
         return xx;
     }
 
-    double x = (double) xx;
+    double x = (double)xx;
     int exception;
-    int N = payne_hanek( &x, &exception );
-    if( exception ) {
-        *y = cosl( x );
-        return sinl( x );
+    int N = payne_hanek(&x, &exception);
+    if (exception)
+    {
+        *y = cosl(x);
+        return sinl(x);
     }
     int c = N & 3;
-    switch ( c ) {
-        case 0:
-            *y = cosl( x );
-            return  sinl( x );
-        case 1:
-            *y = -sinl( x );
-            return  cosl( x );
-        case 2:
-            *y = -cosl( x );
-            return -sinl( x );
-        case 3:
-            *y = sinl( x );
-            return -cosl( x );
+    switch (c)
+    {
+        case 0: *y = cosl(x); return sinl(x);
+        case 1: *y = -sinl(x); return cosl(x);
+        case 2: *y = -cosl(x); return -sinl(x);
+        case 3: *y = sinl(x); return -cosl(x);
     }
     return 0.0;
 }
@@ -3976,205 +4412,337 @@ long double reference_tanl(long double xx)
 {
     // we use system tanl after reduction which
     // can flush denorm input to zero so
-    //take care of it here.
-    if(reference_fabsl(xx) < HEX_DBL( +, 1, 0, -, 1022 ))
-        return xx;
+    // take care of it here.
+    if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022)) return xx;
 
-    double x = (double) xx;
+    double x = (double)xx;
     int exception;
-    int N = payne_hanek( &x, &exception );
-    if( exception )
-        return tanl( x );
+    int N = payne_hanek(&x, &exception);
+    if (exception) return tanl(x);
     int c = N & 3;
-    switch ( c ) {
-        case 0:
-            return  tanl( x );
-        case 1:
-            return -1.0 / tanl( x );
-        case 2:
-            return tanl( x );
-        case 3:
-            return -1.0 / tanl( x );
+    switch (c)
+    {
+        case 0: return tanl(x);
+        case 1: return -1.0 / tanl(x);
+        case 2: return tanl(x);
+        case 3: return -1.0 / tanl(x);
     }
     return 0.0;
 }
 
 static double __loglTable1[64][3] = {
-{HEX_DBL( +, 1, 5390948f40fea, +, 0 ), HEX_DBL( -, 1, a152f142a,  -, 2 ), HEX_DBL( +, 1, f93e27b43bd2c, -, 40 )},
-{HEX_DBL( +, 1, 5015015015015, +, 0 ), HEX_DBL( -, 1, 921800925,  -, 2 ), HEX_DBL( +, 1, 162432a1b8df7, -, 41 )},
-{HEX_DBL( +, 1, 4cab88725af6e, +, 0 ), HEX_DBL( -, 1, 8304d90c18, -, 2 ), HEX_DBL( +, 1, 80bb749056fe7, -, 40 )},
-{HEX_DBL( +, 1, 49539e3b2d066, +, 0 ), HEX_DBL( -, 1, 7418acebc,  -, 2 ), HEX_DBL( +, 1, ceac7f0607711, -, 43 )},
-{HEX_DBL( +, 1, 460cbc7f5cf9a, +, 0 ), HEX_DBL( -, 1, 6552b49988, -, 2 ), HEX_DBL( +, 1, d8913d0e89fa,  -, 42 )},
-{HEX_DBL( +, 1, 42d6625d51f86, +, 0 ), HEX_DBL( -, 1, 56b22e6b58, -, 2 ), HEX_DBL( +, 1, c7eaf515033a1, -, 44 )},
-{HEX_DBL( +, 1, 3fb013fb013fb, +, 0 ), HEX_DBL( -, 1, 48365e696,  -, 2 ), HEX_DBL( +, 1, 434adcde7edc7, -, 41 )},
-{HEX_DBL( +, 1, 3c995a47babe7, +, 0 ), HEX_DBL( -, 1, 39de8e156,  -, 2 ), HEX_DBL( +, 1, 8246f8e527754, -, 40 )},
-{HEX_DBL( +, 1, 3991c2c187f63, +, 0 ), HEX_DBL( -, 1, 2baa0c34c,  -, 2 ), HEX_DBL( +, 1, e1513c28e180d, -, 42 )},
-{HEX_DBL( +, 1, 3698df3de0747, +, 0 ), HEX_DBL( -, 1, 1d982c9d58, -, 2 ), HEX_DBL( +, 1, 63ea3fed4b8a2, -, 40 )},
-{HEX_DBL( +, 1, 33ae45b57bcb1, +, 0 ), HEX_DBL( -, 1, 0fa848045,  -, 2 ), HEX_DBL( +, 1, 32ccbacf1779b, -, 40 )},
-{HEX_DBL( +, 1, 30d190130d19,  +, 0 ), HEX_DBL( -, 1, 01d9bbcfa8, -, 2 ), HEX_DBL( +, 1, e2bfeb2b884aa, -, 42 )},
-{HEX_DBL( +, 1, 2e025c04b8097, +, 0 ), HEX_DBL( -, 1, e857d3d37,  -, 3 ), HEX_DBL( +, 1, d9309b4d2ea85, -, 40 )},
-{HEX_DBL( +, 1, 2b404ad012b4,  +, 0 ), HEX_DBL( -, 1, cd3c712d4,  -, 3 ), HEX_DBL( +, 1, ddf360962d7ab, -, 40 )},
-{HEX_DBL( +, 1, 288b01288b012, +, 0 ), HEX_DBL( -, 1, b2602497e,  -, 3 ), HEX_DBL( +, 1, 597f8a121640f, -, 40 )},
-{HEX_DBL( +, 1, 25e22708092f1, +, 0 ), HEX_DBL( -, 1, 97c1cb13d,  -, 3 ), HEX_DBL( +, 1, 02807d15580dc, -, 40 )},
-{HEX_DBL( +, 1, 23456789abcdf, +, 0 ), HEX_DBL( -, 1, 7d60496d,   -, 3 ), HEX_DBL( +, 1, 12ce913d7a827, -, 41 )},
-{HEX_DBL( +, 1, 20b470c67c0d8, +, 0 ), HEX_DBL( -, 1, 633a8bf44,  -, 3 ), HEX_DBL( +, 1, 0648bca9c96bd, -, 40 )},
-{HEX_DBL( +, 1, 1e2ef3b3fb874, +, 0 ), HEX_DBL( -, 1, 494f863b9,  -, 3 ), HEX_DBL( +, 1, 066fceb89b0eb, -, 42 )},
-{HEX_DBL( +, 1, 1bb4a4046ed29, +, 0 ), HEX_DBL( -, 1, 2f9e32d5c,  -, 3 ), HEX_DBL( +, 1, 17b8b6c4f846b, -, 46 )},
-{HEX_DBL( +, 1, 19453808ca29c, +, 0 ), HEX_DBL( -, 1, 162593187,  -, 3 ), HEX_DBL( +, 1, 2c83506452154, -, 42 )},
-{HEX_DBL( +, 1, 16e0689427378, +, 0 ), HEX_DBL( -, 1, f9c95dc1e,  -, 4 ), HEX_DBL( +, 1, dd5d2183150f3, -, 41 )},
-{HEX_DBL( +, 1, 1485f0e0acd3b, +, 0 ), HEX_DBL( -, 1, c7b528b72,  -, 4 ), HEX_DBL( +, 1, 0e43c4f4e619d, -, 40 )},
-{HEX_DBL( +, 1, 12358e75d3033, +, 0 ), HEX_DBL( -, 1, 960caf9ac,  -, 4 ), HEX_DBL( +, 1, 20fbfd5902a1e, -, 42 )},
-{HEX_DBL( +, 1, 0fef010fef01,  +, 0 ), HEX_DBL( -, 1, 64ce26c08,  -, 4 ), HEX_DBL( +, 1, 8ebeefb4ac467, -, 40 )},
-{HEX_DBL( +, 1, 0db20a88f4695, +, 0 ), HEX_DBL( -, 1, 33f7cde16,  -, 4 ), HEX_DBL( +, 1, 30b3312da7a7d, -, 40 )},
-{HEX_DBL( +, 1, 0b7e6ec259dc7, +, 0 ), HEX_DBL( -, 1, 0387efbcc,  -, 4 ), HEX_DBL( +, 1, 796f1632949c3, -, 40 )},
-{HEX_DBL( +, 1, 0953f39010953, +, 0 ), HEX_DBL( -, 1, a6f9c378,   -, 5 ), HEX_DBL( +, 1, 1687e151172cc, -, 40 )},
-{HEX_DBL( +, 1, 073260a47f7c6, +, 0 ), HEX_DBL( -, 1, 47aa07358,  -, 5 ), HEX_DBL( +, 1, 1f87e4a9cc778, -, 42 )},
-{HEX_DBL( +, 1, 05197f7d73404, +, 0 ), HEX_DBL( -, 1, d23afc498,  -, 6 ), HEX_DBL( +, 1, b183a6b628487, -, 40 )},
-{HEX_DBL( +, 1, 03091b51f5e1a, +, 0 ), HEX_DBL( -, 1, 16a21e21,   -, 6 ), HEX_DBL( +, 1, 7d75c58973ce5, -, 40 )},
-{HEX_DBL( +, 1, 0,             +, 0 ), HEX_DBL( +, 0, 0,          +, 0 ), HEX_DBL( +, 0, 0,             +,  0 )},
-{HEX_DBL( +, 1, 0,             +, 0 ), HEX_DBL( +, 0, 0,          +, 0 ), HEX_DBL( +, 0, 0,             +,  0 )},
-{HEX_DBL( +, 1, f44659e4a4271, -, 1 ), HEX_DBL( +, 1, 11cd1d51,   -, 5 ), HEX_DBL( +, 1, 9a0d857e2f4b2, -, 40 )},
-{HEX_DBL( +, 1, ecc07b301ecc,  -, 1 ), HEX_DBL( +, 1, c4dfab908,  -, 5 ), HEX_DBL( +, 1, 55b53fce557fd, -, 40 )},
-{HEX_DBL( +, 1, e573ac901e573, -, 1 ), HEX_DBL( +, 1, 3aa2fdd26,  -, 4 ), HEX_DBL( +, 1, f1cb0c9532089, -, 40 )},
-{HEX_DBL( +, 1, de5d6e3f8868a, -, 1 ), HEX_DBL( +, 1, 918a16e46,  -, 4 ), HEX_DBL( +, 1, 9af0dcd65a6e1, -, 43 )},
-{HEX_DBL( +, 1, d77b654b82c33, -, 1 ), HEX_DBL( +, 1, e72ec117e,  -, 4 ), HEX_DBL( +, 1, a5b93c4ebe124, -, 40 )},
-{HEX_DBL( +, 1, d0cb58f6ec074, -, 1 ), HEX_DBL( +, 1, 1dcd19755,  -, 3 ), HEX_DBL( +, 1, 5be50e71ddc6c, -, 42 )},
-{HEX_DBL( +, 1, ca4b3055ee191, -, 1 ), HEX_DBL( +, 1, 476a9f983,  -, 3 ), HEX_DBL( +, 1, ee9a798719e7f, -, 40 )},
-{HEX_DBL( +, 1, c3f8f01c3f8f,  -, 1 ), HEX_DBL( +, 1, 70742d4ef,  -, 3 ), HEX_DBL( +, 1, 3ff1352c1219c, -, 46 )},
-{HEX_DBL( +, 1, bdd2b899406f7, -, 1 ), HEX_DBL( +, 1, 98edd077e,  -, 3 ), HEX_DBL( +, 1, c383cd11362f4, -, 41 )},
-{HEX_DBL( +, 1, b7d6c3dda338b, -, 1 ), HEX_DBL( +, 1, c0db6cdd9,  -, 3 ), HEX_DBL( +, 1, 37bd85b1a824e, -, 41 )},
-{HEX_DBL( +, 1, b2036406c80d9, -, 1 ), HEX_DBL( +, 1, e840be74e,  -, 3 ), HEX_DBL( +, 1, a9334d525e1ec, -, 41 )},
-{HEX_DBL( +, 1, ac5701ac5701a, -, 1 ), HEX_DBL( +, 1, 0790adbb,   -, 2 ), HEX_DBL( +, 1, 8060bfb6a491,  -, 41 )},
-{HEX_DBL( +, 1, a6d01a6d01a6d, -, 1 ), HEX_DBL( +, 1, 1ac05b2918, -, 2 ), HEX_DBL( +, 1, c1c161471580a, -, 40 )},
-{HEX_DBL( +, 1, a16d3f97a4b01, -, 1 ), HEX_DBL( +, 1, 2db10fc4d8, -, 2 ), HEX_DBL( +, 1, ab1aa62214581, -, 42 )},
-{HEX_DBL( +, 1, 9c2d14ee4a101, -, 1 ), HEX_DBL( +, 1, 406463b1b,  -, 2 ), HEX_DBL( +, 1, 12e95dbda6611, -, 44 )},
-{HEX_DBL( +, 1, 970e4f80cb872, -, 1 ), HEX_DBL( +, 1, 52dbdfc4c8, -, 2 ), HEX_DBL( +, 1, 6b53fee511af,  -, 42 )},
-{HEX_DBL( +, 1, 920fb49d0e228, -, 1 ), HEX_DBL( +, 1, 6518fe467,  -, 2 ), HEX_DBL( +, 1, eea7d7d7d1764, -, 40 )},
-{HEX_DBL( +, 1, 8d3018d3018d3, -, 1 ), HEX_DBL( +, 1, 771d2ba7e8, -, 2 ), HEX_DBL( +, 1, ecefa8d4fab97, -, 40 )},
-{HEX_DBL( +, 1, 886e5f0abb049, -, 1 ), HEX_DBL( +, 1, 88e9c72e08, -, 2 ), HEX_DBL( +, 1, 913ea3d33fd14, -, 41 )},
-{HEX_DBL( +, 1, 83c977ab2bedd, -, 1 ), HEX_DBL( +, 1, 9a802391e,  -, 2 ), HEX_DBL( +, 1, 197e845877c94, -, 41 )},
-{HEX_DBL( +, 1, 7f405fd017f4,  -, 1 ), HEX_DBL( +, 1, abe18797f,  -, 2 ), HEX_DBL( +, 1, f4a52f8e8a81,  -, 42 )},
-{HEX_DBL( +, 1, 7ad2208e0ecc3, -, 1 ), HEX_DBL( +, 1, bd0f2e9e78, -, 2 ), HEX_DBL( +, 1, 031f4336644cc, -, 42 )},
-{HEX_DBL( +, 1, 767dce434a9b1, -, 1 ), HEX_DBL( +, 1, ce0a4923a,  -, 2 ), HEX_DBL( +, 1, 61f33c897020c, -, 40 )},
-{HEX_DBL( +, 1, 724287f46debc, -, 1 ), HEX_DBL( +, 1, ded3fd442,  -, 2 ), HEX_DBL( +, 1, b2632e830632,  -, 41 )},
-{HEX_DBL( +, 1, 6e1f76b4337c6, -, 1 ), HEX_DBL( +, 1, ef6d673288, -, 2 ), HEX_DBL( +, 1, 888ec245a0bf,  -, 40 )},
-{HEX_DBL( +, 1, 6a13cd153729,  -, 1 ), HEX_DBL( +, 1, ffd799a838, -, 2 ), HEX_DBL( +, 1, fe6f3b2f5fc8e, -, 40 )},
-{HEX_DBL( +, 1, 661ec6a5122f9, -, 1 ), HEX_DBL( +, 1, 0809cf27f4, -, 1 ), HEX_DBL( +, 1, 81eaa9ef284dd, -, 40 )},
-{HEX_DBL( +, 1, 623fa7701623f, -, 1 ), HEX_DBL( +, 1, 10113b153c, -, 1 ), HEX_DBL( +, 1, 1d7b07d6b1143, -, 42 )},
-{HEX_DBL( +, 1, 5e75bb8d015e7, -, 1 ), HEX_DBL( +, 1, 18028cf728, -, 1 ), HEX_DBL( +, 1, 76b100b1f6c6,  -, 41 )},
-{HEX_DBL( +, 1, 5ac056b015ac,  -, 1 ), HEX_DBL( +, 1, 1fde3d30e8, -, 1 ), HEX_DBL( +, 1, 26faeb9870945, -, 45 )},
-{HEX_DBL( +, 1, 571ed3c506b39, -, 1 ), HEX_DBL( +, 1, 27a4c0585c, -, 1 ), HEX_DBL( +, 1, 7f2c5344d762b, -, 42 )}
+    { HEX_DBL(+, 1, 5390948f40fea, +, 0), HEX_DBL(-, 1, a152f142a, -, 2),
+      HEX_DBL(+, 1, f93e27b43bd2c, -, 40) },
+    { HEX_DBL(+, 1, 5015015015015, +, 0), HEX_DBL(-, 1, 921800925, -, 2),
+      HEX_DBL(+, 1, 162432a1b8df7, -, 41) },
+    { HEX_DBL(+, 1, 4cab88725af6e, +, 0), HEX_DBL(-, 1, 8304d90c18, -, 2),
+      HEX_DBL(+, 1, 80bb749056fe7, -, 40) },
+    { HEX_DBL(+, 1, 49539e3b2d066, +, 0), HEX_DBL(-, 1, 7418acebc, -, 2),
+      HEX_DBL(+, 1, ceac7f0607711, -, 43) },
+    { HEX_DBL(+, 1, 460cbc7f5cf9a, +, 0), HEX_DBL(-, 1, 6552b49988, -, 2),
+      HEX_DBL(+, 1, d8913d0e89fa, -, 42) },
+    { HEX_DBL(+, 1, 42d6625d51f86, +, 0), HEX_DBL(-, 1, 56b22e6b58, -, 2),
+      HEX_DBL(+, 1, c7eaf515033a1, -, 44) },
+    { HEX_DBL(+, 1, 3fb013fb013fb, +, 0), HEX_DBL(-, 1, 48365e696, -, 2),
+      HEX_DBL(+, 1, 434adcde7edc7, -, 41) },
+    { HEX_DBL(+, 1, 3c995a47babe7, +, 0), HEX_DBL(-, 1, 39de8e156, -, 2),
+      HEX_DBL(+, 1, 8246f8e527754, -, 40) },
+    { HEX_DBL(+, 1, 3991c2c187f63, +, 0), HEX_DBL(-, 1, 2baa0c34c, -, 2),
+      HEX_DBL(+, 1, e1513c28e180d, -, 42) },
+    { HEX_DBL(+, 1, 3698df3de0747, +, 0), HEX_DBL(-, 1, 1d982c9d58, -, 2),
+      HEX_DBL(+, 1, 63ea3fed4b8a2, -, 40) },
+    { HEX_DBL(+, 1, 33ae45b57bcb1, +, 0), HEX_DBL(-, 1, 0fa848045, -, 2),
+      HEX_DBL(+, 1, 32ccbacf1779b, -, 40) },
+    { HEX_DBL(+, 1, 30d190130d19, +, 0), HEX_DBL(-, 1, 01d9bbcfa8, -, 2),
+      HEX_DBL(+, 1, e2bfeb2b884aa, -, 42) },
+    { HEX_DBL(+, 1, 2e025c04b8097, +, 0), HEX_DBL(-, 1, e857d3d37, -, 3),
+      HEX_DBL(+, 1, d9309b4d2ea85, -, 40) },
+    { HEX_DBL(+, 1, 2b404ad012b4, +, 0), HEX_DBL(-, 1, cd3c712d4, -, 3),
+      HEX_DBL(+, 1, ddf360962d7ab, -, 40) },
+    { HEX_DBL(+, 1, 288b01288b012, +, 0), HEX_DBL(-, 1, b2602497e, -, 3),
+      HEX_DBL(+, 1, 597f8a121640f, -, 40) },
+    { HEX_DBL(+, 1, 25e22708092f1, +, 0), HEX_DBL(-, 1, 97c1cb13d, -, 3),
+      HEX_DBL(+, 1, 02807d15580dc, -, 40) },
+    { HEX_DBL(+, 1, 23456789abcdf, +, 0), HEX_DBL(-, 1, 7d60496d, -, 3),
+      HEX_DBL(+, 1, 12ce913d7a827, -, 41) },
+    { HEX_DBL(+, 1, 20b470c67c0d8, +, 0), HEX_DBL(-, 1, 633a8bf44, -, 3),
+      HEX_DBL(+, 1, 0648bca9c96bd, -, 40) },
+    { HEX_DBL(+, 1, 1e2ef3b3fb874, +, 0), HEX_DBL(-, 1, 494f863b9, -, 3),
+      HEX_DBL(+, 1, 066fceb89b0eb, -, 42) },
+    { HEX_DBL(+, 1, 1bb4a4046ed29, +, 0), HEX_DBL(-, 1, 2f9e32d5c, -, 3),
+      HEX_DBL(+, 1, 17b8b6c4f846b, -, 46) },
+    { HEX_DBL(+, 1, 19453808ca29c, +, 0), HEX_DBL(-, 1, 162593187, -, 3),
+      HEX_DBL(+, 1, 2c83506452154, -, 42) },
+    { HEX_DBL(+, 1, 16e0689427378, +, 0), HEX_DBL(-, 1, f9c95dc1e, -, 4),
+      HEX_DBL(+, 1, dd5d2183150f3, -, 41) },
+    { HEX_DBL(+, 1, 1485f0e0acd3b, +, 0), HEX_DBL(-, 1, c7b528b72, -, 4),
+      HEX_DBL(+, 1, 0e43c4f4e619d, -, 40) },
+    { HEX_DBL(+, 1, 12358e75d3033, +, 0), HEX_DBL(-, 1, 960caf9ac, -, 4),
+      HEX_DBL(+, 1, 20fbfd5902a1e, -, 42) },
+    { HEX_DBL(+, 1, 0fef010fef01, +, 0), HEX_DBL(-, 1, 64ce26c08, -, 4),
+      HEX_DBL(+, 1, 8ebeefb4ac467, -, 40) },
+    { HEX_DBL(+, 1, 0db20a88f4695, +, 0), HEX_DBL(-, 1, 33f7cde16, -, 4),
+      HEX_DBL(+, 1, 30b3312da7a7d, -, 40) },
+    { HEX_DBL(+, 1, 0b7e6ec259dc7, +, 0), HEX_DBL(-, 1, 0387efbcc, -, 4),
+      HEX_DBL(+, 1, 796f1632949c3, -, 40) },
+    { HEX_DBL(+, 1, 0953f39010953, +, 0), HEX_DBL(-, 1, a6f9c378, -, 5),
+      HEX_DBL(+, 1, 1687e151172cc, -, 40) },
+    { HEX_DBL(+, 1, 073260a47f7c6, +, 0), HEX_DBL(-, 1, 47aa07358, -, 5),
+      HEX_DBL(+, 1, 1f87e4a9cc778, -, 42) },
+    { HEX_DBL(+, 1, 05197f7d73404, +, 0), HEX_DBL(-, 1, d23afc498, -, 6),
+      HEX_DBL(+, 1, b183a6b628487, -, 40) },
+    { HEX_DBL(+, 1, 03091b51f5e1a, +, 0), HEX_DBL(-, 1, 16a21e21, -, 6),
+      HEX_DBL(+, 1, 7d75c58973ce5, -, 40) },
+    { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
+    { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
+    { HEX_DBL(+, 1, f44659e4a4271, -, 1), HEX_DBL(+, 1, 11cd1d51, -, 5),
+      HEX_DBL(+, 1, 9a0d857e2f4b2, -, 40) },
+    { HEX_DBL(+, 1, ecc07b301ecc, -, 1), HEX_DBL(+, 1, c4dfab908, -, 5),
+      HEX_DBL(+, 1, 55b53fce557fd, -, 40) },
+    { HEX_DBL(+, 1, e573ac901e573, -, 1), HEX_DBL(+, 1, 3aa2fdd26, -, 4),
+      HEX_DBL(+, 1, f1cb0c9532089, -, 40) },
+    { HEX_DBL(+, 1, de5d6e3f8868a, -, 1), HEX_DBL(+, 1, 918a16e46, -, 4),
+      HEX_DBL(+, 1, 9af0dcd65a6e1, -, 43) },
+    { HEX_DBL(+, 1, d77b654b82c33, -, 1), HEX_DBL(+, 1, e72ec117e, -, 4),
+      HEX_DBL(+, 1, a5b93c4ebe124, -, 40) },
+    { HEX_DBL(+, 1, d0cb58f6ec074, -, 1), HEX_DBL(+, 1, 1dcd19755, -, 3),
+      HEX_DBL(+, 1, 5be50e71ddc6c, -, 42) },
+    { HEX_DBL(+, 1, ca4b3055ee191, -, 1), HEX_DBL(+, 1, 476a9f983, -, 3),
+      HEX_DBL(+, 1, ee9a798719e7f, -, 40) },
+    { HEX_DBL(+, 1, c3f8f01c3f8f, -, 1), HEX_DBL(+, 1, 70742d4ef, -, 3),
+      HEX_DBL(+, 1, 3ff1352c1219c, -, 46) },
+    { HEX_DBL(+, 1, bdd2b899406f7, -, 1), HEX_DBL(+, 1, 98edd077e, -, 3),
+      HEX_DBL(+, 1, c383cd11362f4, -, 41) },
+    { HEX_DBL(+, 1, b7d6c3dda338b, -, 1), HEX_DBL(+, 1, c0db6cdd9, -, 3),
+      HEX_DBL(+, 1, 37bd85b1a824e, -, 41) },
+    { HEX_DBL(+, 1, b2036406c80d9, -, 1), HEX_DBL(+, 1, e840be74e, -, 3),
+      HEX_DBL(+, 1, a9334d525e1ec, -, 41) },
+    { HEX_DBL(+, 1, ac5701ac5701a, -, 1), HEX_DBL(+, 1, 0790adbb, -, 2),
+      HEX_DBL(+, 1, 8060bfb6a491, -, 41) },
+    { HEX_DBL(+, 1, a6d01a6d01a6d, -, 1), HEX_DBL(+, 1, 1ac05b2918, -, 2),
+      HEX_DBL(+, 1, c1c161471580a, -, 40) },
+    { HEX_DBL(+, 1, a16d3f97a4b01, -, 1), HEX_DBL(+, 1, 2db10fc4d8, -, 2),
+      HEX_DBL(+, 1, ab1aa62214581, -, 42) },
+    { HEX_DBL(+, 1, 9c2d14ee4a101, -, 1), HEX_DBL(+, 1, 406463b1b, -, 2),
+      HEX_DBL(+, 1, 12e95dbda6611, -, 44) },
+    { HEX_DBL(+, 1, 970e4f80cb872, -, 1), HEX_DBL(+, 1, 52dbdfc4c8, -, 2),
+      HEX_DBL(+, 1, 6b53fee511af, -, 42) },
+    { HEX_DBL(+, 1, 920fb49d0e228, -, 1), HEX_DBL(+, 1, 6518fe467, -, 2),
+      HEX_DBL(+, 1, eea7d7d7d1764, -, 40) },
+    { HEX_DBL(+, 1, 8d3018d3018d3, -, 1), HEX_DBL(+, 1, 771d2ba7e8, -, 2),
+      HEX_DBL(+, 1, ecefa8d4fab97, -, 40) },
+    { HEX_DBL(+, 1, 886e5f0abb049, -, 1), HEX_DBL(+, 1, 88e9c72e08, -, 2),
+      HEX_DBL(+, 1, 913ea3d33fd14, -, 41) },
+    { HEX_DBL(+, 1, 83c977ab2bedd, -, 1), HEX_DBL(+, 1, 9a802391e, -, 2),
+      HEX_DBL(+, 1, 197e845877c94, -, 41) },
+    { HEX_DBL(+, 1, 7f405fd017f4, -, 1), HEX_DBL(+, 1, abe18797f, -, 2),
+      HEX_DBL(+, 1, f4a52f8e8a81, -, 42) },
+    { HEX_DBL(+, 1, 7ad2208e0ecc3, -, 1), HEX_DBL(+, 1, bd0f2e9e78, -, 2),
+      HEX_DBL(+, 1, 031f4336644cc, -, 42) },
+    { HEX_DBL(+, 1, 767dce434a9b1, -, 1), HEX_DBL(+, 1, ce0a4923a, -, 2),
+      HEX_DBL(+, 1, 61f33c897020c, -, 40) },
+    { HEX_DBL(+, 1, 724287f46debc, -, 1), HEX_DBL(+, 1, ded3fd442, -, 2),
+      HEX_DBL(+, 1, b2632e830632, -, 41) },
+    { HEX_DBL(+, 1, 6e1f76b4337c6, -, 1), HEX_DBL(+, 1, ef6d673288, -, 2),
+      HEX_DBL(+, 1, 888ec245a0bf, -, 40) },
+    { HEX_DBL(+, 1, 6a13cd153729, -, 1), HEX_DBL(+, 1, ffd799a838, -, 2),
+      HEX_DBL(+, 1, fe6f3b2f5fc8e, -, 40) },
+    { HEX_DBL(+, 1, 661ec6a5122f9, -, 1), HEX_DBL(+, 1, 0809cf27f4, -, 1),
+      HEX_DBL(+, 1, 81eaa9ef284dd, -, 40) },
+    { HEX_DBL(+, 1, 623fa7701623f, -, 1), HEX_DBL(+, 1, 10113b153c, -, 1),
+      HEX_DBL(+, 1, 1d7b07d6b1143, -, 42) },
+    { HEX_DBL(+, 1, 5e75bb8d015e7, -, 1), HEX_DBL(+, 1, 18028cf728, -, 1),
+      HEX_DBL(+, 1, 76b100b1f6c6, -, 41) },
+    { HEX_DBL(+, 1, 5ac056b015ac, -, 1), HEX_DBL(+, 1, 1fde3d30e8, -, 1),
+      HEX_DBL(+, 1, 26faeb9870945, -, 45) },
+    { HEX_DBL(+, 1, 571ed3c506b39, -, 1), HEX_DBL(+, 1, 27a4c0585c, -, 1),
+      HEX_DBL(+, 1, 7f2c5344d762b, -, 42) }
 };
 
 static double __loglTable2[64][3] = {
-{HEX_DBL( +, 1, 01fbe7f0a1be6, +, 0 ), HEX_DBL( -, 1, 6cf6ddd26112a, -,  7 ), HEX_DBL( +, 1, 0725e5755e314, -, 60 )},
-{HEX_DBL( +, 1, 01eba93a97b12, +, 0 ), HEX_DBL( -, 1, 6155b1d99f603, -,  7 ), HEX_DBL( +, 1, 4bcea073117f4, -, 60 )},
-{HEX_DBL( +, 1, 01db6c9029cd1, +, 0 ), HEX_DBL( -, 1, 55b54153137ff, -,  7 ), HEX_DBL( +, 1, 21e8faccad0ec, -, 61 )},
-{HEX_DBL( +, 1, 01cb31f0f534c, +, 0 ), HEX_DBL( -, 1, 4a158c27245bd, -,  7 ), HEX_DBL( +, 1, 1a5b7bfbf35d3, -, 60 )},
-{HEX_DBL( +, 1, 01baf95c9723c, +, 0 ), HEX_DBL( -, 1, 3e76923e3d678, -,  7 ), HEX_DBL( +, 1, eee400eb5fe34, -, 62 )},
-{HEX_DBL( +, 1, 01aac2d2acee6, +, 0 ), HEX_DBL( -, 1, 32d85380ce776, -,  7 ), HEX_DBL( +, 1, cbf7a513937bd, -, 61 )},
-{HEX_DBL( +, 1, 019a8e52d401e, +, 0 ), HEX_DBL( -, 1, 273acfd74be72, -,  7 ), HEX_DBL( +, 1, 5c64599efa5e6, -, 60 )},
-{HEX_DBL( +, 1, 018a5bdca9e42, +, 0 ), HEX_DBL( -, 1, 1b9e072a2e65,  -,  7 ), HEX_DBL( +, 1, 364180e0a5d37, -, 60 )},
-{HEX_DBL( +, 1, 017a2b6fcc33e, +, 0 ), HEX_DBL( -, 1, 1001f961f3243, -,  7 ), HEX_DBL( +, 1, 63d795746f216, -, 60 )},
-{HEX_DBL( +, 1, 0169fd0bd8a8a, +, 0 ), HEX_DBL( -, 1, 0466a6671bca4, -,  7 ), HEX_DBL( +, 1, 4c99ff1907435, -, 60 )},
-{HEX_DBL( +, 1, 0159d0b06d129, +, 0 ), HEX_DBL( -, 1, f1981c445cd05, -,  8 ), HEX_DBL( +, 1, 4bfff6366b723, -, 62 )},
-{HEX_DBL( +, 1, 0149a65d275a6, +, 0 ), HEX_DBL( -, 1, da6460f76ab8c, -,  8 ), HEX_DBL( +, 1, 9c5404f47589c, -, 61 )},
-{HEX_DBL( +, 1, 01397e11a581b, +, 0 ), HEX_DBL( -, 1, c3321ab87f4ef, -,  8 ), HEX_DBL( +, 1, c0da537429cea, -, 61 )},
-{HEX_DBL( +, 1, 012957cd85a28, +, 0 ), HEX_DBL( -, 1, ac014958c112c, -,  8 ), HEX_DBL( +, 1, 000c2a1b595e3, -, 64 )},
-{HEX_DBL( +, 1, 0119339065ef7, +, 0 ), HEX_DBL( -, 1, 94d1eca95f67a, -,  8 ), HEX_DBL( +, 1, d8d20b0564d5,  -, 61 )},
-{HEX_DBL( +, 1, 01091159e4b3d, +, 0 ), HEX_DBL( -, 1, 7da4047b92b3e, -,  8 ), HEX_DBL( +, 1, 6194a5d68cf2,  -, 66 )},
-{HEX_DBL( +, 1, 00f8f129a0535, +, 0 ), HEX_DBL( -, 1, 667790a09bf77, -,  8 ), HEX_DBL( +, 1, ca230e0bea645, -, 61 )},
-{HEX_DBL( +, 1, 00e8d2ff374a1, +, 0 ), HEX_DBL( -, 1, 4f4c90e9c4ead, -,  8 ), HEX_DBL( +, 1, 1de3e7f350c1,  -, 61 )},
-{HEX_DBL( +, 1, 00d8b6da482ce, +, 0 ), HEX_DBL( -, 1, 3823052860649, -,  8 ), HEX_DBL( +, 1, 5789b4c5891b8, -, 64 )},
-{HEX_DBL( +, 1, 00c89cba71a8c, +, 0 ), HEX_DBL( -, 1, 20faed2dc9a9e, -,  8 ), HEX_DBL( +, 1, 9e7c40f9839fd, -, 62 )},
-{HEX_DBL( +, 1, 00b8849f52834, +, 0 ), HEX_DBL( -, 1, 09d448cb65014, -,  8 ), HEX_DBL( +, 1, 387e3e9b6d02,  -, 62 )},
-{HEX_DBL( +, 1, 00a86e88899a4, +, 0 ), HEX_DBL( -, 1, e55e2fa53ebf1, -,  9 ), HEX_DBL( +, 1, cdaa71fddfddf, -, 62 )},
-{HEX_DBL( +, 1, 00985a75b5e3f, +, 0 ), HEX_DBL( -, 1, b716b429dce0f, -,  9 ), HEX_DBL( +, 1, 2f2af081367bf, -, 63 )},
-{HEX_DBL( +, 1, 00884866766ee, +, 0 ), HEX_DBL( -, 1, 88d21ec7a16d7, -,  9 ), HEX_DBL( +, 1, fb95c228d6f16, -, 62 )},
-{HEX_DBL( +, 1, 0078385a6a61d, +, 0 ), HEX_DBL( -, 1, 5a906f219a9e8, -,  9 ), HEX_DBL( +, 1, 18aff10a89f29, -, 64 )},
-{HEX_DBL( +, 1, 00682a5130fbe, +, 0 ), HEX_DBL( -, 1, 2c51a4dae87f1, -,  9 ), HEX_DBL( +, 1, bcc7e33ddde3,  -, 63 )},
-{HEX_DBL( +, 1, 00581e4a69944, +, 0 ), HEX_DBL( -, 1, fc2b7f2d782b1, -, 10 ), HEX_DBL( +, 1, fe3ef3300a9fa, -, 64 )},
-{HEX_DBL( +, 1, 00481445b39a8, +, 0 ), HEX_DBL( -, 1, 9fb97df0b0b83, -, 10 ), HEX_DBL( +, 1, 0d9a601f2f324, -, 65 )},
-{HEX_DBL( +, 1, 00380c42ae963, +, 0 ), HEX_DBL( -, 1, 434d4546227ae, -, 10 ), HEX_DBL( +, 1, 0b9b6a5868f33, -, 63 )},
-{HEX_DBL( +, 1, 00280640fa271, +, 0 ), HEX_DBL( -, 1, cdcda8e930c19, -, 11 ), HEX_DBL( +, 1, 3d424ab39f789, -, 64 )},
-{HEX_DBL( +, 1, 0018024036051, +, 0 ), HEX_DBL( -, 1, 150c558601261, -, 11 ), HEX_DBL( +, 1, 285bb90327a0f, -, 64 )},
-{HEX_DBL( +, 1, 0,             +, 0 ), HEX_DBL( +, 0, 0,             +,  0 ), HEX_DBL( +, 0, 0,             +,  0 )},
-{HEX_DBL( +, 1, 0,             +, 0 ), HEX_DBL( +, 0, 0,             +,  0 ), HEX_DBL( +, 0, 0,             +,  0 )},
-{HEX_DBL( +, 1, ffa011fca0a1e, -, 1 ), HEX_DBL( +, 1, 14e5640c4197b, -, 10 ), HEX_DBL( +, 1, 95728136ae401, -, 63 )},
-{HEX_DBL( +, 1, ff6031f064e07, -, 1 ), HEX_DBL( +, 1, cd61806bf532d, -, 10 ), HEX_DBL( +, 1, 568a4f35d8538, -, 63 )},
-{HEX_DBL( +, 1, ff2061d532b9c, -, 1 ), HEX_DBL( +, 1, 42e34af550eda, -,  9 ), HEX_DBL( +, 1, 8f69cee55fec,  -, 62 )},
-{HEX_DBL( +, 1, fee0a1a513253, -, 1 ), HEX_DBL( +, 1, 9f0a5523902ea, -,  9 ), HEX_DBL( +, 1, daec734b11615, -, 63 )},
-{HEX_DBL( +, 1, fea0f15a12139, -, 1 ), HEX_DBL( +, 1, fb25e19f11b26, -,  9 ), HEX_DBL( +, 1, 8bafca62941da, -, 62 )},
-{HEX_DBL( +, 1, fe6150ee3e6d4, -, 1 ), HEX_DBL( +, 1, 2b9af9a28e282, -,  8 ), HEX_DBL( +, 1, 0fd3674e1dc5b, -, 61 )},
-{HEX_DBL( +, 1, fe21c05baa109, -, 1 ), HEX_DBL( +, 1, 599d4678f24b9, -,  8 ), HEX_DBL( +, 1, dafce1f09937b, -, 61 )},
-{HEX_DBL( +, 1, fde23f9c69cf9, -, 1 ), HEX_DBL( +, 1, 8799d8c046eb,  -,  8 ), HEX_DBL( +, 1, ffa0ce0bdd217, -, 65 )},
-{HEX_DBL( +, 1, fda2ceaa956e8, -, 1 ), HEX_DBL( +, 1, b590b1e5951ee, -,  8 ), HEX_DBL( +, 1, 645a769232446, -, 62 )},
-{HEX_DBL( +, 1, fd636d8047a1f, -, 1 ), HEX_DBL( +, 1, e381d3555dbcf, -,  8 ), HEX_DBL( +, 1, 882320d368331, -, 61 )},
-{HEX_DBL( +, 1, fd241c179e0cc, -, 1 ), HEX_DBL( +, 1, 08b69f3dccde,  -,  7 ), HEX_DBL( +, 1, 01ad5065aba9e, -, 61 )},
-{HEX_DBL( +, 1, fce4da6ab93e8, -, 1 ), HEX_DBL( +, 1, 1fa97a61dd298, -,  7 ), HEX_DBL( +, 1, 84cd1f931ae34, -, 60 )},
-{HEX_DBL( +, 1, fca5a873bcb19, -, 1 ), HEX_DBL( +, 1, 36997bcc54a3f, -,  7 ), HEX_DBL( +, 1, 1485e97eaee03, -, 60 )},
-{HEX_DBL( +, 1, fc66862ccec93, -, 1 ), HEX_DBL( +, 1, 4d86a43264a4f, -,  7 ), HEX_DBL( +, 1, c75e63370988b, -, 61 )},
-{HEX_DBL( +, 1, fc27739018cfe, -, 1 ), HEX_DBL( +, 1, 6470f448fb09d, -,  7 ), HEX_DBL( +, 1, d7361eeaed0a1, -, 65 )},
-{HEX_DBL( +, 1, fbe87097c6f5a, -, 1 ), HEX_DBL( +, 1, 7b586cc4c2523, -,  7 ), HEX_DBL( +, 1, b3df952cc473c, -, 61 )},
-{HEX_DBL( +, 1, fba97d3e084dd, -, 1 ), HEX_DBL( +, 1, 923d0e5a21e06, -,  7 ), HEX_DBL( +, 1, cf56c7b64ae5d, -, 62 )},
-{HEX_DBL( +, 1, fb6a997d0ecdc, -, 1 ), HEX_DBL( +, 1, a91ed9bd3df9a, -,  7 ), HEX_DBL( +, 1, b957bdcd89e43, -, 61 )},
-{HEX_DBL( +, 1, fb2bc54f0f4ab, -, 1 ), HEX_DBL( +, 1, bffdcfa1f7fbb, -,  7 ), HEX_DBL( +, 1, ea8cad9a21771, -, 62 )},
-{HEX_DBL( +, 1, faed00ae41783, -, 1 ), HEX_DBL( +, 1, d6d9f0bbee6f6, -,  7 ), HEX_DBL( +, 1, 5762a9af89c82, -, 60 )},
-{HEX_DBL( +, 1, faae4b94dfe64, -, 1 ), HEX_DBL( +, 1, edb33dbe7d335, -,  7 ), HEX_DBL( +, 1, 21e24fc245697, -, 62 )},
-{HEX_DBL( +, 1, fa6fa5fd27ff8, -, 1 ), HEX_DBL( +, 1, 0244dbae5ed05, -,  6 ), HEX_DBL( +, 1, 12ef51b967102, -, 60 )},
-{HEX_DBL( +, 1, fa310fe15a078, -, 1 ), HEX_DBL( +, 1, 0daeaf24c3529, -,  6 ), HEX_DBL( +, 1, 10d3cfca60b45, -, 59 )},
-{HEX_DBL( +, 1, f9f2893bb9192, -, 1 ), HEX_DBL( +, 1, 1917199bb66bc, -,  6 ), HEX_DBL( +, 1, 6cf6034c32e19, -, 60 )},
-{HEX_DBL( +, 1, f9b412068b247, -, 1 ), HEX_DBL( +, 1, 247e1b6c615d5, -,  6 ), HEX_DBL( +, 1, 42f0fffa229f7, -, 61 )},
-{HEX_DBL( +, 1, f975aa3c18ed6, -, 1 ), HEX_DBL( +, 1, 2fe3b4efcc5ad, -,  6 ), HEX_DBL( +, 1, 70106136a8919, -, 60 )},
-{HEX_DBL( +, 1, f93751d6ae09b, -, 1 ), HEX_DBL( +, 1, 3b47e67edea93, -,  6 ), HEX_DBL( +, 1, 38dd5a4f6959a, -, 59 )},
-{HEX_DBL( +, 1, f8f908d098df6, -, 1 ), HEX_DBL( +, 1, 46aab0725ea6c, -,  6 ), HEX_DBL( +, 1, 821fc1e799e01, -, 60 )},
-{HEX_DBL( +, 1, f8bacf242aa2c, -, 1 ), HEX_DBL( +, 1, 520c1322f1e4e, -,  6 ), HEX_DBL( +, 1, 129dcda3ad563, -, 60 )},
-{HEX_DBL( +, 1, f87ca4cbb755,  -, 1 ), HEX_DBL( +, 1, 5d6c0ee91d2ab, -,  6 ), HEX_DBL( +, 1, c5b190c04606e, -, 62 )},
-{HEX_DBL( +, 1, f83e89c195c25, -, 1 ), HEX_DBL( +, 1, 68caa41d448c3, -,  6 ), HEX_DBL( +, 1, 4723441195ac9, -, 59 )}
+    { HEX_DBL(+, 1, 01fbe7f0a1be6, +, 0), HEX_DBL(-, 1, 6cf6ddd26112a, -, 7),
+      HEX_DBL(+, 1, 0725e5755e314, -, 60) },
+    { HEX_DBL(+, 1, 01eba93a97b12, +, 0), HEX_DBL(-, 1, 6155b1d99f603, -, 7),
+      HEX_DBL(+, 1, 4bcea073117f4, -, 60) },
+    { HEX_DBL(+, 1, 01db6c9029cd1, +, 0), HEX_DBL(-, 1, 55b54153137ff, -, 7),
+      HEX_DBL(+, 1, 21e8faccad0ec, -, 61) },
+    { HEX_DBL(+, 1, 01cb31f0f534c, +, 0), HEX_DBL(-, 1, 4a158c27245bd, -, 7),
+      HEX_DBL(+, 1, 1a5b7bfbf35d3, -, 60) },
+    { HEX_DBL(+, 1, 01baf95c9723c, +, 0), HEX_DBL(-, 1, 3e76923e3d678, -, 7),
+      HEX_DBL(+, 1, eee400eb5fe34, -, 62) },
+    { HEX_DBL(+, 1, 01aac2d2acee6, +, 0), HEX_DBL(-, 1, 32d85380ce776, -, 7),
+      HEX_DBL(+, 1, cbf7a513937bd, -, 61) },
+    { HEX_DBL(+, 1, 019a8e52d401e, +, 0), HEX_DBL(-, 1, 273acfd74be72, -, 7),
+      HEX_DBL(+, 1, 5c64599efa5e6, -, 60) },
+    { HEX_DBL(+, 1, 018a5bdca9e42, +, 0), HEX_DBL(-, 1, 1b9e072a2e65, -, 7),
+      HEX_DBL(+, 1, 364180e0a5d37, -, 60) },
+    { HEX_DBL(+, 1, 017a2b6fcc33e, +, 0), HEX_DBL(-, 1, 1001f961f3243, -, 7),
+      HEX_DBL(+, 1, 63d795746f216, -, 60) },
+    { HEX_DBL(+, 1, 0169fd0bd8a8a, +, 0), HEX_DBL(-, 1, 0466a6671bca4, -, 7),
+      HEX_DBL(+, 1, 4c99ff1907435, -, 60) },
+    { HEX_DBL(+, 1, 0159d0b06d129, +, 0), HEX_DBL(-, 1, f1981c445cd05, -, 8),
+      HEX_DBL(+, 1, 4bfff6366b723, -, 62) },
+    { HEX_DBL(+, 1, 0149a65d275a6, +, 0), HEX_DBL(-, 1, da6460f76ab8c, -, 8),
+      HEX_DBL(+, 1, 9c5404f47589c, -, 61) },
+    { HEX_DBL(+, 1, 01397e11a581b, +, 0), HEX_DBL(-, 1, c3321ab87f4ef, -, 8),
+      HEX_DBL(+, 1, c0da537429cea, -, 61) },
+    { HEX_DBL(+, 1, 012957cd85a28, +, 0), HEX_DBL(-, 1, ac014958c112c, -, 8),
+      HEX_DBL(+, 1, 000c2a1b595e3, -, 64) },
+    { HEX_DBL(+, 1, 0119339065ef7, +, 0), HEX_DBL(-, 1, 94d1eca95f67a, -, 8),
+      HEX_DBL(+, 1, d8d20b0564d5, -, 61) },
+    { HEX_DBL(+, 1, 01091159e4b3d, +, 0), HEX_DBL(-, 1, 7da4047b92b3e, -, 8),
+      HEX_DBL(+, 1, 6194a5d68cf2, -, 66) },
+    { HEX_DBL(+, 1, 00f8f129a0535, +, 0), HEX_DBL(-, 1, 667790a09bf77, -, 8),
+      HEX_DBL(+, 1, ca230e0bea645, -, 61) },
+    { HEX_DBL(+, 1, 00e8d2ff374a1, +, 0), HEX_DBL(-, 1, 4f4c90e9c4ead, -, 8),
+      HEX_DBL(+, 1, 1de3e7f350c1, -, 61) },
+    { HEX_DBL(+, 1, 00d8b6da482ce, +, 0), HEX_DBL(-, 1, 3823052860649, -, 8),
+      HEX_DBL(+, 1, 5789b4c5891b8, -, 64) },
+    { HEX_DBL(+, 1, 00c89cba71a8c, +, 0), HEX_DBL(-, 1, 20faed2dc9a9e, -, 8),
+      HEX_DBL(+, 1, 9e7c40f9839fd, -, 62) },
+    { HEX_DBL(+, 1, 00b8849f52834, +, 0), HEX_DBL(-, 1, 09d448cb65014, -, 8),
+      HEX_DBL(+, 1, 387e3e9b6d02, -, 62) },
+    { HEX_DBL(+, 1, 00a86e88899a4, +, 0), HEX_DBL(-, 1, e55e2fa53ebf1, -, 9),
+      HEX_DBL(+, 1, cdaa71fddfddf, -, 62) },
+    { HEX_DBL(+, 1, 00985a75b5e3f, +, 0), HEX_DBL(-, 1, b716b429dce0f, -, 9),
+      HEX_DBL(+, 1, 2f2af081367bf, -, 63) },
+    { HEX_DBL(+, 1, 00884866766ee, +, 0), HEX_DBL(-, 1, 88d21ec7a16d7, -, 9),
+      HEX_DBL(+, 1, fb95c228d6f16, -, 62) },
+    { HEX_DBL(+, 1, 0078385a6a61d, +, 0), HEX_DBL(-, 1, 5a906f219a9e8, -, 9),
+      HEX_DBL(+, 1, 18aff10a89f29, -, 64) },
+    { HEX_DBL(+, 1, 00682a5130fbe, +, 0), HEX_DBL(-, 1, 2c51a4dae87f1, -, 9),
+      HEX_DBL(+, 1, bcc7e33ddde3, -, 63) },
+    { HEX_DBL(+, 1, 00581e4a69944, +, 0), HEX_DBL(-, 1, fc2b7f2d782b1, -, 10),
+      HEX_DBL(+, 1, fe3ef3300a9fa, -, 64) },
+    { HEX_DBL(+, 1, 00481445b39a8, +, 0), HEX_DBL(-, 1, 9fb97df0b0b83, -, 10),
+      HEX_DBL(+, 1, 0d9a601f2f324, -, 65) },
+    { HEX_DBL(+, 1, 00380c42ae963, +, 0), HEX_DBL(-, 1, 434d4546227ae, -, 10),
+      HEX_DBL(+, 1, 0b9b6a5868f33, -, 63) },
+    { HEX_DBL(+, 1, 00280640fa271, +, 0), HEX_DBL(-, 1, cdcda8e930c19, -, 11),
+      HEX_DBL(+, 1, 3d424ab39f789, -, 64) },
+    { HEX_DBL(+, 1, 0018024036051, +, 0), HEX_DBL(-, 1, 150c558601261, -, 11),
+      HEX_DBL(+, 1, 285bb90327a0f, -, 64) },
+    { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
+    { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
+    { HEX_DBL(+, 1, ffa011fca0a1e, -, 1), HEX_DBL(+, 1, 14e5640c4197b, -, 10),
+      HEX_DBL(+, 1, 95728136ae401, -, 63) },
+    { HEX_DBL(+, 1, ff6031f064e07, -, 1), HEX_DBL(+, 1, cd61806bf532d, -, 10),
+      HEX_DBL(+, 1, 568a4f35d8538, -, 63) },
+    { HEX_DBL(+, 1, ff2061d532b9c, -, 1), HEX_DBL(+, 1, 42e34af550eda, -, 9),
+      HEX_DBL(+, 1, 8f69cee55fec, -, 62) },
+    { HEX_DBL(+, 1, fee0a1a513253, -, 1), HEX_DBL(+, 1, 9f0a5523902ea, -, 9),
+      HEX_DBL(+, 1, daec734b11615, -, 63) },
+    { HEX_DBL(+, 1, fea0f15a12139, -, 1), HEX_DBL(+, 1, fb25e19f11b26, -, 9),
+      HEX_DBL(+, 1, 8bafca62941da, -, 62) },
+    { HEX_DBL(+, 1, fe6150ee3e6d4, -, 1), HEX_DBL(+, 1, 2b9af9a28e282, -, 8),
+      HEX_DBL(+, 1, 0fd3674e1dc5b, -, 61) },
+    { HEX_DBL(+, 1, fe21c05baa109, -, 1), HEX_DBL(+, 1, 599d4678f24b9, -, 8),
+      HEX_DBL(+, 1, dafce1f09937b, -, 61) },
+    { HEX_DBL(+, 1, fde23f9c69cf9, -, 1), HEX_DBL(+, 1, 8799d8c046eb, -, 8),
+      HEX_DBL(+, 1, ffa0ce0bdd217, -, 65) },
+    { HEX_DBL(+, 1, fda2ceaa956e8, -, 1), HEX_DBL(+, 1, b590b1e5951ee, -, 8),
+      HEX_DBL(+, 1, 645a769232446, -, 62) },
+    { HEX_DBL(+, 1, fd636d8047a1f, -, 1), HEX_DBL(+, 1, e381d3555dbcf, -, 8),
+      HEX_DBL(+, 1, 882320d368331, -, 61) },
+    { HEX_DBL(+, 1, fd241c179e0cc, -, 1), HEX_DBL(+, 1, 08b69f3dccde, -, 7),
+      HEX_DBL(+, 1, 01ad5065aba9e, -, 61) },
+    { HEX_DBL(+, 1, fce4da6ab93e8, -, 1), HEX_DBL(+, 1, 1fa97a61dd298, -, 7),
+      HEX_DBL(+, 1, 84cd1f931ae34, -, 60) },
+    { HEX_DBL(+, 1, fca5a873bcb19, -, 1), HEX_DBL(+, 1, 36997bcc54a3f, -, 7),
+      HEX_DBL(+, 1, 1485e97eaee03, -, 60) },
+    { HEX_DBL(+, 1, fc66862ccec93, -, 1), HEX_DBL(+, 1, 4d86a43264a4f, -, 7),
+      HEX_DBL(+, 1, c75e63370988b, -, 61) },
+    { HEX_DBL(+, 1, fc27739018cfe, -, 1), HEX_DBL(+, 1, 6470f448fb09d, -, 7),
+      HEX_DBL(+, 1, d7361eeaed0a1, -, 65) },
+    { HEX_DBL(+, 1, fbe87097c6f5a, -, 1), HEX_DBL(+, 1, 7b586cc4c2523, -, 7),
+      HEX_DBL(+, 1, b3df952cc473c, -, 61) },
+    { HEX_DBL(+, 1, fba97d3e084dd, -, 1), HEX_DBL(+, 1, 923d0e5a21e06, -, 7),
+      HEX_DBL(+, 1, cf56c7b64ae5d, -, 62) },
+    { HEX_DBL(+, 1, fb6a997d0ecdc, -, 1), HEX_DBL(+, 1, a91ed9bd3df9a, -, 7),
+      HEX_DBL(+, 1, b957bdcd89e43, -, 61) },
+    { HEX_DBL(+, 1, fb2bc54f0f4ab, -, 1), HEX_DBL(+, 1, bffdcfa1f7fbb, -, 7),
+      HEX_DBL(+, 1, ea8cad9a21771, -, 62) },
+    { HEX_DBL(+, 1, faed00ae41783, -, 1), HEX_DBL(+, 1, d6d9f0bbee6f6, -, 7),
+      HEX_DBL(+, 1, 5762a9af89c82, -, 60) },
+    { HEX_DBL(+, 1, faae4b94dfe64, -, 1), HEX_DBL(+, 1, edb33dbe7d335, -, 7),
+      HEX_DBL(+, 1, 21e24fc245697, -, 62) },
+    { HEX_DBL(+, 1, fa6fa5fd27ff8, -, 1), HEX_DBL(+, 1, 0244dbae5ed05, -, 6),
+      HEX_DBL(+, 1, 12ef51b967102, -, 60) },
+    { HEX_DBL(+, 1, fa310fe15a078, -, 1), HEX_DBL(+, 1, 0daeaf24c3529, -, 6),
+      HEX_DBL(+, 1, 10d3cfca60b45, -, 59) },
+    { HEX_DBL(+, 1, f9f2893bb9192, -, 1), HEX_DBL(+, 1, 1917199bb66bc, -, 6),
+      HEX_DBL(+, 1, 6cf6034c32e19, -, 60) },
+    { HEX_DBL(+, 1, f9b412068b247, -, 1), HEX_DBL(+, 1, 247e1b6c615d5, -, 6),
+      HEX_DBL(+, 1, 42f0fffa229f7, -, 61) },
+    { HEX_DBL(+, 1, f975aa3c18ed6, -, 1), HEX_DBL(+, 1, 2fe3b4efcc5ad, -, 6),
+      HEX_DBL(+, 1, 70106136a8919, -, 60) },
+    { HEX_DBL(+, 1, f93751d6ae09b, -, 1), HEX_DBL(+, 1, 3b47e67edea93, -, 6),
+      HEX_DBL(+, 1, 38dd5a4f6959a, -, 59) },
+    { HEX_DBL(+, 1, f8f908d098df6, -, 1), HEX_DBL(+, 1, 46aab0725ea6c, -, 6),
+      HEX_DBL(+, 1, 821fc1e799e01, -, 60) },
+    { HEX_DBL(+, 1, f8bacf242aa2c, -, 1), HEX_DBL(+, 1, 520c1322f1e4e, -, 6),
+      HEX_DBL(+, 1, 129dcda3ad563, -, 60) },
+    { HEX_DBL(+, 1, f87ca4cbb755, -, 1), HEX_DBL(+, 1, 5d6c0ee91d2ab, -, 6),
+      HEX_DBL(+, 1, c5b190c04606e, -, 62) },
+    { HEX_DBL(+, 1, f83e89c195c25, -, 1), HEX_DBL(+, 1, 68caa41d448c3, -, 6),
+      HEX_DBL(+, 1, 4723441195ac9, -, 59) }
 };
 
 static double __loglTable3[8][3] = {
-{HEX_DBL( +, 1, 000e00c40ab89, +, 0 ), HEX_DBL( -, 1, 4332be0032168, -, 12 ), HEX_DBL( +, 1, a1003588d217a, -, 65 )},
-{HEX_DBL( +, 1, 000a006403e82, +, 0 ), HEX_DBL( -, 1, cdb2987366fcc, -, 13 ), HEX_DBL( +, 1, 5c86001294bbc, -, 67 )},
-{HEX_DBL( +, 1, 0006002400d8,  +, 0 ), HEX_DBL( -, 1, 150297c90fa6f, -, 13 ), HEX_DBL( +, 1, 01fb4865fae32, -, 66 )},
-{HEX_DBL( +, 1, 0,             +, 0 ), HEX_DBL( +, 0, 0,             +,  0 ), HEX_DBL( +, 0, 0,             +,  0 )},
-{HEX_DBL( +, 1, 0,             +, 0 ), HEX_DBL( +, 0, 0,             +,  0 ), HEX_DBL( +, 0, 0,             +,  0 )},
-{HEX_DBL( +, 1, ffe8011ff280a, -, 1 ), HEX_DBL( +, 1, 14f8daf5e3d3b, -, 12 ), HEX_DBL( +, 1, 3c933b4b6b914, -, 68 )},
-{HEX_DBL( +, 1, ffd8031fc184e, -, 1 ), HEX_DBL( +, 1, cd978c38042bb, -, 12 ), HEX_DBL( +, 1, 10f8e642e66fd, -, 65 )},
-{HEX_DBL( +, 1, ffc8061f5492b, -, 1 ), HEX_DBL( +, 1, 43183c878274e, -, 11 ), HEX_DBL( +, 1, 5885dd1eb6582, -, 65 )}
+    { HEX_DBL(+, 1, 000e00c40ab89, +, 0), HEX_DBL(-, 1, 4332be0032168, -, 12),
+      HEX_DBL(+, 1, a1003588d217a, -, 65) },
+    { HEX_DBL(+, 1, 000a006403e82, +, 0), HEX_DBL(-, 1, cdb2987366fcc, -, 13),
+      HEX_DBL(+, 1, 5c86001294bbc, -, 67) },
+    { HEX_DBL(+, 1, 0006002400d8, +, 0), HEX_DBL(-, 1, 150297c90fa6f, -, 13),
+      HEX_DBL(+, 1, 01fb4865fae32, -, 66) },
+    { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
+    { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) },
+    { HEX_DBL(+, 1, ffe8011ff280a, -, 1), HEX_DBL(+, 1, 14f8daf5e3d3b, -, 12),
+      HEX_DBL(+, 1, 3c933b4b6b914, -, 68) },
+    { HEX_DBL(+, 1, ffd8031fc184e, -, 1), HEX_DBL(+, 1, cd978c38042bb, -, 12),
+      HEX_DBL(+, 1, 10f8e642e66fd, -, 65) },
+    { HEX_DBL(+, 1, ffc8061f5492b, -, 1), HEX_DBL(+, 1, 43183c878274e, -, 11),
+      HEX_DBL(+, 1, 5885dd1eb6582, -, 65) }
 };
 
 static void __log2_ep(double *hi, double *lo, double x)
 {
-    union { uint64_t i; double d; } uu;
+    union {
+        uint64_t i;
+        double d;
+    } uu;
 
     int m;
     double f = reference_frexp(x, &m);
 
     // bring f in [0.75, 1.5)
-    if( f < 0.75 ) {
+    if (f < 0.75)
+    {
         f *= 2.0;
         m -= 1;
     }
 
     // index first table .... brings down to [1-2^-7, 1+2^6)
     uu.d = f;
-    int index = (int) (((uu.i + ((uint64_t) 1 << 51)) & 0x000fc00000000000ULL) >> 46);
+    int index =
+        (int)(((uu.i + ((uint64_t)1 << 51)) & 0x000fc00000000000ULL) >> 46);
     double r1 = __loglTable1[index][0];
     double logr1hi = __loglTable1[index][1];
     double logr1lo = __loglTable1[index][2];
-    // since log1rhi has 39 bits of precision, we have 14 bit in hand ... since |m| <= 1023
-    // which needs 10bits at max, we can directly add m to log1hi without spilling
+    // since logr1hi has 39 bits of precision, we have 14 bits in hand ... since
+    // |m| <= 1023 which needs 10 bits at max, we can directly add m to logr1hi
+    // without spilling
     logr1hi += m;
 
-    // argument reduction needs to be in double-double since reduced argument will form the
-    // leading term of polynomial approximation which sets the precision we eventually achieve
+    // argument reduction needs to be in double-double since reduced argument
+    // will form the leading term of polynomial approximation which sets the
+    // precision we eventually achieve
     double zhi, zlo;
     MulD(&zhi, &zlo, r1, uu.d);
 
     // second index table .... brings down to [1-2^-12, 1+2^-11)
     uu.d = zhi;
-    index = (int) (((uu.i + ((uint64_t) 1 << 46)) & 0x00007e0000000000ULL) >> 41);
+    index = (int)(((uu.i + ((uint64_t)1 << 46)) & 0x00007e0000000000ULL) >> 41);
     double r2 = __loglTable2[index][0];
     double logr2hi = __loglTable2[index][1];
     double logr2lo = __loglTable2[index][2];
@@ -4186,11 +4754,12 @@ static void __log2_ep(double *hi, double *lo, double x)
     // Actually reduction to 2^-11 would have been sufficient to calculate
     // second order term in polynomial in double rather than double-double, I
     // reduced it a bit more to make sure other systematic arithmetic errors
-    // are guarded against .... also this allow lower order product of leading polynomial
-    // term i.e. Ao_hi*z_lo + Ao_lo*z_hi to be done in double rather than double-double ...
-    // hence only term that needs to be done in double-double is Ao_hi*z_hi
+    // are guarded against .... also this allows the lower order product of
+    // the leading polynomial term i.e. Ao_hi*z_lo + Ao_lo*z_hi to be done in
+    // double rather than double-double ... hence the only term that needs to
+    // be done in double-double is Ao_hi*z_hi
     uu.d = zhi;
-    index = (int) (((uu.i + ((uint64_t) 1 << 41)) & 0x0000038000000000ULL) >> 39);
+    index = (int)(((uu.i + ((uint64_t)1 << 41)) & 0x0000038000000000ULL) >> 39);
     double r3 = __loglTable3[index][0];
     double logr3hi = __loglTable3[index][1];
     double logr3lo = __loglTable3[index][2];
@@ -4202,34 +4771,36 @@ static void __log2_ep(double *hi, double *lo, double x)
     AddDD(&log2hi, &log2lo, logr1hi, logr1lo, logr2hi, logr2lo);
     AddDD(&log2hi, &log2lo, logr3hi, logr3lo, log2hi, log2lo);
 
-    // final argument reduction .... zhi will be in [1-2^-14, 1+2^-13) after this
+    // final argument reduction .... zhi will be in [1-2^-14, 1+2^-13) after
+    // this
     MulDD(&zhi, &zlo, zhi, zlo, r3, 0.0);
-    // we dont need to do full double-double substract here. substracting 1.0 for higher
-    // term is exact
+    // we don't need a full double-double subtraction here. subtracting 1.0
+    // from the higher term is exact
     zhi = zhi - 1.0;
     // normalize
     AddD(&zhi, &zlo, zhi, zlo);
 
     // polynomail fitting to compute log2(1 + z) ... forth order polynomial fit
-    // to log2(1 + z)/z gives minimax absolute error of O(2^-76) with z in [-2^-14, 2^-13]
-    // log2(1 + z)/z = Ao + A1*z + A2*z^2 + A3*z^3 + A4*z^4
+    // to log2(1 + z)/z gives minimax absolute error of O(2^-76) with z in
+    // [-2^-14, 2^-13]: log2(1 + z)/z = Ao + A1*z + A2*z^2 + A3*z^3 + A4*z^4
     // => log2(1 + z) = Ao*z + A1*z^2 + A2*z^3 + A3*z^4 + A4*z^5
-    // => log2(1 + z) = (Aohi + Aolo)*(zhi + zlo) + z^2*(A1 + A2*z + A3*z^2 + A4*z^3)
-    // since we are looking for at least 64 digits of precision and z in [-2^-14, 2^-13], final term
-    // can be done in double .... also Aolo*zhi + Aohi*zlo can be done in double ....
-    // Aohi*zhi needs to be done in double-double
+    // => log2(1 + z) = (Aohi + Aolo)*(zhi + zlo)
+    //                  + z^2*(A1 + A2*z + A3*z^2 + A4*z^3)
+    // Since we are looking for at least 64 digits of precision and z is in
+    // [-2^-14, 2^-13], the final term and Aolo*zhi + Aohi*zlo can be done in
+    // double; only Aohi*zhi needs to be done in double-double.
 
-    double Aohi = HEX_DBL( +, 1, 71547652b82fe, +, 0 );
-    double Aolo = HEX_DBL( +, 1, 777c9cbb675c, -, 56 );
+    double Aohi = HEX_DBL(+, 1, 71547652b82fe, +, 0);
+    double Aolo = HEX_DBL(+, 1, 777c9cbb675c, -, 56);
     double y;
-    y = HEX_DBL( +, 1, 276d2736fade7, -, 2 );
-    y = HEX_DBL( -, 1, 7154765782df1, -, 2 ) + y*zhi;
-    y = HEX_DBL( +, 1, ec709dc3a0f67, -, 2 ) + y*zhi;
-    y = HEX_DBL( -, 1, 71547652b82fe, -, 1 ) + y*zhi;
-    double zhisq = zhi*zhi;
-    y = y*zhisq;
-    y = y + zhi*Aolo;
-    y = y + zlo*Aohi;
+    y = HEX_DBL(+, 1, 276d2736fade7, -, 2);
+    y = HEX_DBL(-, 1, 7154765782df1, -, 2) + y * zhi;
+    y = HEX_DBL(+, 1, ec709dc3a0f67, -, 2) + y * zhi;
+    y = HEX_DBL(-, 1, 71547652b82fe, -, 1) + y * zhi;
+    double zhisq = zhi * zhi;
+    y = y * zhisq;
+    y = y + zhi * Aolo;
+    y = y + zlo * Aohi;
 
     MulD(&zhi, &zlo, Aohi, zhi);
     AddDD(&zhi, &zlo, zhi, zlo, y, 0.0);
@@ -4239,7 +4810,7 @@ static void __log2_ep(double *hi, double *lo, double x)
     *lo = zlo;
 }
 
-long double reference_powl( long double x, long double y )
+long double reference_powl(long double x, long double y)
 {
 
 
@@ -4256,174 +4827,166 @@ long double reference_powl( long double x, long double y )
     // causes errors. So we need to tread y as long double and convert it
     // to hi, lo doubles when performing y*log2(x).
 
-//    double x = (double) xx;
-//    double y = (double) yy;
+    //    double x = (double) xx;
+    //    double y = (double) yy;
 
-    static const double neg_epsilon = HEX_DBL( +, 1, 0, +, 53 );
+    static const double neg_epsilon = HEX_DBL(+, 1, 0, +, 53);
 
-    //if x = 1, return x for any y, even NaN
-    if( x == 1.0 )
-        return x;
+    // if x = 1, return x for any y, even NaN
+    if (x == 1.0) return x;
 
-    //if y == 0, return 1 for any x, even NaN
-    if( y == 0.0 )
-        return 1.0L;
+    // if y == 0, return 1 for any x, even NaN
+    if (y == 0.0) return 1.0L;
 
-    //get NaNs out of the way
-    if( x != x  || y != y )
-        return x + y;
+    // get NaNs out of the way
+    if (x != x || y != y) return x + y;
 
-    //do the work required to sort out edge cases
-    double fabsy = reference_fabs( y );
-    double fabsx = reference_fabs( x );
-    double iy = reference_rint( fabsy );            //we do round to nearest here so that |fy| <= 0.5
-    if( iy > fabsy )//convert nearbyint to floor
+    // do the work required to sort out edge cases
+    double fabsy = reference_fabs(y);
+    double fabsx = reference_fabs(x);
+    double iy = reference_rint(
+        fabsy); // we do round to nearest here so that |fy| <= 0.5
+    if (iy > fabsy) // convert nearbyint to floor
         iy -= 1.0;
     int isOddInt = 0;
-    if( fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon )
-        isOddInt =     (int) (iy - 2.0 * rint( 0.5 * iy ));        //might be 0, -1, or 1
+    if (fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon)
+        isOddInt = (int)(iy - 2.0 * rint(0.5 * iy)); // might be 0, -1, or 1
 
-    ///test a few more edge cases
-    //deal with x == 0 cases
-    if( x == 0.0 )
+    /// test a few more edge cases
+    // deal with x == 0 cases
+    if (x == 0.0)
     {
-        if( ! isOddInt )
-            x = 0.0;
+        if (!isOddInt) x = 0.0;
 
-        if( y < 0 )
-            x = 1.0/ x;
+        if (y < 0) x = 1.0 / x;
 
         return x;
     }
 
-    //x == +-Inf cases
-    if( isinf(fabsx) )
+    // x == +-Inf cases
+    if (isinf(fabsx))
     {
-        if( x < 0 )
+        if (x < 0)
         {
-            if( isOddInt )
+            if (isOddInt)
             {
-                if( y < 0 )
+                if (y < 0)
                     return -0.0;
                 else
                     return -INFINITY;
             }
             else
             {
-                if( y < 0 )
+                if (y < 0)
                     return 0.0;
                 else
                     return INFINITY;
             }
         }
 
-        if( y < 0 )
-            return 0;
+        if (y < 0) return 0;
         return INFINITY;
     }
 
-    //y = +-inf cases
-    if( isinf(fabsy) )
+    // y = +-inf cases
+    if (isinf(fabsy))
     {
-        if( x == -1 )
-            return 1;
+        if (x == -1) return 1;
 
-        if( y < 0 )
+        if (y < 0)
         {
-            if( fabsx < 1 )
-                return INFINITY;
+            if (fabsx < 1) return INFINITY;
             return 0;
         }
-        if( fabsx < 1 )
-            return 0;
+        if (fabsx < 1) return 0;
         return INFINITY;
     }
 
     // x < 0 and y non integer case
-    if( x < 0 && iy != fabsy )
+    if (x < 0 && iy != fabsy)
     {
-        //return nan;
+        // return nan;
         return cl_make_nan();
     }
 
-    //speedy resolution of sqrt and reciprocal sqrt
-    if( fabsy == 0.5 )
+    // speedy resolution of sqrt and reciprocal sqrt
+    if (fabsy == 0.5)
     {
-        long double xl = sqrtl( x );
-        if( y < 0 )
-            xl = 1.0/ xl;
+        long double xl = sqrtl(x);
+        if (y < 0) xl = 1.0 / xl;
         return xl;
     }
 
     double log2x_hi, log2x_lo;
 
-    // extended precision log .... accurate to at least 64-bits + couple of guard bits
+    // extended precision log .... accurate to at least 64-bits + couple of
+    // guard bits
     __log2_ep(&log2x_hi, &log2x_lo, fabsx);
 
     double ylog2x_hi, ylog2x_lo;
 
-    double y_hi = (double) y;
-    double y_lo = (double) ( y - (long double) y_hi);
+    double y_hi = (double)y;
+    double y_lo = (double)(y - (long double)y_hi);
 
     // compute product of y*log2(x)
     // scale to avoid overflow in double-double multiplication
-    if( reference_fabs( y ) > HEX_DBL( +, 1, 0, +, 970 ) ) {
+    if (reference_fabs(y) > HEX_DBL(+, 1, 0, +, 970))
+    {
         y_hi = reference_ldexp(y_hi, -53);
         y_lo = reference_ldexp(y_lo, -53);
     }
     MulDD(&ylog2x_hi, &ylog2x_lo, log2x_hi, log2x_lo, y_hi, y_lo);
-    if( fabs( y ) > HEX_DBL( +, 1, 0, +, 970 ) ) {
+    if (fabs(y) > HEX_DBL(+, 1, 0, +, 970))
+    {
         ylog2x_hi = reference_ldexp(ylog2x_hi, 53);
         ylog2x_lo = reference_ldexp(ylog2x_lo, 53);
     }
 
     long double powxy;
-    if(isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200)) {
-        powxy = reference_signbit(ylog2x_hi) ? HEX_DBL( +, 0, 0, +, 0 ) : INFINITY;
-    } else {
+    if (isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200))
+    {
+        powxy =
+            reference_signbit(ylog2x_hi) ? HEX_DBL(+, 0, 0, +, 0) : INFINITY;
+    }
+    else
+    {
         // separate integer + fractional part
         long int m = lrint(ylog2x_hi);
         AddDD(&ylog2x_hi, &ylog2x_lo, ylog2x_hi, ylog2x_lo, -m, 0.0);
 
         // revert to long double arithemtic
-        long double ylog2x = (long double) ylog2x_hi + (long double) ylog2x_lo;
-        long double tmp = reference_exp2l( ylog2x );
+        long double ylog2x = (long double)ylog2x_hi + (long double)ylog2x_lo;
+        long double tmp = reference_exp2l(ylog2x);
         powxy = reference_scalblnl(tmp, m);
     }
 
     // if y is odd integer and x is negative, reverse sign
-    if( isOddInt & reference_signbit(x))
-        powxy = -powxy;
+    if (isOddInt & reference_signbit(x)) powxy = -powxy;
     return powxy;
 }
 
 double reference_nextafter(double xx, double yy)
 {
-    float x = (float) xx;
-    float y = (float) yy;
+    float x = (float)xx;
+    float y = (float)yy;
 
     // take care of nans
-    if( x != x )
-        return x;
+    if (x != x) return x;
 
-    if( y != y )
-        return y;
+    if (y != y) return y;
 
-    if( x == y )
-        return y;
+    if (x == y) return y;
 
     int32f_t a, b;
 
-    a.f  = x;
-    b.f  = y;
+    a.f = x;
+    b.f = y;
 
-    if( a.i & 0x80000000 )
-        a.i = 0x80000000 - a.i;
-    if(b.i & 0x80000000 )
-        b.i = 0x80000000 - b.i;
+    if (a.i & 0x80000000) a.i = 0x80000000 - a.i;
+    if (b.i & 0x80000000) b.i = 0x80000000 - b.i;
 
     a.i += (a.i < b.i) ? 1 : -1;
-    a.i = (a.i < 0) ? (cl_int) 0x80000000 - a.i : a.i;
+    a.i = (a.i < 0) ? (cl_int)0x80000000 - a.i : a.i;
 
     return a.f;
 }
@@ -4431,33 +4994,28 @@ double reference_nextafter(double xx, double yy)
 
 long double reference_nextafterl(long double xx, long double yy)
 {
-    double x = (double) xx;
-    double y = (double) yy;
+    double x = (double)xx;
+    double y = (double)yy;
 
     // take care of nans
-    if( x != x )
-        return x;
+    if (x != x) return x;
 
-    if( y != y )
-        return y;
+    if (y != y) return y;
 
     int64d_t a, b;
 
-    a.d  = x;
-    b.d  = y;
+    a.d = x;
+    b.d = y;
 
     int64_t tmp = 0x8000000000000000LL;
 
-    if( a.l & tmp )
-        a.l = tmp - a.l;
-    if(b.l & tmp )
-        b.l = tmp - b.l;
+    if (a.l & tmp) a.l = tmp - a.l;
+    if (b.l & tmp) b.l = tmp - b.l;
 
-    // edge case. if (x == y) or (x = 0.0f and y = -0.0f) or (x = -0.0f and y = 0.0f)
-    // test needs to be done using integer rep because
-    // subnormals may be flushed to zero on some platforms
-    if( a.l == b.l )
-        return y;
+    // edge case: (x == y), (x = 0.0f and y = -0.0f) or (x = -0.0f and
+    // y = 0.0f). The test needs to be done using the integer representation
+    // because subnormals may be flushed to zero on some platforms
+    if (a.l == b.l) return y;
 
     a.l += (a.l < b.l) ? 1 : -1;
     a.l = (a.l < 0) ? tmp - a.l : a.l;
@@ -4467,112 +5025,110 @@ long double reference_nextafterl(long double xx, long double yy)
 
 double reference_fdim(double xx, double yy)
 {
-    float x = (float) xx;
-    float y = (float) yy;
+    float x = (float)xx;
+    float y = (float)yy;
 
-    if( x != x )
-        return x;
+    if (x != x) return x;
 
-    if( y != y )
-        return y;
+    if (y != y) return y;
 
-    float r = ( x > y ) ? (float) reference_subtract( x, y) : 0.0f;
+    float r = (x > y) ? (float)reference_subtract(x, y) : 0.0f;
     return r;
-
 }
 
 
 long double reference_fdiml(long double xx, long double yy)
 {
-    double x = (double) xx;
-    double y = (double) yy;
+    double x = (double)xx;
+    double y = (double)yy;
 
-    if( x != x )
-        return x;
+    if (x != x) return x;
 
-    if( y != y )
-        return y;
+    if (y != y) return y;
 
-    double r = ( x > y ) ? (double) reference_subtractl(x, y) : 0.0;
+    double r = (x > y) ? (double)reference_subtractl(x, y) : 0.0;
     return r;
 }
 
 double reference_remquo(double xd, double yd, int *n)
 {
-    float xx = (float) xd;
-    float yy = (float) yd;
+    float xx = (float)xd;
+    float yy = (float)yd;
 
-    if( isnan(xx) || isnan(yy) ||
-        fabsf(xx) == INFINITY  ||
-        yy == 0.0 )
+    if (isnan(xx) || isnan(yy) || fabsf(xx) == INFINITY || yy == 0.0)
     {
         *n = 0;
         return cl_make_nan();
     }
 
-    if( fabsf(yy) == INFINITY || xx == 0.0f ) {
+    if (fabsf(yy) == INFINITY || xx == 0.0f)
+    {
         *n = 0;
         return xd;
     }
 
-    if( fabsf(xx) == fabsf(yy) ) {
+    if (fabsf(xx) == fabsf(yy))
+    {
         *n = (xx == yy) ? 1 : -1;
-        return reference_signbit( xx ) ? -0.0 : 0.0;
+        return reference_signbit(xx) ? -0.0 : 0.0;
     }
 
-    int signx = reference_signbit( xx ) ? -1 : 1;
-    int signy = reference_signbit( yy ) ? -1 : 1;
+    int signx = reference_signbit(xx) ? -1 : 1;
+    int signy = reference_signbit(yy) ? -1 : 1;
     int signn = (signx == signy) ? 1 : -1;
     float x = fabsf(xx);
     float y = fabsf(yy);
 
     int ex, ey;
-    ex = reference_ilogb( x );
-    ey = reference_ilogb( y );
+    ex = reference_ilogb(x);
+    ey = reference_ilogb(y);
     float xr = x;
     float yr = y;
     uint32_t q = 0;
 
-    if(ex-ey >= -1) {
+    if (ex - ey >= -1)
+    {
 
-        yr = (float) reference_ldexp( y, -ey );
-        xr = (float) reference_ldexp( x, -ex );
+        yr = (float)reference_ldexp(y, -ey);
+        xr = (float)reference_ldexp(x, -ex);
 
-        if(ex-ey >= 0) {
+        if (ex - ey >= 0)
+        {
 
 
             int i;
-            for(i = ex-ey; i > 0; i--) {
+            for (i = ex - ey; i > 0; i--)
+            {
                 q <<= 1;
-                if(xr >= yr) {
+                if (xr >= yr)
+                {
                     xr -= yr;
                     q += 1;
                 }
                 xr += xr;
             }
             q <<= 1;
-            if( xr > yr ) {
+            if (xr > yr)
+            {
                 xr -= yr;
                 q += 1;
             }
         }
-        else //ex-ey = -1
-            xr = reference_ldexp(xr, ex-ey);
+        else // ex-ey = -1
+            xr = reference_ldexp(xr, ex - ey);
     }
 
-    if( (yr < 2.0f*xr) || ( (yr == 2.0f*xr) && (q & 0x00000001) ) ) {
+    if ((yr < 2.0f * xr) || ((yr == 2.0f * xr) && (q & 0x00000001)))
+    {
         xr -= yr;
         q += 1;
     }
 
-    if(ex-ey >= -1)
-        xr = reference_ldexp(xr, ey);
+    if (ex - ey >= -1) xr = reference_ldexp(xr, ey);
 
     int qout = q & 0x0000007f;
-    if( signn < 0)
-        qout = -qout;
-    if( xx < 0.0 )
-        xr = -xr;
+    if (signn < 0) qout = -qout;
+    if (xx < 0.0) xr = -xr;
 
     *n = qout;
 
@@ -4582,79 +5138,82 @@ double reference_remquo(double xd, double yd, int *n)
 long double reference_remquol(long double xd, long double yd, int *n)
 {
 
-    double xx = (double) xd;
-    double yy = (double) yd;
+    double xx = (double)xd;
+    double yy = (double)yd;
 
-    if( isnan(xx) || isnan(yy) ||
-        fabs(xx) == INFINITY  ||
-        yy == 0.0 )
+    if (isnan(xx) || isnan(yy) || fabs(xx) == INFINITY || yy == 0.0)
     {
         *n = 0;
         return cl_make_nan();
     }
 
-    if( reference_fabs(yy) == INFINITY || xx == 0.0 ) {
+    if (reference_fabs(yy) == INFINITY || xx == 0.0)
+    {
         *n = 0;
         return xd;
     }
 
-    if( reference_fabs(xx) == reference_fabs(yy) ) {
+    if (reference_fabs(xx) == reference_fabs(yy))
+    {
         *n = (xx == yy) ? 1 : -1;
-        return reference_signbit( xx ) ? -0.0 : 0.0;
+        return reference_signbit(xx) ? -0.0 : 0.0;
     }
 
-    int signx = reference_signbit( xx ) ? -1 : 1;
-    int signy = reference_signbit( yy ) ? -1 : 1;
+    int signx = reference_signbit(xx) ? -1 : 1;
+    int signy = reference_signbit(yy) ? -1 : 1;
     int signn = (signx == signy) ? 1 : -1;
     double x = reference_fabs(xx);
     double y = reference_fabs(yy);
 
     int ex, ey;
-    ex = reference_ilogbl( x );
-    ey = reference_ilogbl( y );
+    ex = reference_ilogbl(x);
+    ey = reference_ilogbl(y);
     double xr = x;
     double yr = y;
     uint32_t q = 0;
 
-    if(ex-ey >= -1) {
+    if (ex - ey >= -1)
+    {
 
-        yr = reference_ldexp( y, -ey );
-        xr = reference_ldexp( x, -ex );
+        yr = reference_ldexp(y, -ey);
+        xr = reference_ldexp(x, -ex);
         int i;
 
-        if(ex-ey >= 0) {
+        if (ex - ey >= 0)
+        {
 
-            for(i = ex-ey; i > 0; i--) {
+            for (i = ex - ey; i > 0; i--)
+            {
                 q <<= 1;
-                if(xr >= yr) {
+                if (xr >= yr)
+                {
                     xr -= yr;
                     q += 1;
                 }
                 xr += xr;
             }
             q <<= 1;
-            if( xr > yr ) {
+            if (xr > yr)
+            {
                 xr -= yr;
                 q += 1;
             }
         }
         else
-            xr = reference_ldexp(xr, ex-ey);
+            xr = reference_ldexp(xr, ex - ey);
     }
 
-    if( (yr < 2.0*xr) || ( (yr == 2.0*xr) && (q & 0x00000001) ) ) {
+    if ((yr < 2.0 * xr) || ((yr == 2.0 * xr) && (q & 0x00000001)))
+    {
         xr -= yr;
         q += 1;
     }
 
-    if(ex-ey >= -1)
-        xr = reference_ldexp(xr, ey);
+    if (ex - ey >= -1) xr = reference_ldexp(xr, ey);
 
     int qout = q & 0x0000007f;
-    if( signn < 0)
-        qout = -qout;
-    if( xx < 0.0 )
-        xr = -xr;
+    if (signn < 0) qout = -qout;
+    if (xx < 0.0) xr = -xr;
 
     *n = qout;
     return xr;
@@ -4662,27 +5221,27 @@ long double reference_remquol(long double xd, long double yd, int *n)
 
 static double reference_scalbn(double x, int n)
 {
-    if(reference_isinf(x) || reference_isnan(x) || x == 0.0)
-        return x;
+    if (reference_isinf(x) || reference_isnan(x) || x == 0.0) return x;
 
     int bias = 1023;
-    union { double d; cl_long l; } u;
-    u.d = (double) x;
+    union {
+        double d;
+        cl_long l;
+    } u;
+    u.d = (double)x;
     int e = (int)((u.l & 0x7ff0000000000000LL) >> 52);
-    if(e == 0)
+    if (e == 0)
     {
         u.l |= ((cl_long)1023 << 52);
         u.d -= 1.0;
         e = (int)((u.l & 0x7ff0000000000000LL) >> 52) - 1022;
     }
     e += n;
-    if(e >= 2047 || n >= 2098 )
-        return reference_copysign(INFINITY, x);
-    if(e < -51 || n <-2097 )
-        return reference_copysign(0.0, x);
-    if(e <= 0)
+    if (e >= 2047 || n >= 2098) return reference_copysign(INFINITY, x);
+    if (e < -51 || n < -2097) return reference_copysign(0.0, x);
+    if (e <= 0)
     {
-        bias += (e-1);
+        bias += (e - 1);
         e = 1;
     }
     u.l &= 0x800fffffffffffffLL;
@@ -4695,26 +5254,26 @@ static double reference_scalbn(double x, int n)
 static long double reference_scalblnl(long double x, long n)
 {
 #if defined(__i386__) || defined(__x86_64__) // INTEL
-    union
-    {
+    union {
         long double d;
-        struct{ cl_ulong m; cl_ushort sexp;}u;
-    }u;
+        struct
+        {
+            cl_ulong m;
+            cl_ushort sexp;
+        } u;
+    } u;
     u.u.m = CL_LONG_MIN;
 
-    if ( reference_isinf(x) )
-        return x;
+    if (reference_isinf(x)) return x;
 
-    if( x == 0.0L || n < -2200)
-        return reference_copysignl( 0.0L, x );
+    if (x == 0.0L || n < -2200) return reference_copysignl(0.0L, x);
 
-    if( n > 2200 )
-        return reference_copysignl( INFINITY, x );
+    if (n > 2200) return reference_copysignl(INFINITY, x);
 
-    if( n < 0 )
+    if (n < 0)
     {
         u.u.sexp = 0x3fff - 1022;
-        while( n <= -1022 )
+        while (n <= -1022)
         {
             x *= u.d;
             n += 1022;
@@ -4724,10 +5283,10 @@ static long double reference_scalblnl(long double x, long n)
         return x;
     }
 
-    if( n > 0 )
+    if (n > 0)
     {
         u.u.sexp = 0x3fff + 1023;
-        while( n >= 1023 )
+        while (n >= 1023)
         {
             x *= u.d;
             n -= 1023;
@@ -4742,27 +5301,27 @@ static long double reference_scalblnl(long double x, long n)
 #elif defined(__arm__) // ARM .. sizeof(long double) == sizeof(double)
 
 #if __DBL_MAX_EXP__ >= __LDBL_MAX_EXP__
-    if(reference_isinfl(x) || reference_isnanl(x))
-        return x;
+    if (reference_isinfl(x) || reference_isnanl(x)) return x;
 
     int bias = 1023;
-    union { double d; cl_long l; } u;
-    u.d = (double) x;
+    union {
+        double d;
+        cl_long l;
+    } u;
+    u.d = (double)x;
     int e = (int)((u.l & 0x7ff0000000000000LL) >> 52);
-    if(e == 0)
+    if (e == 0)
     {
         u.l |= ((cl_long)1023 << 52);
         u.d -= 1.0;
         e = (int)((u.l & 0x7ff0000000000000LL) >> 52) - 1022;
     }
     e += n;
-    if(e >= 2047)
-        return reference_copysignl(INFINITY, x);
-    if(e < -51)
-        return reference_copysignl(0.0, x);
-    if(e <= 0)
+    if (e >= 2047) return reference_copysignl(INFINITY, x);
+    if (e < -51) return reference_copysignl(0.0, x);
+    if (e <= 0)
     {
-        bias += (e-1);
+        bias += (e - 1);
         e = 1;
     }
     u.l &= 0x800fffffffffffffLL;
@@ -4772,284 +5331,259 @@ static long double reference_scalblnl(long double x, long n)
     return x * u.d;
 #endif
 
-#else  // PPC
+#else // PPC
     return scalblnl(x, n);
 #endif
 }
 
-double reference_relaxed_exp( double x )
-{
-  return reference_exp(x);
-}
+double reference_relaxed_exp(double x) { return reference_exp(x); }
 
 double reference_exp(double x)
 {
-  return reference_exp2( x * HEX_DBL( +, 1, 71547652b82fe, +, 0 ) );
+    return reference_exp2(x * HEX_DBL(+, 1, 71547652b82fe, +, 0));
 }
 
 long double reference_expl(long double x)
 {
 #if defined(__PPC__)
-  long double scale, bias;
+    long double scale, bias;
 
-  // The PPC double long version of expl fails to produce denorm results
-  // and instead generates a 0.0. Compensate for this limitation by
-  // computing expl as:
-  //     expl(x + 40) * expl(-40)
-  // Likewise, overflows can prematurely produce an infinity, so we
-  // compute expl as:
-  //     expl(x - 40) * expl(40)
-  scale = 1.0L;
-  bias = 0.0L;
-  if (x < -708.0L) {
-    bias = 40.0;
-    scale = expl(-40.0L);
-  } else if (x > 708.0L) {
-    bias = -40.0L;
-    scale = expl(40.0L);
-  }
-  return expl(x + bias) * scale;
+    // The PPC double long version of expl fails to produce denorm results
+    // and instead generates a 0.0. Compensate for this limitation by
+    // computing expl as:
+    //     expl(x + 40) * expl(-40)
+    // Likewise, overflows can prematurely produce an infinity, so we
+    // compute expl as:
+    //     expl(x - 40) * expl(40)
+    scale = 1.0L;
+    bias = 0.0L;
+    if (x < -708.0L)
+    {
+        bias = 40.0;
+        scale = expl(-40.0L);
+    }
+    else if (x > 708.0L)
+    {
+        bias = -40.0L;
+        scale = expl(40.0L);
+    }
+    return expl(x + bias) * scale;
 #else
-    return expl( x );
+    return expl(x);
 #endif
 }
 
-double reference_sinh(double x)
-{
-    return sinh(x);
-}
+double reference_sinh(double x) { return sinh(x); }
 
-long double reference_sinhl(long double x)
-{
-    return sinhl(x);
-}
+long double reference_sinhl(long double x) { return sinhl(x); }
 
 double reference_fmod(double x, double y)
 {
-    if( x == 0.0 && fabs(y) > 0.0 )
-        return x;
+    if (x == 0.0 && fabs(y) > 0.0) return x;
 
-    if( fabs(x) == INFINITY || y == 0 )
-        return cl_make_nan();
+    if (fabs(x) == INFINITY || y == 0) return cl_make_nan();
 
-    if( fabs(y) == INFINITY )    // we know x is finite from above
+    if (fabs(y) == INFINITY) // we know x is finite from above
         return x;
 #if defined(_MSC_VER) && defined(_M_X64)
-    return fmod( x, y );
+    return fmod(x, y);
 #else
-    return fmodf( (float) x, (float) y );
+    return fmodf((float)x, (float)y);
 #endif
 }
 
 long double reference_fmodl(long double x, long double y)
 {
-    if( x == 0.0L && fabsl(y) > 0.0L )
+    if (x == 0.0L && fabsl(y) > 0.0L) return x;
+
+    if (fabsl(x) == INFINITY || y == 0.0L) return cl_make_nan();
+
+    if (fabsl(y) == INFINITY) // we know x is finite from above
         return x;
 
-    if( fabsl(x) == INFINITY || y == 0.0L )
-        return cl_make_nan();
-
-    if( fabsl(y) == INFINITY )    // we know x is finite from above
-        return x;
-
-    return fmod( (double) x, (double) y );
+    return fmod((double)x, (double)y);
 }
 
 double reference_modf(double x, double *n)
 {
-    if(isnan(x)) {
+    if (isnan(x))
+    {
         *n = cl_make_nan();
         return cl_make_nan();
     }
     float nr;
-    float yr = modff((float) x, &nr);
+    float yr = modff((float)x, &nr);
     *n = nr;
     return yr;
 }
 
 long double reference_modfl(long double x, long double *n)
 {
-    if(isnan(x)) {
+    if (isnan(x))
+    {
         *n = cl_make_nan();
         return cl_make_nan();
     }
     double nr;
-    double yr = modf((double) x, &nr);
+    double yr = modf((double)x, &nr);
     *n = nr;
     return yr;
 }
 
-long double reference_fractl(long double x, long double *ip )
+long double reference_fractl(long double x, long double *ip)
 {
-    if(isnan(x)) {
+    if (isnan(x))
+    {
         *ip = cl_make_nan();
         return cl_make_nan();
     }
 
     double i;
-    double f = modf((double) x, &i );
-    if( f < 0.0 )
+    double f = modf((double)x, &i);
+    if (f < 0.0)
     {
         f = 1.0 + f;
         i -= 1.0;
-        if( f == 1.0 )
-            f = HEX_DBL( +, 1, fffffffffffff, -, 1 );
+        if (f == 1.0) f = HEX_DBL(+, 1, fffffffffffff, -, 1);
     }
     *ip = i;
     return f;
 }
 
-long double reference_fabsl(long double x)
-{
-    return fabsl( x );
-}
+long double reference_fabsl(long double x) { return fabsl(x); }
 
-double reference_relaxed_log( double x )
+double reference_relaxed_log(double x)
 {
-  return (float)reference_log((float)x);
+    return (float)reference_log((float)x);
 }
 
 double reference_log(double x)
 {
-    if( x == 0.0 )
-        return -INFINITY;
+    if (x == 0.0) return -INFINITY;
 
-    if( x < 0.0 )
-        return cl_make_nan();
+    if (x < 0.0) return cl_make_nan();
 
-    if( isinf(x) )
-        return INFINITY;
+    if (isinf(x)) return INFINITY;
 
-    double log2Hi = HEX_DBL( +, 1, 62e42fefa39ef, -, 1 );
+    double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1);
     double logxHi, logxLo;
     __log2_ep(&logxHi, &logxLo, x);
-    return logxHi*log2Hi;
+    return logxHi * log2Hi;
 }
 
 long double reference_logl(long double x)
 {
-    if( x == 0.0 )
-        return -INFINITY;
+    if (x == 0.0) return -INFINITY;
 
-    if( x < 0.0 )
-        return cl_make_nan();
+    if (x < 0.0) return cl_make_nan();
 
-    if( isinf(x) )
-        return INFINITY;
+    if (isinf(x)) return INFINITY;
 
-    double log2Hi = HEX_DBL( +, 1, 62e42fefa39ef, -, 1 );
-    double log2Lo = HEX_DBL( +, 1, abc9e3b39803f, -, 56 );
+    double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1);
+    double log2Lo = HEX_DBL(+, 1, abc9e3b39803f, -, 56);
     double logxHi, logxLo;
     __log2_ep(&logxHi, &logxLo, x);
 
-    //double rhi, rlo;
-    //MulDD(&rhi, &rlo, logxHi, logxLo, log2Hi, log2Lo);
-    //return (long double) rhi + (long double) rlo;
+    // double rhi, rlo;
+    // MulDD(&rhi, &rlo, logxHi, logxLo, log2Hi, log2Lo);
+    // return (long double) rhi + (long double) rlo;
 
-    long double lg2 = (long double) log2Hi + (long double) log2Lo;
-    long double logx = (long double) logxHi + (long double) logxLo;
-    return logx*lg2;
+    long double lg2 = (long double)log2Hi + (long double)log2Lo;
+    long double logx = (long double)logxHi + (long double)logxLo;
+    return logx * lg2;
 }
 
-double reference_relaxed_pow( double x, double y) {
-  return (float)reference_exp2( ((float)y) * (float)reference_log2((float)x));
-}
-
-double reference_pow( double x, double y )
+double reference_relaxed_pow(double x, double y)
 {
-    static const double neg_epsilon = HEX_DBL( +, 1, 0, +, 53 );
+    return (float)reference_exp2(((float)y) * (float)reference_log2((float)x));
+}
 
-    //if x = 1, return x for any y, even NaN
-    if( x == 1.0 )
-        return x;
+double reference_pow(double x, double y)
+{
+    static const double neg_epsilon = HEX_DBL(+, 1, 0, +, 53);
 
-    //if y == 0, return 1 for any x, even NaN
-    if( y == 0.0 )
-        return 1.0;
+    // if x = 1, return x for any y, even NaN
+    if (x == 1.0) return x;
 
-    //get NaNs out of the way
-    if( x != x  || y != y )
-        return x + y;
+    // if y == 0, return 1 for any x, even NaN
+    if (y == 0.0) return 1.0;
 
-    //do the work required to sort out edge cases
-    double fabsy = reference_fabs( y );
-    double fabsx = reference_fabs( x );
-    double iy = reference_rint( fabsy );            //we do round to nearest here so that |fy| <= 0.5
-    if( iy > fabsy )//convert nearbyint to floor
+    // get NaNs out of the way
+    if (x != x || y != y) return x + y;
+
+    // do the work required to sort out edge cases
+    double fabsy = reference_fabs(y);
+    double fabsx = reference_fabs(x);
+    double iy = reference_rint(
+        fabsy); // we do round to nearest here so that |fy| <= 0.5
+    if (iy > fabsy) // convert nearbyint to floor
         iy -= 1.0;
     int isOddInt = 0;
-    if( fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon )
-        isOddInt =     (int) (iy - 2.0 * rint( 0.5 * iy ));        //might be 0, -1, or 1
+    if (fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon)
+        isOddInt = (int)(iy - 2.0 * rint(0.5 * iy)); // might be 0, -1, or 1
 
-    ///test a few more edge cases
-    //deal with x == 0 cases
-    if( x == 0.0 )
+    /// test a few more edge cases
+    // deal with x == 0 cases
+    if (x == 0.0)
     {
-        if( ! isOddInt )
-            x = 0.0;
+        if (!isOddInt) x = 0.0;
 
-        if( y < 0 )
-            x = 1.0/ x;
+        if (y < 0) x = 1.0 / x;
 
         return x;
     }
 
-    //x == +-Inf cases
-    if( isinf(fabsx) )
+    // x == +-Inf cases
+    if (isinf(fabsx))
     {
-        if( x < 0 )
+        if (x < 0)
         {
-            if( isOddInt )
+            if (isOddInt)
             {
-                if( y < 0 )
+                if (y < 0)
                     return -0.0;
                 else
                     return -INFINITY;
             }
             else
             {
-                if( y < 0 )
+                if (y < 0)
                     return 0.0;
                 else
                     return INFINITY;
             }
         }
 
-        if( y < 0 )
-            return 0;
+        if (y < 0) return 0;
         return INFINITY;
     }
 
-    //y = +-inf cases
-    if( isinf(fabsy) )
+    // y = +-inf cases
+    if (isinf(fabsy))
     {
-        if( x == -1 )
-            return 1;
+        if (x == -1) return 1;
 
-        if( y < 0 )
+        if (y < 0)
         {
-            if( fabsx < 1 )
-                return INFINITY;
+            if (fabsx < 1) return INFINITY;
             return 0;
         }
-        if( fabsx < 1 )
-            return 0;
+        if (fabsx < 1) return 0;
         return INFINITY;
     }
 
     // x < 0 and y non integer case
-    if( x < 0 && iy != fabsy )
+    if (x < 0 && iy != fabsy)
     {
-        //return nan;
+        // return nan;
         return cl_make_nan();
     }
 
-    //speedy resolution of sqrt and reciprocal sqrt
-    if( fabsy == 0.5 )
+    // speedy resolution of sqrt and reciprocal sqrt
+    if (fabsy == 0.5)
     {
-        long double xl = reference_sqrt( x );
-        if( y < 0 )
-            xl = 1.0/ xl;
+        long double xl = reference_sqrt(x);
+        if (y < 0) xl = 1.0 / xl;
         return xl;
     }
 
@@ -5060,73 +5594,55 @@ double reference_pow( double x, double y )
     return isOddInt ? reference_copysignd(result, x) : result;
 }
 
-double reference_sqrt(double x)
-{
-    return sqrt(x);
-}
+double reference_sqrt(double x) { return sqrt(x); }
 
-double reference_floor(double x)
-{
-    return floorf((float) x);
-}
+double reference_floor(double x) { return floorf((float)x); }
 
 double reference_ldexp(double value, int exponent)
 {
 #ifdef __MINGW32__
-/*
- * ====================================================
- * This function is from fdlibm: http://www.netlib.org
- *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-    if(!finite(value)||value==0.0) return value;
-    return scalbn(value,exponent);
+    /*
+     * ====================================================
+     * This function is from fdlibm: http://www.netlib.org
+     *   It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+     *
+     * Developed at SunSoft, a Sun Microsystems, Inc. business.
+     * Permission to use, copy, modify, and distribute this
+     * software is freely granted, provided that this notice
+     * is preserved.
+     * ====================================================
+     */
+    if (!finite(value) || value == 0.0) return value;
+    return scalbn(value, exponent);
 #else
     return reference_scalbn(value, exponent);
 #endif
 }
 
-long double reference_ldexpl(long double x, int n)
-{
-    return ldexpl( x, n);
-}
+long double reference_ldexpl(long double x, int n) { return ldexpl(x, n); }
 
-long double reference_coshl(long double x)
-{
-    return coshl(x);
-}
+long double reference_coshl(long double x) { return coshl(x); }
 
-double reference_ceil(double x)
-{
-    return ceilf((float) x);
-}
+double reference_ceil(double x) { return ceilf((float)x); }
 
 long double reference_ceill(long double x)
 {
-    if( x == 0.0 || reference_isinfl(x) || reference_isnanl(x) )
-        return x;
+    if (x == 0.0 || reference_isinfl(x) || reference_isnanl(x)) return x;
 
     long double absx = reference_fabsl(x);
-    if( absx >= HEX_LDBL( +, 1, 0, +, 52 ) )
-        return x;
+    if (absx >= HEX_LDBL(+, 1, 0, +, 52)) return x;
 
-    if( absx < 1.0 )
+    if (absx < 1.0)
     {
-        if( x < 0.0 )
+        if (x < 0.0)
             return 0.0;
         else
             return 1.0;
     }
 
-    long double r = (long double) ((cl_long) x);
+    long double r = (long double)((cl_long)x);
 
-    if( x > 0.0 && r < x )
-        r += 1.0;
+    if (x > 0.0 && r < x) r += 1.0;
 
     return r;
 }
@@ -5137,45 +5653,53 @@ long double reference_acosl(long double x)
     long double x2 = x * x;
     int i;
 
-    //Prepare a head + tail representation of PI in long double.  A good compiler should get rid of all of this work.
-    static const cl_ulong pi_bits[2] = { 0x3243F6A8885A308DULL, 0x313198A2E0370734ULL};  // first 126 bits of pi http://www.super-computing.org/pi-hexa_current.html
+    // Prepare a head + tail representation of PI in long double.  A good
+    // compiler should get rid of all of this work.
+    static const cl_ulong pi_bits[2] = {
+        0x3243F6A8885A308DULL, 0x313198A2E0370734ULL
+    }; // first 126 bits of pi
+       // http://www.super-computing.org/pi-hexa_current.html
     long double head, tail, temp;
 #if __LDBL_MANT_DIG__ >= 64
     // long double has 64-bits of precision or greater
-    temp = (long double) pi_bits[0] * 0x1.0p64L;
-    head = temp + (long double) pi_bits[1];
-    temp -= head;           // rounding err rounding pi_bits[1] into head
-    tail = (long double) pi_bits[1] + temp;
-    head *= HEX_LDBL( +, 1, 0, -, 125 );
-    tail *= HEX_LDBL( +, 1, 0, -, 125 );
+    temp = (long double)pi_bits[0] * 0x1.0p64L;
+    head = temp + (long double)pi_bits[1];
+    temp -= head; // rounding err rounding pi_bits[1] into head
+    tail = (long double)pi_bits[1] + temp;
+    head *= HEX_LDBL(+, 1, 0, -, 125);
+    tail *= HEX_LDBL(+, 1, 0, -, 125);
 #else
-    head = (long double) pi_bits[0];
-    tail = (long double) ((cl_long) pi_bits[0] - (cl_long) head );       // residual part of pi_bits[0] after rounding
-    tail = tail * HEX_LDBL( +, 1, 0, +, 64 ) + (long double) pi_bits[1];
-    head *= HEX_LDBL( +, 1, 0, -, 61 );
-    tail *= HEX_LDBL( +, 1, 0, -, 125 );
+    head = (long double)pi_bits[0];
+    tail =
+        (long double)((cl_long)pi_bits[0]
+                      - (cl_long)
+                          head); // residual part of pi_bits[0] after rounding
+    tail = tail * HEX_LDBL(+, 1, 0, +, 64) + (long double)pi_bits[1];
+    head *= HEX_LDBL(+, 1, 0, -, 61);
+    tail *= HEX_LDBL(+, 1, 0, -, 125);
 #endif
 
     // oversize values and NaNs go to NaN
-    if( ! (x2 <= 1.0) )
-        return sqrtl(1.0L - x2 );
+    if (!(x2 <= 1.0)) return sqrtl(1.0L - x2);
 
     //
     // deal with large |x|:
     //                                                      sqrt( 1 - x**2)
-    // acos(|x| > sqrt(0.5)) = 2 * atan( z );       z = -------------------- ;      z in [0, sqrt(0.5)/(1+sqrt(0.5) = .4142135...]
+    // acos(|x| > sqrt(0.5)) = 2 * atan( z );       z = -------------------- ;
+    // z in [0, sqrt(0.5)/(1+sqrt(0.5) = .4142135...]
     //                                                          1 + x
-    if( x2 > 0.5 )
+    if (x2 > 0.5)
     {
         // we handle the x < 0 case as pi - acos(|x|)
 
-        long double sign = reference_copysignl( 1.0L, x );
-        long double fabsx = reference_fabsl( x );
-        head -= head * sign;        // x > 0 ? 0 : pi.hi
-        tail -= tail * sign;        // x > 0 ? 0 : pi.low
+        long double sign = reference_copysignl(1.0L, x);
+        long double fabsx = reference_fabsl(x);
+        head -= head * sign; // x > 0 ? 0 : pi.hi
+        tail -= tail * sign; // x > 0 ? 0 : pi.low
 
-        // z = sqrt( 1-x**2 ) / (1+x) = sqrt( (1-x)(1+x) / (1+x)**2 ) = sqrt( (1-x)/(1+x) )
-        long double z2 = (1.0L - fabsx) / (1.0L + fabsx);   // z**2
+        // z = sqrt( 1-x**2 ) / (1+x) = sqrt( (1-x)(1+x) / (1+x)**2 )
+        //   = sqrt( (1-x)/(1+x) )
+        long double z2 = (1.0L - fabsx) / (1.0L + fabsx); // z**2
         long double z = sign * sqrtl(z2);
 
         //                     atan(sqrt(q))
@@ -5185,29 +5709,41 @@ long double reference_acosl(long double x)
         // Define q = r*r, and solve for atan(r):
         //
         //  atan(r) = (p(r) + 1) * r = rp(r) + r
-        static long double atan_coeffs[] = { HEX_LDBL( -, b, 3f52e0c278293b3, -, 67 ), HEX_LDBL( -, a, aaaaaaaaaaa95b8, -, 5 ),
-                                             HEX_LDBL( +, c, ccccccccc992407, -,  6 ), HEX_LDBL( -, 9, 24924923024398,  -, 6 ),
-                                             HEX_LDBL( +, e, 38e38d6f92c98f3, -,  7 ), HEX_LDBL( -, b, a2e89bfb8393ec6, -, 7 ),
-                                             HEX_LDBL( +, 9, d89a9f574d412cb, -,  7 ), HEX_LDBL( -, 8, 88580517884c547, -, 7 ),
-                                             HEX_LDBL( +, f, 0ab6756abdad408, -,  8 ), HEX_LDBL( -, d, 56a5b07a2f15b49, -, 8 ),
-                                             HEX_LDBL( +, b, 72ab587e46d80b2, -,  8 ), HEX_LDBL( -, 8, 62ea24bb5b2e636, -, 8 ),
-                                             HEX_LDBL( +, e, d67c16582123937, -, 10 ) }; // minimax fit over [ 0x1.0p-52, 0.18]   Max error:  0x1.67ea5c184e5d9p-64
+        static long double atan_coeffs[] = {
+            HEX_LDBL(-, b, 3f52e0c278293b3, -, 67),
+            HEX_LDBL(-, a, aaaaaaaaaaa95b8, -, 5),
+            HEX_LDBL(+, c, ccccccccc992407, -, 6),
+            HEX_LDBL(-, 9, 24924923024398, -, 6),
+            HEX_LDBL(+, e, 38e38d6f92c98f3, -, 7),
+            HEX_LDBL(-, b, a2e89bfb8393ec6, -, 7),
+            HEX_LDBL(+, 9, d89a9f574d412cb, -, 7),
+            HEX_LDBL(-, 8, 88580517884c547, -, 7),
+            HEX_LDBL(+, f, 0ab6756abdad408, -, 8),
+            HEX_LDBL(-, d, 56a5b07a2f15b49, -, 8),
+            HEX_LDBL(+, b, 72ab587e46d80b2, -, 8),
+            HEX_LDBL(-, 8, 62ea24bb5b2e636, -, 8),
+            HEX_LDBL(+, e, d67c16582123937, -, 10)
+        }; // minimax fit over [ 0x1.0p-52, 0.18]
+           // Max error: 0x1.67ea5c184e5d9p-64
 
         // Calculate y = p(r)
-        const size_t atan_coeff_count = sizeof( atan_coeffs ) / sizeof( atan_coeffs[0] );
-        long double y = atan_coeffs[ atan_coeff_count - 1];
-        for( i = (int)atan_coeff_count - 2; i >= 0; i-- )
+        const size_t atan_coeff_count =
+            sizeof(atan_coeffs) / sizeof(atan_coeffs[0]);
+        long double y = atan_coeffs[atan_coeff_count - 1];
+        for (i = (int)atan_coeff_count - 2; i >= 0; i--)
             y = atan_coeffs[i] + y * z2;
 
-        z *= 2.0L;   // fold in 2.0 for 2.0 * atan(z)
-        y *= z;      // rp(r)
+        z *= 2.0L; // fold in 2.0 for 2.0 * atan(z)
+        y *= z; // rp(r)
 
         return head + ((y + tail) + z);
     }
 
     // do |x| <= sqrt(0.5) here
-    //                                                     acos( sqrt(z) ) - PI/2
-    //  Piecewise minimax polynomial fits for p(z) = 1 + ------------------------;
+    //                                                     acos( sqrt(z) ) -
+    //                                                     PI/2
+    //  Piecewise minimax polynomial fits for p(z) = 1 +
+    //  ------------------------;
     //                                                            sqrt(z)
     //
     //  Define z = x*x, and solve for acos(x) over x in  x >= 0:
@@ -5215,52 +5751,88 @@ long double reference_acosl(long double x)
     //      acos( sqrt(z) ) = acos(x) = x*(p(z)-1) + PI/2 = xp(x**2) - x + PI/2
     //
     const long double coeffs[4][14] = {
-                                    { HEX_LDBL( -, a, fa7382e1f347974, -, 10 ), HEX_LDBL( -, b, 4d5a992de1ac4da, -,  6 ),
-                                      HEX_LDBL( -, a, c526184bd558c17, -,  7 ), HEX_LDBL( -, d, 9ed9b0346ec092a, -,  8 ),
-                                      HEX_LDBL( -, 9, dca410c1f04b1f,  -,  8 ), HEX_LDBL( -, f, 76e411ba9581ee5, -,  9 ),
-                                      HEX_LDBL( -, c, c71b00479541d8e, -,  9 ), HEX_LDBL( -, a, f527a3f9745c9de, -,  9 ),
-                                      HEX_LDBL( -, 9, a93060051f48d14, -,  9 ), HEX_LDBL( -, 8, b3d39ad70e06021, -,  9 ),
-                                      HEX_LDBL( -, f, f2ab95ab84f79c,  -, 10 ), HEX_LDBL( -, e, d1af5f5301ccfe4, -, 10 ),
-                                      HEX_LDBL( -, e, 1b53ba562f0f74a, -, 10 ), HEX_LDBL( -, d, 6a3851330e15526, -, 10 ) },  // x - 0.0625 in [ -0x1.fffffffffp-5, 0x1.0p-4 ]    Error: 0x1.97839bf07024p-76
+        { HEX_LDBL(-, a, fa7382e1f347974, -, 10),
+          HEX_LDBL(-, b, 4d5a992de1ac4da, -, 6),
+          HEX_LDBL(-, a, c526184bd558c17, -, 7),
+          HEX_LDBL(-, d, 9ed9b0346ec092a, -, 8),
+          HEX_LDBL(-, 9, dca410c1f04b1f, -, 8),
+          HEX_LDBL(-, f, 76e411ba9581ee5, -, 9),
+          HEX_LDBL(-, c, c71b00479541d8e, -, 9),
+          HEX_LDBL(-, a, f527a3f9745c9de, -, 9),
+          HEX_LDBL(-, 9, a93060051f48d14, -, 9),
+          HEX_LDBL(-, 8, b3d39ad70e06021, -, 9),
+          HEX_LDBL(-, f, f2ab95ab84f79c, -, 10),
+          HEX_LDBL(-, e, d1af5f5301ccfe4, -, 10),
+          HEX_LDBL(-, e, 1b53ba562f0f74a, -, 10),
+          HEX_LDBL(-, d, 6a3851330e15526, -,
+                   10) }, // x - 0.0625 in [ -0x1.fffffffffp-5, 0x1.0p-4 ]
+                          // Error: 0x1.97839bf07024p-76
 
-                                    { HEX_LDBL( -, 8, c2f1d638e4c1b48, -,  8 ), HEX_LDBL( -, c, d47ac903c311c2c, -,  6 ),
-                                      HEX_LDBL( -, d, e020b2dabd5606a, -,  7 ), HEX_LDBL( -, a, 086fafac220f16b, -,  7 ),
-                                      HEX_LDBL( -, 8, 55b5efaf6b86c3e, -,  7 ), HEX_LDBL( -, f, 05c9774fed2f571, -,  8 ),
-                                      HEX_LDBL( -, e, 484a93f7f0fc772, -,  8 ), HEX_LDBL( -, e, 1a32baef01626e4, -,  8 ),
-                                      HEX_LDBL( -, e, 528e525b5c9c73d, -,  8 ), HEX_LDBL( -, e, ddd5d27ad49b2c8, -,  8 ),
-                                      HEX_LDBL( -, f, b3259e7ae10c6f,  -,  8 ), HEX_LDBL( -, 8, 68998170d5b19b7, -,  7 ),
-                                      HEX_LDBL( -, 9, 4468907f007727,  -,  7 ), HEX_LDBL( -, a, 2ad5e4906a8e7b3, -,  7 ) },// x - 0.1875 in [ -0x1.0p-4, 0x1.0p-4 ]    Error: 0x1.647af70073457p-73
+        { HEX_LDBL(-, 8, c2f1d638e4c1b48, -, 8),
+          HEX_LDBL(-, c, d47ac903c311c2c, -, 6),
+          HEX_LDBL(-, d, e020b2dabd5606a, -, 7),
+          HEX_LDBL(-, a, 086fafac220f16b, -, 7),
+          HEX_LDBL(-, 8, 55b5efaf6b86c3e, -, 7),
+          HEX_LDBL(-, f, 05c9774fed2f571, -, 8),
+          HEX_LDBL(-, e, 484a93f7f0fc772, -, 8),
+          HEX_LDBL(-, e, 1a32baef01626e4, -, 8),
+          HEX_LDBL(-, e, 528e525b5c9c73d, -, 8),
+          HEX_LDBL(-, e, ddd5d27ad49b2c8, -, 8),
+          HEX_LDBL(-, f, b3259e7ae10c6f, -, 8),
+          HEX_LDBL(-, 8, 68998170d5b19b7, -, 7),
+          HEX_LDBL(-, 9, 4468907f007727, -, 7),
+          HEX_LDBL(-, a, 2ad5e4906a8e7b3, -,
+                   7) }, // x - 0.1875 in [ -0x1.0p-4, 0x1.0p-4 ]
+                         // Error: 0x1.647af70073457p-73
 
-                                    { HEX_LDBL( -, f, a76585ad399e7ac, -,  8 ), HEX_LDBL( -, e, d665b7dd504ca7c, -,  6 ),
-                                      HEX_LDBL( -, 9, 4c7c2402bd4bc33, -,  6 ), HEX_LDBL( -, f, ba76b69074ff71c, -,  7 ),
-                                      HEX_LDBL( -, f, 58117784bdb6d5f, -,  7 ), HEX_LDBL( -, 8, 22ddd8eef53227d, -,  6 ),
-                                      HEX_LDBL( -, 9, 1d1d3b57a63cdb4, -,  6 ), HEX_LDBL( -, a, 9c4bdc40cca848,  -,  6 ),
-                                      HEX_LDBL( -, c, b673b12794edb24, -,  6 ), HEX_LDBL( -, f, 9290a06e31575bf, -,  6 ),
-                                      HEX_LDBL( -, 9, b4929c16aeb3d1f, -,  5 ), HEX_LDBL( -, c, 461e725765a7581, -,  5 ),
-                                      HEX_LDBL( -, 8, 0a59654c98d9207, -,  4 ), HEX_LDBL( -, a, 6de6cbd96c80562, -,  4 ) }, // x - 0.3125 in [ -0x1.0p-4, 0x1.0p-4 ]   Error: 0x1.b0246c304ce1ap-70
+        { HEX_LDBL(-, f, a76585ad399e7ac, -, 8),
+          HEX_LDBL(-, e, d665b7dd504ca7c, -, 6),
+          HEX_LDBL(-, 9, 4c7c2402bd4bc33, -, 6),
+          HEX_LDBL(-, f, ba76b69074ff71c, -, 7),
+          HEX_LDBL(-, f, 58117784bdb6d5f, -, 7),
+          HEX_LDBL(-, 8, 22ddd8eef53227d, -, 6),
+          HEX_LDBL(-, 9, 1d1d3b57a63cdb4, -, 6),
+          HEX_LDBL(-, a, 9c4bdc40cca848, -, 6),
+          HEX_LDBL(-, c, b673b12794edb24, -, 6),
+          HEX_LDBL(-, f, 9290a06e31575bf, -, 6),
+          HEX_LDBL(-, 9, b4929c16aeb3d1f, -, 5),
+          HEX_LDBL(-, c, 461e725765a7581, -, 5),
+          HEX_LDBL(-, 8, 0a59654c98d9207, -, 4),
+          HEX_LDBL(-, a, 6de6cbd96c80562, -,
+                   4) }, // x - 0.3125 in [ -0x1.0p-4, 0x1.0p-4 ]
+                         // Error: 0x1.b0246c304ce1ap-70
 
-                                    { HEX_LDBL( -, b, dca8b0359f96342, -,  7 ), HEX_LDBL( -, 8, cd2522fcde9823,  -,  5 ),
-                                      HEX_LDBL( -, d, 2af9397b27ff74d, -,  6 ), HEX_LDBL( -, d, 723f2c2c2409811, -,  6 ),
-                                      HEX_LDBL( -, f, ea8f8481ecc3cd1, -,  6 ), HEX_LDBL( -, a, 43fd8a7a646b0b2, -,  5 ),
-                                      HEX_LDBL( -, e, 01b0bf63a4e8d76, -,  5 ), HEX_LDBL( -, 9, f0b7096a2a7b4d,  -,  4 ),
-                                      HEX_LDBL( -, e, 872e7c5a627ab4c, -,  4 ), HEX_LDBL( -, a, dbd760a1882da48, -,  3 ),
-                                      HEX_LDBL( -, 8, 424e4dea31dd273, -,  2 ), HEX_LDBL( -, c, c05d7730963e793, -,  2 ),
-                                      HEX_LDBL( -, a, 523d97197cd124a, -,  1 ), HEX_LDBL( -, 8, 307ba943978aaee, +,  0 ) } // x - 0.4375 in [ -0x1.0p-4, 0x1.0p-4 ]  Error: 0x1.9ecff73da69c9p-66
-                                 };
+        { HEX_LDBL(-, b, dca8b0359f96342, -, 7),
+          HEX_LDBL(-, 8, cd2522fcde9823, -, 5),
+          HEX_LDBL(-, d, 2af9397b27ff74d, -, 6),
+          HEX_LDBL(-, d, 723f2c2c2409811, -, 6),
+          HEX_LDBL(-, f, ea8f8481ecc3cd1, -, 6),
+          HEX_LDBL(-, a, 43fd8a7a646b0b2, -, 5),
+          HEX_LDBL(-, e, 01b0bf63a4e8d76, -, 5),
+          HEX_LDBL(-, 9, f0b7096a2a7b4d, -, 4),
+          HEX_LDBL(-, e, 872e7c5a627ab4c, -, 4),
+          HEX_LDBL(-, a, dbd760a1882da48, -, 3),
+          HEX_LDBL(-, 8, 424e4dea31dd273, -, 2),
+          HEX_LDBL(-, c, c05d7730963e793, -, 2),
+          HEX_LDBL(-, a, 523d97197cd124a, -, 1),
+          HEX_LDBL(-, 8, 307ba943978aaee, +,
+                   0) } // x - 0.4375 in [ -0x1.0p-4, 0x1.0p-4 ]
+                        // Error: 0x1.9ecff73da69c9p-66
+    };
 
     const long double offsets[4] = { 0.0625, 0.1875, 0.3125, 0.4375 };
-    const size_t coeff_count = sizeof( coeffs[0] ) / sizeof( coeffs[0][0] );
+    const size_t coeff_count = sizeof(coeffs[0]) / sizeof(coeffs[0][0]);
 
-    // reduce the incoming values a bit so that they are in the range [-0x1.0p-4, 0x1.0p-4]
+    // reduce the incoming values a bit so that they are in the range
+    // [-0x1.0p-4, 0x1.0p-4]
     const long double *c;
     i = x2 * 8.0L;
     c = coeffs[i];
-    x2 -= offsets[i];       // exact
+    x2 -= offsets[i]; // exact
 
     // calcualte p(x2)
-    long double y = c[ coeff_count - 1];
-    for( i = (int)coeff_count - 2; i >= 0; i-- )
-        y = c[i] + y * x2;
+    long double y = c[coeff_count - 1];
+    for (i = (int)coeff_count - 2; i >= 0; i--) y = c[i] + y * x2;
 
     // xp(x2)
     y *= x;
@@ -5273,58 +5845,50 @@ double reference_relaxed_acos(double x) { return reference_acos(x); }
 
 double reference_log10(double x)
 {
-    if( x == 0.0 )
-        return -INFINITY;
+    if (x == 0.0) return -INFINITY;
 
-    if( x < 0.0 )
-        return cl_make_nan();
+    if (x < 0.0) return cl_make_nan();
 
-    if( isinf(x) )
-        return INFINITY;
+    if (isinf(x)) return INFINITY;
 
-    double log2Hi = HEX_DBL( +, 1, 34413509f79fe, -, 2 );
+    double log2Hi = HEX_DBL(+, 1, 34413509f79fe, -, 2);
     double logxHi, logxLo;
     __log2_ep(&logxHi, &logxLo, x);
-    return logxHi*log2Hi;
+    return logxHi * log2Hi;
 }
 
 double reference_relaxed_log10(double x) { return reference_log10(x); }
 
 long double reference_log10l(long double x)
 {
-    if( x == 0.0 )
-        return -INFINITY;
+    if (x == 0.0) return -INFINITY;
 
-    if( x < 0.0 )
-        return cl_make_nan();
+    if (x < 0.0) return cl_make_nan();
 
-    if( isinf(x) )
-        return INFINITY;
+    if (isinf(x)) return INFINITY;
 
-    double log2Hi = HEX_DBL( +, 1, 34413509f79fe, -, 2 );
-    double log2Lo = HEX_DBL( +, 1, e623e2566b02d, -, 55 );
+    double log2Hi = HEX_DBL(+, 1, 34413509f79fe, -, 2);
+    double log2Lo = HEX_DBL(+, 1, e623e2566b02d, -, 55);
     double logxHi, logxLo;
     __log2_ep(&logxHi, &logxLo, x);
 
-    //double rhi, rlo;
-    //MulDD(&rhi, &rlo, logxHi, logxLo, log2Hi, log2Lo);
-    //return (long double) rhi + (long double) rlo;
+    // double rhi, rlo;
+    // MulDD(&rhi, &rlo, logxHi, logxLo, log2Hi, log2Lo);
+    // return (long double) rhi + (long double) rlo;
 
-    long double lg2 = (long double) log2Hi + (long double) log2Lo;
-    long double logx = (long double) logxHi + (long double) logxLo;
-    return logx*lg2;
+    long double lg2 = (long double)log2Hi + (long double)log2Lo;
+    long double logx = (long double)logxHi + (long double)logxLo;
+    return logx * lg2;
 }
 
-double reference_acos(double x)
-{
-    return acos( x );
-}
+double reference_acos(double x) { return acos(x); }
 
 double reference_atan2(double x, double y)
 {
 #if defined(_WIN32)
     // fix edge cases for Windows
-    if (isinf(x) && isinf(y)) {
+    if (isinf(x) && isinf(y))
+    {
         double retval = (y > 0) ? M_PI_4 : 3.f * M_PI_4;
         return (x > 0) ? retval : -retval;
     }
@@ -5336,7 +5900,8 @@ long double reference_atan2l(long double x, long double y)
 {
 #if defined(_WIN32)
     // fix edge cases for Windows
-    if (isinf(x) && isinf(y)) {
+    if (isinf(x) && isinf(y))
+    {
         long double retval = (y > 0) ? M_PI_4 : 3.f * M_PI_4;
         return (x > 0) ? retval : -retval;
     }
@@ -5346,7 +5911,7 @@ long double reference_atan2l(long double x, long double y)
 
 double reference_frexp(double a, int *exp)
 {
-    if(isnan(a) || isinf(a) || a == 0.0)
+    if (isnan(a) || isinf(a) || a == 0.0)
     {
         *exp = 0;
         return a;
@@ -5364,7 +5929,7 @@ double reference_frexp(double a, int *exp)
     u.l &= 0x7fffffffffffffffULL;
     int bias = -1022;
 
-    if((u.l & 0x7ff0000000000000ULL) == 0)
+    if ((u.l & 0x7ff0000000000000ULL) == 0)
     {
         double d = u.l;
         u.d = d;
@@ -5383,13 +5948,13 @@ double reference_frexp(double a, int *exp)
 
 long double reference_frexpl(long double a, int *exp)
 {
-    if(isnan(a) || isinf(a) || a == 0.0)
+    if (isnan(a) || isinf(a) || a == 0.0)
     {
         *exp = 0;
         return a;
     }
 
-    if(sizeof(long double) == sizeof(double))
+    if (sizeof(long double) == sizeof(double))
     {
         return reference_frexp(a, exp);
     }
@@ -5400,92 +5965,64 @@ long double reference_frexpl(long double a, int *exp)
 }
 
 
-double reference_atan(double x)
-{
-    return atan( x );
-}
+double reference_atan(double x) { return atan(x); }
 
-long double reference_atanl(long double x)
-{
-    return atanl( x );
-}
+long double reference_atanl(long double x) { return atanl(x); }
 
-long double reference_asinl(long double x)
-{
-    return asinl( x );
-}
+long double reference_asinl(long double x) { return asinl(x); }
 
-double reference_asin(double x)
-{
-    return asin( x );
-}
+double reference_asin(double x) { return asin(x); }
 
 double reference_relaxed_asin(double x) { return reference_asin(x); }
 
-double reference_fabs(double x)
-{
-    return fabs( x);
-}
+double reference_fabs(double x) { return fabs(x); }
 
-double reference_cosh(double x)
-{
-    return cosh( x );
-}
+double reference_cosh(double x) { return cosh(x); }
 
 long double reference_sqrtl(long double x)
 {
-#if defined( __SSE2__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64)))
-    __m128d result128 = _mm_set_sd((double) x);
+#if defined(__SSE2__)                                                          \
+    || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
+    __m128d result128 = _mm_set_sd((double)x);
     result128 = _mm_sqrt_sd(result128, result128);
     return _mm_cvtsd_f64(result128);
 #else
     volatile double dx = x;
-    return sqrt( dx );
+    return sqrt(dx);
 #endif
 }
 
-long double reference_tanhl(long double x)
-{
-    return tanhl( x );
-}
+long double reference_tanhl(long double x) { return tanhl(x); }
 
 long double reference_floorl(long double x)
 {
-    if( x == 0.0 || reference_isinfl(x) || reference_isnanl(x) )
-        return x;
+    if (x == 0.0 || reference_isinfl(x) || reference_isnanl(x)) return x;
 
     long double absx = reference_fabsl(x);
-    if( absx >= HEX_LDBL( +, 1, 0, +, 52 ) )
-        return x;
+    if (absx >= HEX_LDBL(+, 1, 0, +, 52)) return x;
 
-    if( absx < 1.0 )
+    if (absx < 1.0)
     {
-        if( x < 0.0 )
+        if (x < 0.0)
             return -1.0;
         else
             return 0.0;
     }
 
-    long double r = (long double) ((cl_long) x);
+    long double r = (long double)((cl_long)x);
 
-    if( x < 0.0 && r > x )
-        r -= 1.0;
+    if (x < 0.0 && r > x) r -= 1.0;
 
     return r;
 }
 
 
-double reference_tanh(double x)
-{
-    return tanh( x );
-}
+double reference_tanh(double x) { return tanh(x); }
 
-long double reference_assignmentl( long double x ){ return x; }
+long double reference_assignmentl(long double x) { return x; }
 
-int reference_notl( long double x )
+int reference_notl(long double x)
 {
     int r = !x;
     return r;
 }
-
-
diff --git a/test_conformance/math_brute_force/reference_math.h b/test_conformance/math_brute_force/reference_math.h
index 7c751f68..78b24510 100644
--- a/test_conformance/math_brute_force/reference_math.h
+++ b/test_conformance/math_brute_force/reference_math.h
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -16,223 +16,221 @@
 #ifndef REFERENCE_MATH_H
 #define REFERENCE_MATH_H
 
-#if defined( __APPLE__ )
-    #include <OpenCL/opencl.h>
+#if defined(__APPLE__)
+#include <OpenCL/opencl.h>
 #else
-    #include <CL/cl.h>
+#include <CL/cl.h>
 #endif
 
 // --  for testing float --
-double reference_sinh( double x );
-double reference_sqrt( double x );
-double reference_tanh( double x );
-double reference_acos( double );
-double reference_asin( double );
-double reference_atan( double );
-double reference_atan2( double, double );
-double reference_ceil( double );
-double reference_cosh( double );
-double reference_exp( double );
-double reference_fabs( double );
-double reference_acospi( double );
-double reference_asinpi( double );
-double reference_atanpi( double );
-double reference_atan2pi( double, double );
-double reference_cospi( double );
-double reference_divide( double, double );
-double reference_fract( double, double * );
-float  reference_fma( float, float, float, int );
-double reference_mad( double, double, double );
-double reference_nextafter(double, double );
-double reference_recip( double );
-double reference_rootn( double, int );
-double reference_rsqrt( double );
-double reference_sincos( double, double * );
-double reference_sinpi( double );
-double reference_tanpi( double );
+double reference_sinh(double x);
+double reference_sqrt(double x);
+double reference_tanh(double x);
+double reference_acos(double);
+double reference_asin(double);
+double reference_atan(double);
+double reference_atan2(double, double);
+double reference_ceil(double);
+double reference_cosh(double);
+double reference_exp(double);
+double reference_fabs(double);
+double reference_acospi(double);
+double reference_asinpi(double);
+double reference_atanpi(double);
+double reference_atan2pi(double, double);
+double reference_cospi(double);
+double reference_divide(double, double);
+double reference_fract(double, double*);
+float reference_fma(float, float, float, int);
+double reference_mad(double, double, double);
+double reference_nextafter(double, double);
+double reference_recip(double);
+double reference_rootn(double, int);
+double reference_rsqrt(double);
+double reference_sincos(double, double*);
+double reference_sinpi(double);
+double reference_tanpi(double);
 double reference_pow(double x, double y);
-double reference_pown( double, int );
-double reference_powr( double, double );
-double reference_cos( double );
-double reference_sin( double );
-double reference_tan( double );
-double reference_log( double );
-double reference_log10( double );
-double reference_modf( double, double *n );
+double reference_pown(double, int);
+double reference_powr(double, double);
+double reference_cos(double);
+double reference_sin(double);
+double reference_tan(double);
+double reference_log(double);
+double reference_log10(double);
+double reference_modf(double, double* n);
 
-double reference_fdim( double, double );
-double reference_add( double, double );
-double reference_subtract( double, double );
-double reference_divide( double, double );
-double reference_multiply( double, double );
-double reference_remquo( double, double, int* );
-double reference_lgamma_r( double, int* );
+double reference_fdim(double, double);
+double reference_add(double, double);
+double reference_subtract(double, double);
+double reference_divide(double, double);
+double reference_multiply(double, double);
+double reference_remquo(double, double, int*);
+double reference_lgamma_r(double, int*);
 
-int reference_isequal( double, double );
-int reference_isfinite( double );
-int reference_isgreater( double, double );
-int reference_isgreaterequal( double, double );
-int reference_isinf( double );
-int reference_isless( double, double );
-int reference_islessequal( double, double );
-int reference_islessgreater( double, double );
-int reference_isnan( double );
-int reference_isnormal( double );
-int reference_isnotequal( double, double );
-int reference_isordered( double, double );
-int reference_isunordered( double, double );
-int reference_signbit( float );
+int reference_isequal(double, double);
+int reference_isfinite(double);
+int reference_isgreater(double, double);
+int reference_isgreaterequal(double, double);
+int reference_isinf(double);
+int reference_isless(double, double);
+int reference_islessequal(double, double);
+int reference_islessgreater(double, double);
+int reference_isnan(double);
+int reference_isnormal(double);
+int reference_isnotequal(double, double);
+int reference_isordered(double, double);
+int reference_isunordered(double, double);
+int reference_signbit(float);
 
-double reference_acosh( double x );
-double reference_asinh( double x );
-double reference_atanh( double x );
+double reference_acosh(double x);
+double reference_asinh(double x);
+double reference_atanh(double x);
 double reference_cbrt(double x);
-float reference_copysign( float x, float y);
-double reference_copysignd( double x, double y);
-double reference_exp10( double );
-double reference_exp2( double x );
-double reference_expm1( double x );
-double reference_fmax( double x, double y );
-double reference_fmin( double x, double y );
-double reference_hypot( double x, double y );
-double reference_lgamma( double x);
-int    reference_ilogb( double );
-double reference_log2( double x );
-double reference_log1p( double x );
-double reference_logb( double x );
-double reference_maxmag( double x, double y );
-double reference_minmag( double x, double y );
-double reference_nan( cl_uint x );
-double reference_reciprocal( double x );
-double reference_remainder( double x, double y );
-double reference_rint( double x );
-double reference_round( double x );
-double reference_trunc( double x );
-double reference_floor( double x );
-double reference_fmod( double x, double y );
-double reference_frexp( double x, int *n );
-double reference_ldexp( double x, int n );
+float reference_copysign(float x, float y);
+double reference_copysignd(double x, double y);
+double reference_exp10(double);
+double reference_exp2(double x);
+double reference_expm1(double x);
+double reference_fmax(double x, double y);
+double reference_fmin(double x, double y);
+double reference_hypot(double x, double y);
+double reference_lgamma(double x);
+int reference_ilogb(double);
+double reference_log2(double x);
+double reference_log1p(double x);
+double reference_logb(double x);
+double reference_maxmag(double x, double y);
+double reference_minmag(double x, double y);
+double reference_nan(cl_uint x);
+double reference_reciprocal(double x);
+double reference_remainder(double x, double y);
+double reference_rint(double x);
+double reference_round(double x);
+double reference_trunc(double x);
+double reference_floor(double x);
+double reference_fmod(double x, double y);
+double reference_frexp(double x, int* n);
+double reference_ldexp(double x, int n);
 
-double reference_assignment( double x );
-int    reference_not( double x );
+double reference_assignment(double x);
+int reference_not(double x);
 // -- for testing fast-relaxed
 
 double reference_relaxed_acos(double);
 double reference_relaxed_asin(double);
 double reference_relaxed_atan(double);
-double reference_relaxed_mad( double, double, double );
-double reference_relaxed_divide( double x, double y );
-double reference_relaxed_sin( double x );
+double reference_relaxed_mad(double, double, double);
+double reference_relaxed_divide(double x, double y);
+double reference_relaxed_sin(double x);
 double reference_relaxed_sinpi(double x);
-double reference_relaxed_cos( double x );
+double reference_relaxed_cos(double x);
 double reference_relaxed_cospi(double x);
-double reference_relaxed_sincos( double x, double * y);
-double reference_relaxed_tan( double x );
-double reference_relaxed_exp( double x );
-double reference_relaxed_exp2( double x );
-double reference_relaxed_exp10( double x );
-double reference_relaxed_log( double x );
-double reference_relaxed_log2( double x );
+double reference_relaxed_sincos(double x, double* y);
+double reference_relaxed_tan(double x);
+double reference_relaxed_exp(double x);
+double reference_relaxed_exp2(double x);
+double reference_relaxed_exp10(double x);
+double reference_relaxed_log(double x);
+double reference_relaxed_log2(double x);
 double reference_relaxed_log10(double x);
-double reference_relaxed_pow( double x, double y);
-double reference_relaxed_reciprocal( double x );
+double reference_relaxed_pow(double x, double y);
+double reference_relaxed_reciprocal(double x);
 
 // -- for testing double --
 
-long double reference_sinhl( long double x );
-long double reference_sqrtl( long double x );
-long double reference_tanhl( long double x );
-long double reference_acosl( long double );
-long double reference_asinl( long double );
-long double reference_atanl( long double );
-long double reference_atan2l( long double, long double );
-long double reference_ceill( long double );
-long double reference_coshl( long double );
-long double reference_expl( long double );
-long double reference_fabsl( long double );
-long double reference_acospil( long double );
-long double reference_asinpil( long double );
-long double reference_atanpil( long double );
-long double reference_atan2pil( long double, long double );
-long double reference_cospil( long double );
-long double reference_dividel( long double, long double );
-long double reference_fractl( long double, long double * );
-long double reference_fmal( long double, long double, long double );
-long double reference_madl( long double, long double, long double );
-long double reference_nextafterl(long double, long double );
-long double reference_recipl( long double );
-long double reference_rootnl( long double, int );
-long double reference_rsqrtl( long double );
-long double reference_sincosl( long double, long double * );
-long double reference_sinpil( long double );
-long double reference_tanpil( long double );
+long double reference_sinhl(long double x);
+long double reference_sqrtl(long double x);
+long double reference_tanhl(long double x);
+long double reference_acosl(long double);
+long double reference_asinl(long double);
+long double reference_atanl(long double);
+long double reference_atan2l(long double, long double);
+long double reference_ceill(long double);
+long double reference_coshl(long double);
+long double reference_expl(long double);
+long double reference_fabsl(long double);
+long double reference_acospil(long double);
+long double reference_asinpil(long double);
+long double reference_atanpil(long double);
+long double reference_atan2pil(long double, long double);
+long double reference_cospil(long double);
+long double reference_dividel(long double, long double);
+long double reference_fractl(long double, long double*);
+long double reference_fmal(long double, long double, long double);
+long double reference_madl(long double, long double, long double);
+long double reference_nextafterl(long double, long double);
+long double reference_recipl(long double);
+long double reference_rootnl(long double, int);
+long double reference_rsqrtl(long double);
+long double reference_sincosl(long double, long double*);
+long double reference_sinpil(long double);
+long double reference_tanpil(long double);
 long double reference_powl(long double x, long double y);
-long double reference_pownl( long double, int );
-long double reference_powrl( long double, long double );
-long double reference_cosl( long double );
-long double reference_sinl(long double );
-long double reference_tanl( long double );
-long double reference_logl( long double );
-long double reference_log10l( long double );
-long double reference_modfl( long double, long double *n );
+long double reference_pownl(long double, int);
+long double reference_powrl(long double, long double);
+long double reference_cosl(long double);
+long double reference_sinl(long double);
+long double reference_tanl(long double);
+long double reference_logl(long double);
+long double reference_log10l(long double);
+long double reference_modfl(long double, long double* n);
 
 
-long double reference_fdiml( long double, long double );
-long double reference_addl( long double, long double );
-long double reference_subtractl( long double, long double );
-long double reference_dividel( long double, long double );
-long double reference_multiplyl( long double, long double );
-long double reference_remquol( long double, long double, int* );
-long double reference_lgamma_rl( long double, int* );
+long double reference_fdiml(long double, long double);
+long double reference_addl(long double, long double);
+long double reference_subtractl(long double, long double);
+long double reference_dividel(long double, long double);
+long double reference_multiplyl(long double, long double);
+long double reference_remquol(long double, long double, int*);
+long double reference_lgamma_rl(long double, int*);
 
 
-int reference_isequall( long double, long double );
-int reference_isfinitel( long double );
-int reference_isgreaterl( long double, long double );
-int reference_isgreaterequall( long double, long double );
-int reference_isinfl( long double );
-int reference_islessl( long double, long double );
-int reference_islessequall( long double, long double );
-int reference_islessgreaterl( long double, long double );
-int reference_isnanl( long double );
-int reference_isnormall( long double );
-int reference_isnotequall( long double, long double );
-int reference_isorderedl( long double, long double );
-int reference_isunorderedl( long double, long double );
-int reference_signbitl( long double );
+int reference_isequall(long double, long double);
+int reference_isfinitel(long double);
+int reference_isgreaterl(long double, long double);
+int reference_isgreaterequall(long double, long double);
+int reference_isinfl(long double);
+int reference_islessl(long double, long double);
+int reference_islessequall(long double, long double);
+int reference_islessgreaterl(long double, long double);
+int reference_isnanl(long double);
+int reference_isnormall(long double);
+int reference_isnotequall(long double, long double);
+int reference_isorderedl(long double, long double);
+int reference_isunorderedl(long double, long double);
+int reference_signbitl(long double);
 
-long double reference_acoshl( long double x );
-long double reference_asinhl( long double x );
-long double reference_atanhl( long double x );
+long double reference_acoshl(long double x);
+long double reference_asinhl(long double x);
+long double reference_atanhl(long double x);
 long double reference_cbrtl(long double x);
-long double reference_copysignl( long double x, long double y);
-long double reference_exp10l( long double );
-long double reference_exp2l( long double x );
-long double reference_expm1l( long double x );
-long double reference_fmaxl( long double x, long double y );
-long double reference_fminl( long double x, long double y );
-long double reference_hypotl( long double x, long double y );
-long double reference_lgammal( long double x);
-int    reference_ilogbl( long double );
-long double reference_log2l( long double x );
-long double reference_log1pl( long double x );
-long double reference_logbl( long double x );
-long double reference_maxmagl( long double x, long double y );
-long double reference_minmagl( long double x, long double y );
-long double reference_nanl( cl_ulong x );
-long double reference_reciprocall( long double x );
-long double reference_remainderl( long double x, long double y );
-long double reference_rintl( long double x );
-long double reference_roundl( long double x );
-long double reference_truncl( long double x );
-long double reference_floorl( long double x );
-long double reference_fmodl( long double x, long double y );
-long double reference_frexpl( long double x, int *n );
-long double reference_ldexpl( long double x, int n );
+long double reference_copysignl(long double x, long double y);
+long double reference_exp10l(long double);
+long double reference_exp2l(long double x);
+long double reference_expm1l(long double x);
+long double reference_fmaxl(long double x, long double y);
+long double reference_fminl(long double x, long double y);
+long double reference_hypotl(long double x, long double y);
+long double reference_lgammal(long double x);
+int reference_ilogbl(long double);
+long double reference_log2l(long double x);
+long double reference_log1pl(long double x);
+long double reference_logbl(long double x);
+long double reference_maxmagl(long double x, long double y);
+long double reference_minmagl(long double x, long double y);
+long double reference_nanl(cl_ulong x);
+long double reference_reciprocall(long double x);
+long double reference_remainderl(long double x, long double y);
+long double reference_rintl(long double x);
+long double reference_roundl(long double x);
+long double reference_truncl(long double x);
+long double reference_floorl(long double x);
+long double reference_fmodl(long double x, long double y);
+long double reference_frexpl(long double x, int* n);
+long double reference_ldexpl(long double x, int n);
 
-long double reference_assignmentl( long double x );
-int reference_notl( long double x );
+long double reference_assignmentl(long double x);
+int reference_notl(long double x);
 
 #endif
-
-
diff --git a/test_conformance/math_brute_force/ternary.cpp b/test_conformance/math_brute_force/ternary.cpp
index fd97a95d..448a7c3d 100644
--- a/test_conformance/math_brute_force/ternary.cpp
+++ b/test_conformance/math_brute_force/ternary.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -35,15 +35,29 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
-    const char *c[] = {
-        "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in1, __global float", sizeNames[vectorSize], "* in2,  __global float", sizeNames[vectorSize], "* in3 )\n"
-        "{\n"
-        "   int i = get_global_id(0);\n"
-        "   out[i] = ", name, "( in1[i], in2[i], in3[i] );\n"
-        "}\n"
-    };
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in1, __global float",
+                        sizeNames[vectorSize],
+                        "* in2,  __global float",
+                        sizeNames[vectorSize],
+                        "* in3 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i], in3[i] );\n"
+                        "}\n" };
 
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* in, __global float* in2 , __global float* in3)\n"
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global float* in, __global float* in2 , "
+        "__global float* in3)\n"
         "{\n"
         "   size_t i = get_global_id(0);\n"
         "   if( i + 1 < get_global_size(0) )\n"
@@ -51,12 +65,16 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
         "       float3 f0 = vload3( 0, in + 3 * i );\n"
         "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
         "       float3 f2 = vload3( 0, in3 + 3 * i );\n"
-        "       f0 = ", name, "( f0, f1, f2 );\n"
+        "       f0 = ",
+        name,
+        "( f0, f1, f2 );\n"
         "       vstore3( f0, 0, out + 3*i );\n"
         "   }\n"
         "   else\n"
         "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
         "       float3 f0, f1, f2;\n"
         "       switch( parity )\n"
         "       {\n"
@@ -71,7 +89,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
         "               f2 = (float3)( in3[3*i], in3[3*i+1], NAN ); \n"
         "               break;\n"
         "       }\n"
-        "       f0 = ", name, "( f0, f1, f2 );\n"
+        "       f0 = ",
+        name,
+        "( f0, f1, f2 );\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 0:\n"
@@ -86,16 +106,17 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
@@ -103,17 +124,31 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
 static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
-    const char *c[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in1, __global double", sizeNames[vectorSize], "* in2,  __global double", sizeNames[vectorSize], "* in3 )\n"
-        "{\n"
-        "   int i = get_global_id(0);\n"
-        "   out[i] = ", name, "( in1[i], in2[i], in3[i] );\n"
-        "}\n"
-    };
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in1, __global double",
+                        sizeNames[vectorSize],
+                        "* in2,  __global double",
+                        sizeNames[vectorSize],
+                        "* in3 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i], in3[i] );\n"
+                        "}\n" };
 
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global double* in, __global double* in2 , __global double* in3)\n"
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* in, __global double* in2 , "
+        "__global double* in3)\n"
         "{\n"
         "   size_t i = get_global_id(0);\n"
         "   if( i + 1 < get_global_size(0) )\n"
@@ -121,12 +156,16 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         "       double3 d0 = vload3( 0, in + 3 * i );\n"
         "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
         "       double3 d2 = vload3( 0, in3 + 3 * i );\n"
-        "       d0 = ", name, "( d0, d1, d2 );\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, d2 );\n"
         "       vstore3( d0, 0, out + 3*i );\n"
         "   }\n"
         "   else\n"
         "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
         "       double3 d0, d1, d2;\n"
         "       switch( parity )\n"
         "       {\n"
@@ -141,7 +180,9 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         "               d2 = (double3)( in3[3*i], in3[3*i+1], NAN ); \n"
         "               break;\n"
         "       }\n"
-        "       d0 = ", name, "( d0, d1, d2 );\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, d2 );\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 0:\n"
@@ -156,42 +197,47 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
     };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_kernel   *kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels;
+    cl_program *programs;
+    const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernels + i,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
                              info->programs + i, info->relaxedMode);
@@ -200,18 +246,85 @@ static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, vo
 
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
-    -NAN, -INFINITY, -FLT_MAX, MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),  MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
-    -3.0f, MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), -2.5f, MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), -2.0f, MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), -1.75f, -1.5f, -1.25f, MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24), MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), MAKE_HEX_FLOAT(-0x1.003p0f, -0x1003000L, -24), -MAKE_HEX_FLOAT(0x1.001p0f, 0x1001000L, -24), -1.0f, MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), -FLT_MIN, MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
-    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), -0.0f,
+    -NAN,
+    -INFINITY,
+    -FLT_MAX,
+    MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
+    MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
+    MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
+    MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
+    MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
+    -3.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
+    -2.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
+    -2.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
+    -1.75f,
+    -1.5f,
+    -1.25f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
+    MAKE_HEX_FLOAT(-0x1.003p0f, -0x1003000L, -24),
+    -MAKE_HEX_FLOAT(0x1.001p0f, 0x1001000L, -24),
+    -1.0f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
+    -FLT_MIN,
+    MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
+    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
+    MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
+    MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
+    MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
+    -0.0f,
 
-    +NAN, +INFINITY, +FLT_MAX, MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
-    +3.0f, MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), 2.5f, MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),+2.0f, MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), 1.75f, 1.5f, 1.25f, MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), MAKE_HEX_FLOAT(0x1.003p0f, 0x1003000L, -24), +MAKE_HEX_FLOAT(0x1.001p0f, 0x1001000L, -24), +1.0f, MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), +FLT_MIN, MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
-    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), +0.0f
+    +NAN,
+    +INFINITY,
+    +FLT_MAX,
+    MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
+    MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
+    MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
+    MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
+    MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
+    +3.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
+    2.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
+    +2.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
+    1.75f,
+    1.5f,
+    1.25f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
+    MAKE_HEX_FLOAT(0x1.003p0f, 0x1003000L, -24),
+    +MAKE_HEX_FLOAT(0x1.001p0f, 0x1001000L, -24),
+    +1.0f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
+    +FLT_MIN,
+    MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
+    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
+    MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
+    MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
+    MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
+    +0.0f
 };
 
-static size_t specialValuesFloatCount = sizeof( specialValuesFloat ) / sizeof( specialValuesFloat[0] );
+static size_t specialValuesFloatCount =
+    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
 
 int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
@@ -219,23 +332,23 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     float maxErrorVal = 0.0f;
     float maxErrorVal2 = 0.0f;
     float maxErrorVal3 = 0.0f;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
 
     uint64_t step = getTestStep(sizeof(float), bufferSize);
-    int skipNanInf = (0 == strcmp( "fma", f->nameInCode )) && ! gInfNanSupport;
-    cl_uchar overflow[BUFFER_SIZE / sizeof( float )];
+    int skipNanInf = (0 == strcmp("fma", f->nameInCode)) && !gInfNanSupport;
+    cl_uchar overflow[BUFFER_SIZE / sizeof(float)];
     float float_ulps;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
-    if( gIsEmbedded )
+    if (gIsEmbedded)
         float_ulps = f->float_embedded_ulps;
     else
         float_ulps = f->float_ulps;
@@ -243,469 +356,570 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
         return error;
     /*
      for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-     if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-     return error;
+     if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i, programs +
+     i) ) ) return error;
      */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
         uint32_t *p2 = (uint32_t *)gIn2;
         uint32_t *p3 = (uint32_t *)gIn3;
         j = 0;
-        if( i == 0 )
+        if (i == 0)
         { // test edge cases
             float *fp = (float *)gIn;
             float *fp2 = (float *)gIn2;
             float *fp3 = (float *)gIn3;
-            uint32_t x, y, z;  x = y = z = 0;
-            for( ; j < bufferSize / sizeof( float ); j++ )
+            uint32_t x, y, z;
+            x = y = z = 0;
+            for (; j < bufferSize / sizeof(float); j++)
             {
                 fp[j] = specialValuesFloat[x];
                 fp2[j] = specialValuesFloat[y];
                 fp3[j] = specialValuesFloat[z];
 
-                if( ++x >= specialValuesFloatCount )
+                if (++x >= specialValuesFloatCount)
                 {
                     x = 0;
-                    if( ++y >= specialValuesFloatCount )
+                    if (++y >= specialValuesFloatCount)
                     {
                         y = 0;
-                        if( ++z >= specialValuesFloatCount )
-                            break;
+                        if (++z >= specialValuesFloatCount) break;
                     }
                 }
             }
-            if( j == bufferSize / sizeof( float ) )
-                vlog_error( "Test Error: not all special cases tested!\n" );
+            if (j == bufferSize / sizeof(float))
+                vlog_error("Test Error: not all special cases tested!\n");
         }
 
-        for( ; j < bufferSize / sizeof( float ); j++ )
+        for (; j < bufferSize / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
             p2[j] = genrand_int32(d);
             p3[j] = genrand_int32(d);
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; }
-
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         float *r = (float *)gOut_Ref;
         float *s = (float *)gIn;
         float *s2 = (float *)gIn2;
         float *s3 = (float *)gIn3;
-        if( skipNanInf )
+        if (skipNanInf)
         {
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
+            for (j = 0; j < bufferSize / sizeof(float); j++)
             {
                 feclearexcept(FE_OVERFLOW);
-                r[j] = (float) f->func.f_fma( s[j], s2[j], s3[j], CORRECTLY_ROUNDED );
-                overflow[j] = FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
+                r[j] =
+                    (float)f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED);
+                overflow[j] =
+                    FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
             }
         }
         else
         {
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
-                r[j] = (float) f->func.f_fma( s[j], s2[j], s3[j], CORRECTLY_ROUNDED );
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                r[j] =
+                    (float)f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED);
         }
 
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
-        //Verify data
+        // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint32_t *q = (uint32_t *)(gOut[k]);
 
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] )
+                if (t[j] != q[j])
                 {
                     float err;
                     int fail;
-                    float test = ((float*) q)[j];
-                    float correct = f->func.f_fma( s[j], s2[j], s3[j], CORRECTLY_ROUNDED );
+                    float test = ((float *)q)[j];
+                    float correct =
+                        f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED);
 
-                    // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                    if( skipNanInf )
+                    // Per section 10 paragraph 6, accept any result if an input
+                    // or output is an infinity or NaN or overflow
+                    if (skipNanInf)
                     {
-                        if( overflow[j]                                         ||
-                           IsFloatInfinity(correct) || IsFloatNaN(correct)     ||
-                           IsFloatInfinity(s[j])    || IsFloatNaN(s[j])        ||
-                           IsFloatInfinity(s2[j])   || IsFloatNaN(s2[j])       ||
-                           IsFloatInfinity(s3[j])   || IsFloatNaN(s3[j])       )
+                        if (overflow[j] || IsFloatInfinity(correct)
+                            || IsFloatNaN(correct) || IsFloatInfinity(s[j])
+                            || IsFloatNaN(s[j]) || IsFloatInfinity(s2[j])
+                            || IsFloatNaN(s2[j]) || IsFloatInfinity(s3[j])
+                            || IsFloatNaN(s3[j]))
                             continue;
                     }
 
 
-                    err = Ulp_Error( test, correct );
-                    fail = ! (fabsf(err) <= float_ulps);
+                    err = Ulp_Error(test, correct);
+                    fail = !(fabsf(err) <= float_ulps);
 
-                    if( fail && ftz )
+                    if (fail && ftz)
                     {
                         float correct2, err2;
 
                         // retry per section 6.5.3.2  with flushing on
-                        if( 0.0f == test && 0.0f == f->func.f_fma( s[j], s2[j], s3[j], FLUSHED ) )
+                        if (0.0f == test
+                            && 0.0f
+                                == f->func.f_fma(s[j], s2[j], s3[j], FLUSHED))
                         {
                             fail = 0;
                             err = 0.0f;
                         }
 
                         // retry per section 6.5.3.3
-                        if( fail && IsFloatSubnormal( s[j] ) )
+                        if (fail && IsFloatSubnormal(s[j]))
                         { // look at me,
                             float err3, correct3;
 
-                            if( skipNanInf )
-                                feclearexcept( FE_OVERFLOW );
+                            if (skipNanInf) feclearexcept(FE_OVERFLOW);
 
-                            correct2 = f->func.f_fma( 0.0f, s2[j], s3[j], CORRECTLY_ROUNDED );
-                            correct3 = f->func.f_fma( -0.0f, s2[j], s3[j], CORRECTLY_ROUNDED );
+                            correct2 = f->func.f_fma(0.0f, s2[j], s3[j],
+                                                     CORRECTLY_ROUNDED);
+                            correct3 = f->func.f_fma(-0.0f, s2[j], s3[j],
+                                                     CORRECTLY_ROUNDED);
 
-                            if( skipNanInf )
+                            if (skipNanInf)
                             {
-                                if( fetestexcept( FE_OVERFLOW ) )
-                                    continue;
+                                if (fetestexcept(FE_OVERFLOW)) continue;
 
-                                // Note: no double rounding here.  Reference functions calculate in single precision.
-                                if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) ||
-                                   IsFloatInfinity(correct3) || IsFloatNaN(correct3)   )
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsFloatInfinity(correct2)
+                                    || IsFloatNaN(correct2)
+                                    || IsFloatInfinity(correct3)
+                                    || IsFloatNaN(correct3))
                                     continue;
                             }
 
-                            err2 = Ulp_Error( test, correct2  );
-                            err3 = Ulp_Error( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= float_ulps)) && (!(fabsf(err3) <= float_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                            err2 = Ulp_Error(test, correct2);
+                            err3 = Ulp_Error(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= float_ulps))
+                                    && (!(fabsf(err3) <= float_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if( 0.0f == test &&
-                                ( 0.0f == f->func.f_fma(  0.0f, s2[j], s3[j], FLUSHED )  ||
-                                  0.0f == f->func.f_fma( -0.0f, s2[j], s3[j], FLUSHED ) )
-                              )
+                            if (0.0f == test
+                                && (0.0f
+                                        == f->func.f_fma(0.0f, s2[j], s3[j],
+                                                         FLUSHED)
+                                    || 0.0f
+                                        == f->func.f_fma(-0.0f, s2[j], s3[j],
+                                                         FLUSHED)))
                             {
                                 fail = 0;
                                 err = 0.0f;
                             }
 
-                            //try with first two args as zero
-                            if( IsFloatSubnormal( s2[j] ) )
+                            // try with first two args as zero
+                            if (IsFloatSubnormal(s2[j]))
                             { // its fun to have fun,
                                 double correct4, correct5;
                                 float err4, err5;
 
-                                if( skipNanInf )
-                                    feclearexcept( FE_OVERFLOW );
+                                if (skipNanInf) feclearexcept(FE_OVERFLOW);
 
-                                correct2 = f->func.f_fma( 0.0f, 0.0f, s3[j], CORRECTLY_ROUNDED );
-                                correct3 = f->func.f_fma( -0.0f, 0.0f, s3[j], CORRECTLY_ROUNDED );
-                                correct4 = f->func.f_fma( 0.0f, -0.0f, s3[j], CORRECTLY_ROUNDED );
-                                correct5 = f->func.f_fma( -0.0f, -0.0f, s3[j], CORRECTLY_ROUNDED );
+                                correct2 = f->func.f_fma(0.0f, 0.0f, s3[j],
+                                                         CORRECTLY_ROUNDED);
+                                correct3 = f->func.f_fma(-0.0f, 0.0f, s3[j],
+                                                         CORRECTLY_ROUNDED);
+                                correct4 = f->func.f_fma(0.0f, -0.0f, s3[j],
+                                                         CORRECTLY_ROUNDED);
+                                correct5 = f->func.f_fma(-0.0f, -0.0f, s3[j],
+                                                         CORRECTLY_ROUNDED);
 
-                                // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                                if( !gInfNanSupport )
+                                // Per section 10 paragraph 6, accept any result
+                                // if an input or output is an infinity or NaN
+                                // or overflow
+                                if (!gInfNanSupport)
                                 {
-                                    if( fetestexcept(FE_OVERFLOW) )
-                                        continue;
+                                    if (fetestexcept(FE_OVERFLOW)) continue;
 
-                                    // Note: no double rounding here.  Reference functions calculate in single precision.
-                                    if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) ||
-                                       IsFloatInfinity(correct3) || IsFloatNaN(correct3) ||
-                                       IsFloatInfinity(correct4) || IsFloatNaN(correct4) ||
-                                       IsFloatInfinity(correct5) || IsFloatNaN(correct5) )
+                                    // Note: no double rounding here.  Reference
+                                    // functions calculate in single precision.
+                                    if (IsFloatInfinity(correct2)
+                                        || IsFloatNaN(correct2)
+                                        || IsFloatInfinity(correct3)
+                                        || IsFloatNaN(correct3)
+                                        || IsFloatInfinity(correct4)
+                                        || IsFloatNaN(correct4)
+                                        || IsFloatInfinity(correct5)
+                                        || IsFloatNaN(correct5))
                                         continue;
                                 }
 
-                                err2 = Ulp_Error( test, correct2  );
-                                err3 = Ulp_Error( test, correct3  );
-                                err4 = Ulp_Error( test, correct4  );
-                                err5 = Ulp_Error( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= float_ulps)) && (!(fabsf(err3) <= float_ulps)) &&
-                                                 (!(fabsf(err4) <= float_ulps)) && (!(fabsf(err5) <= float_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
+                                err2 = Ulp_Error(test, correct2);
+                                err3 = Ulp_Error(test, correct3);
+                                err4 = Ulp_Error(test, correct4);
+                                err5 = Ulp_Error(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= float_ulps))
+                                        && (!(fabsf(err3) <= float_ulps))
+                                        && (!(fabsf(err4) <= float_ulps))
+                                        && (!(fabsf(err5) <= float_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
 
                                 // retry per section 6.5.3.4
-                                if( 0.0f == test &&
-                                    ( 0.0f == f->func.f_fma(  0.0f,  0.0f, s3[j], FLUSHED )  ||
-                                      0.0f == f->func.f_fma( -0.0f,  0.0f, s3[j], FLUSHED )  ||
-                                      0.0f == f->func.f_fma(  0.0f, -0.0f, s3[j], FLUSHED )  ||
-                                      0.0f == f->func.f_fma( -0.0f, -0.0f, s3[j], FLUSHED )  )
-                                )
+                                if (0.0f == test
+                                    && (0.0f
+                                            == f->func.f_fma(0.0f, 0.0f, s3[j],
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(-0.0f, 0.0f, s3[j],
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(0.0f, -0.0f, s3[j],
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(-0.0f, -0.0f,
+                                                             s3[j], FLUSHED)))
                                 {
                                     fail = 0;
                                     err = 0.0f;
                                 }
 
-                                if( IsFloatSubnormal( s3[j] )  )
+                                if (IsFloatSubnormal(s3[j]))
                                 {
-                                    if( test == 0.0f )  // 0*0+0 is 0
+                                    if (test == 0.0f) // 0*0+0 is 0
                                     {
                                         fail = 0;
                                         err = 0.0f;
                                     }
                                 }
                             }
-                            else if( IsFloatSubnormal( s3[j] ) )
+                            else if (IsFloatSubnormal(s3[j]))
                             {
                                 double correct4, correct5;
                                 float err4, err5;
 
-                                if( skipNanInf )
-                                    feclearexcept( FE_OVERFLOW );
+                                if (skipNanInf) feclearexcept(FE_OVERFLOW);
 
-                                correct2 = f->func.f_fma( 0.0f, s2[j], 0.0f, CORRECTLY_ROUNDED );
-                                correct3 = f->func.f_fma( -0.0f, s2[j], 0.0f, CORRECTLY_ROUNDED );
-                                correct4 = f->func.f_fma( 0.0f,  s2[j], -0.0f, CORRECTLY_ROUNDED );
-                                correct5 = f->func.f_fma( -0.0f, s2[j], -0.0f, CORRECTLY_ROUNDED );
+                                correct2 = f->func.f_fma(0.0f, s2[j], 0.0f,
+                                                         CORRECTLY_ROUNDED);
+                                correct3 = f->func.f_fma(-0.0f, s2[j], 0.0f,
+                                                         CORRECTLY_ROUNDED);
+                                correct4 = f->func.f_fma(0.0f, s2[j], -0.0f,
+                                                         CORRECTLY_ROUNDED);
+                                correct5 = f->func.f_fma(-0.0f, s2[j], -0.0f,
+                                                         CORRECTLY_ROUNDED);
 
-                                // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                                if( !gInfNanSupport )
+                                // Per section 10 paragraph 6, accept any result
+                                // if an input or output is an infinity or NaN
+                                // or overflow
+                                if (!gInfNanSupport)
                                 {
-                                    if( fetestexcept(FE_OVERFLOW) )
-                                        continue;
+                                    if (fetestexcept(FE_OVERFLOW)) continue;
 
-                                    // Note: no double rounding here.  Reference functions calculate in single precision.
-                                    if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) ||
-                                       IsFloatInfinity(correct3) || IsFloatNaN(correct3) ||
-                                       IsFloatInfinity(correct4) || IsFloatNaN(correct4) ||
-                                       IsFloatInfinity(correct5) || IsFloatNaN(correct5) )
+                                    // Note: no double rounding here.  Reference
+                                    // functions calculate in single precision.
+                                    if (IsFloatInfinity(correct2)
+                                        || IsFloatNaN(correct2)
+                                        || IsFloatInfinity(correct3)
+                                        || IsFloatNaN(correct3)
+                                        || IsFloatInfinity(correct4)
+                                        || IsFloatNaN(correct4)
+                                        || IsFloatInfinity(correct5)
+                                        || IsFloatNaN(correct5))
                                         continue;
                                 }
 
-                                err2 = Ulp_Error( test, correct2  );
-                                err3 = Ulp_Error( test, correct3  );
-                                err4 = Ulp_Error( test, correct4  );
-                                err5 = Ulp_Error( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= float_ulps)) && (!(fabsf(err3) <= float_ulps)) &&
-                                                 (!(fabsf(err4) <= float_ulps)) && (!(fabsf(err5) <= float_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
+                                err2 = Ulp_Error(test, correct2);
+                                err3 = Ulp_Error(test, correct3);
+                                err4 = Ulp_Error(test, correct4);
+                                err5 = Ulp_Error(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= float_ulps))
+                                        && (!(fabsf(err3) <= float_ulps))
+                                        && (!(fabsf(err4) <= float_ulps))
+                                        && (!(fabsf(err5) <= float_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
 
                                 // retry per section 6.5.3.4
-                                if( 0.0f == test &&
-                                    (   0.0f == f->func.f_fma( 0.0f, s2[j], 0.0f, FLUSHED )   ||
-                                        0.0f == f->func.f_fma(-0.0f, s2[j], 0.0f, FLUSHED )   ||
-                                        0.0f == f->func.f_fma( 0.0f, s2[j],-0.0f, FLUSHED )   ||
-                                        0.0f == f->func.f_fma(-0.0f, s2[j],-0.0f, FLUSHED )   )
-                                )
+                                if (0.0f == test
+                                    && (0.0f
+                                            == f->func.f_fma(0.0f, s2[j], 0.0f,
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(-0.0f, s2[j], 0.0f,
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(0.0f, s2[j], -0.0f,
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(-0.0f, s2[j],
+                                                             -0.0f, FLUSHED)))
                                 {
                                     fail = 0;
                                     err = 0.0f;
                                 }
                             }
                         }
-                        else if( fail && IsFloatSubnormal( s2[j] ) )
+                        else if (fail && IsFloatSubnormal(s2[j]))
                         {
                             double correct2, correct3;
                             float err2, err3;
 
-                            if( skipNanInf )
-                                feclearexcept( FE_OVERFLOW );
+                            if (skipNanInf) feclearexcept(FE_OVERFLOW);
 
-                            correct2 = f->func.f_fma( s[j], 0.0f, s3[j], CORRECTLY_ROUNDED );
-                            correct3 = f->func.f_fma( s[j], -0.0f, s3[j], CORRECTLY_ROUNDED );
+                            correct2 = f->func.f_fma(s[j], 0.0f, s3[j],
+                                                     CORRECTLY_ROUNDED);
+                            correct3 = f->func.f_fma(s[j], -0.0f, s3[j],
+                                                     CORRECTLY_ROUNDED);
 
-                            if( skipNanInf )
+                            if (skipNanInf)
                             {
-                                if( fetestexcept( FE_OVERFLOW ) )
-                                    continue;
+                                if (fetestexcept(FE_OVERFLOW)) continue;
 
-                                // Note: no double rounding here.  Reference functions calculate in single precision.
-                                if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) ||
-                                   IsFloatInfinity(correct3) || IsFloatNaN(correct3)   )
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsFloatInfinity(correct2)
+                                    || IsFloatNaN(correct2)
+                                    || IsFloatInfinity(correct3)
+                                    || IsFloatNaN(correct3))
                                     continue;
                             }
 
-                            err2 = Ulp_Error( test, correct2  );
-                            err3 = Ulp_Error( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= float_ulps)) && (!(fabsf(err3) <= float_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                            err2 = Ulp_Error(test, correct2);
+                            err3 = Ulp_Error(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= float_ulps))
+                                    && (!(fabsf(err3) <= float_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if( 0.0f == test &&
-                                (   0.0f == f->func.f_fma( s[j], 0.0f, s3[j], FLUSHED )  ||
-                                    0.0f == f->func.f_fma( s[j], -0.0f, s3[j], FLUSHED ) )
-                            )
+                            if (0.0f == test
+                                && (0.0f
+                                        == f->func.f_fma(s[j], 0.0f, s3[j],
+                                                         FLUSHED)
+                                    || 0.0f
+                                        == f->func.f_fma(s[j], -0.0f, s3[j],
+                                                         FLUSHED)))
                             {
                                 fail = 0;
                                 err = 0.0f;
                             }
 
-                            //try with second two args as zero
-                            if( IsFloatSubnormal( s3[j] ) )
+                            // try with second two args as zero
+                            if (IsFloatSubnormal(s3[j]))
                             {
                                 double correct4, correct5;
                                 float err4, err5;
 
-                                if( skipNanInf )
-                                    feclearexcept( FE_OVERFLOW );
+                                if (skipNanInf) feclearexcept(FE_OVERFLOW);
 
-                                correct2 = f->func.f_fma( s[j], 0.0f, 0.0f, CORRECTLY_ROUNDED );
-                                correct3 = f->func.f_fma( s[j], -0.0f, 0.0f, CORRECTLY_ROUNDED );
-                                correct4 = f->func.f_fma( s[j], 0.0f, -0.0f, CORRECTLY_ROUNDED );
-                                correct5 = f->func.f_fma( s[j], -0.0f, -0.0f, CORRECTLY_ROUNDED );
+                                correct2 = f->func.f_fma(s[j], 0.0f, 0.0f,
+                                                         CORRECTLY_ROUNDED);
+                                correct3 = f->func.f_fma(s[j], -0.0f, 0.0f,
+                                                         CORRECTLY_ROUNDED);
+                                correct4 = f->func.f_fma(s[j], 0.0f, -0.0f,
+                                                         CORRECTLY_ROUNDED);
+                                correct5 = f->func.f_fma(s[j], -0.0f, -0.0f,
+                                                         CORRECTLY_ROUNDED);
 
-                                // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                                if( !gInfNanSupport )
+                                // Per section 10 paragraph 6, accept any result
+                                // if an input or output is a infinity or NaN or
+                                // overflow
+                                if (!gInfNanSupport)
                                 {
-                                    if( fetestexcept(FE_OVERFLOW) )
-                                        continue;
+                                    if (fetestexcept(FE_OVERFLOW)) continue;
 
-                                    // Note: no double rounding here.  Reference functions calculate in single precision.
-                                    if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) ||
-                                       IsFloatInfinity(correct3) || IsFloatNaN(correct3) ||
-                                       IsFloatInfinity(correct4) || IsFloatNaN(correct4) ||
-                                       IsFloatInfinity(correct5) || IsFloatNaN(correct5) )
+                                    // Note: no double rounding here.  Reference
+                                    // functions calculate in single precision.
+                                    if (IsFloatInfinity(correct2)
+                                        || IsFloatNaN(correct2)
+                                        || IsFloatInfinity(correct3)
+                                        || IsFloatNaN(correct3)
+                                        || IsFloatInfinity(correct4)
+                                        || IsFloatNaN(correct4)
+                                        || IsFloatInfinity(correct5)
+                                        || IsFloatNaN(correct5))
                                         continue;
                                 }
 
-                                err2 = Ulp_Error( test, correct2  );
-                                err3 = Ulp_Error( test, correct3  );
-                                err4 = Ulp_Error( test, correct4  );
-                                err5 = Ulp_Error( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= float_ulps)) && (!(fabsf(err3) <= float_ulps)) &&
-                                                 (!(fabsf(err4) <= float_ulps)) && (!(fabsf(err5) <= float_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
+                                err2 = Ulp_Error(test, correct2);
+                                err3 = Ulp_Error(test, correct3);
+                                err4 = Ulp_Error(test, correct4);
+                                err5 = Ulp_Error(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= float_ulps))
+                                        && (!(fabsf(err3) <= float_ulps))
+                                        && (!(fabsf(err4) <= float_ulps))
+                                        && (!(fabsf(err5) <= float_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
 
                                 // retry per section 6.5.3.4
-                                if( 0.0f == test &&
-                                    (   0.0f == f->func.f_fma( s[j], 0.0f, 0.0f, FLUSHED )    ||
-                                        0.0f == f->func.f_fma( s[j],-0.0f, 0.0f, FLUSHED )    ||
-                                        0.0f == f->func.f_fma( s[j], 0.0f,-0.0f, FLUSHED )    ||
-                                        0.0f == f->func.f_fma( s[j],-0.0f,-0.0f, FLUSHED )    )
-                                )
+                                if (0.0f == test
+                                    && (0.0f
+                                            == f->func.f_fma(s[j], 0.0f, 0.0f,
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(s[j], -0.0f, 0.0f,
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(s[j], 0.0f, -0.0f,
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(s[j], -0.0f, -0.0f,
+                                                             FLUSHED)))
                                 {
                                     fail = 0;
                                     err = 0.0f;
                                 }
                             }
                         }
-                        else if( fail && IsFloatSubnormal(s3[j]) )
+                        else if (fail && IsFloatSubnormal(s3[j]))
                         {
                             double correct2, correct3;
                             float err2, err3;
 
-                            if( skipNanInf )
-                                feclearexcept( FE_OVERFLOW );
+                            if (skipNanInf) feclearexcept(FE_OVERFLOW);
 
-                            correct2 = f->func.f_fma( s[j], s2[j], 0.0f, CORRECTLY_ROUNDED );
-                            correct3 = f->func.f_fma( s[j], s2[j], -0.0f, CORRECTLY_ROUNDED );
+                            correct2 = f->func.f_fma(s[j], s2[j], 0.0f,
+                                                     CORRECTLY_ROUNDED);
+                            correct3 = f->func.f_fma(s[j], s2[j], -0.0f,
+                                                     CORRECTLY_ROUNDED);
 
-                            if( skipNanInf )
+                            if (skipNanInf)
                             {
-                                if( fetestexcept( FE_OVERFLOW ) )
-                                    continue;
+                                if (fetestexcept(FE_OVERFLOW)) continue;
 
-                                // Note: no double rounding here.  Reference functions calculate in single precision.
-                                if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) ||
-                                   IsFloatInfinity(correct3) || IsFloatNaN(correct3)   )
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsFloatInfinity(correct2)
+                                    || IsFloatNaN(correct2)
+                                    || IsFloatInfinity(correct3)
+                                    || IsFloatNaN(correct3))
                                     continue;
                             }
 
-                            err2 = Ulp_Error( test, correct2  );
-                            err3 = Ulp_Error( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= float_ulps)) && (!(fabsf(err3) <= float_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                            err2 = Ulp_Error(test, correct2);
+                            err3 = Ulp_Error(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= float_ulps))
+                                    && (!(fabsf(err3) <= float_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if( 0.0f == test &&
-                                (   0.0f == f->func.f_fma( s[j], s2[j], 0.0f, FLUSHED ) ||
-                                    0.0f == f->func.f_fma( s[j], s2[j],-0.0f, FLUSHED )  )
-                            )
+                            if (0.0f == test
+                                && (0.0f
+                                        == f->func.f_fma(s[j], s2[j], 0.0f,
+                                                         FLUSHED)
+                                    || 0.0f
+                                        == f->func.f_fma(s[j], s2[j], -0.0f,
+                                                         FLUSHED)))
                             {
                                 fail = 0;
                                 err = 0.0f;
@@ -713,7 +927,7 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
                         }
                     }
 
-                    if( fabsf(err ) > maxError )
+                    if (fabsf(err) > maxError)
                     {
                         maxError = fabsf(err);
                         maxErrorVal = s[j];
@@ -721,9 +935,14 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
                         maxErrorVal3 = s3[j];
                     }
 
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\nERROR: %s%s: %f ulp error at {%a, %a, %a} ({0x%8.8x, 0x%8.8x, 0x%8.8x}): *%a vs. %a\n", f->name, sizeNames[k], err, s[j], s2[j], s3[j], ((cl_uint*)s)[j], ((cl_uint*)s2)[j], ((cl_uint*)s3)[j],  ((float*) gOut_Ref)[j], test );
+                        vlog_error(
+                            "\nERROR: %s%s: %f ulp error at {%a, %a, %a} "
+                            "({0x%8.8x, 0x%8.8x, 0x%8.8x}): *%a vs. %a\n",
+                            f->name, sizeNames[k], err, s[j], s2[j], s3[j],
+                            ((cl_uint *)s)[j], ((cl_uint *)s2)[j],
+                            ((cl_uint *)s3)[j], ((float *)gOut_Ref)[j], test);
                         error = -1;
                         goto exit;
                     }
@@ -731,105 +950,135 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10u bufferSize:%10zd \n", i, step,  bufferSize);
-           } else
-           {
-              vlog("." );
-           }
-           fflush(stdout);
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10u bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
         uint32_t *p2 = (uint32_t *)gIn2;
         uint32_t *p3 = (uint32_t *)gIn3;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
             p2[j] = genrand_int32(d);
             p3[j] = genrand_int32(d);
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, maxErrorVal3 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
+             maxErrorVal3);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -840,18 +1089,75 @@ exit:
 
 // A table of more difficult cases to get right
 static const double specialValuesDouble[] = {
-    -NAN, -INFINITY, -DBL_MAX, MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),  MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
-    -3.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), -2.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), -2.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), -1.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), -1.0, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), -DBL_MIN, MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), -0.0,
+    -NAN,
+    -INFINITY,
+    -DBL_MAX,
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
+    -3.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
+    -2.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
+    -2.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
+    -1.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
+    -1.0,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
+    -DBL_MIN,
+    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
+    -0.0,
 
-    +NAN, +INFINITY, +DBL_MAX, MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),  MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
-    +3.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), +2.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), +2.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), +1.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), +1.0, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), +DBL_MIN, MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), +0.0,
+    +NAN,
+    +INFINITY,
+    +DBL_MAX,
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
+    +3.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
+    +2.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
+    +2.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
+    +1.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p0, +0x10000000000001LL, -52),
+    +1.0,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
+    +DBL_MIN,
+    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
+    +0.0,
 };
 
-static const size_t specialValuesDoubleCount = sizeof( specialValuesDouble ) / sizeof( specialValuesDouble[0] );
+static const size_t specialValuesDoubleCount =
+    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
 
 
 int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
@@ -860,8 +1166,8 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int ftz = f->ftz || gForceFTZ;
     double maxErrorVal = 0.0f;
@@ -869,7 +1175,7 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
     double maxErrorVal3 = 0.0f;
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(double), bufferSize);
 
     Force64BitFPUPrecision();
@@ -877,360 +1183,463 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
+    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info ) ))
+                               &build_info)))
     {
         return error;
     }
     /*
      for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-     if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-     return error;
+     if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels + i,
+     programs + i) ) ) return error;
      */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
         double *p2 = (double *)gIn2;
         double *p3 = (double *)gIn3;
         j = 0;
-        if( i == 0 )
+        if (i == 0)
         { // test edge cases
-            uint32_t x, y, z;  x = y = z = 0;
-            for( ; j < bufferSize / sizeof( double ); j++ )
+            uint32_t x, y, z;
+            x = y = z = 0;
+            for (; j < bufferSize / sizeof(double); j++)
             {
                 p[j] = specialValuesDouble[x];
                 p2[j] = specialValuesDouble[y];
                 p3[j] = specialValuesDouble[z];
-                if( ++x >= specialValuesDoubleCount )
+                if (++x >= specialValuesDoubleCount)
                 {
                     x = 0;
-                    if( ++y >= specialValuesDoubleCount )
+                    if (++y >= specialValuesDoubleCount)
                     {
                         y = 0;
-                        if( ++z >= specialValuesDoubleCount )
-                            break;
+                        if (++z >= specialValuesDoubleCount) break;
                     }
                 }
             }
-            if( j == bufferSize / sizeof( double ) )
-                vlog_error( "Test Error: not all special cases tested!\n" );
+            if (j == bufferSize / sizeof(double))
+                vlog_error("Test Error: not all special cases tested!\n");
         }
 
-        for( ; j < bufferSize / sizeof( double ); j++ )
+        for (; j < bufferSize / sizeof(double); j++)
         {
             p[j] = DoubleFromUInt32(genrand_int32(d));
             p2[j] = DoubleFromUInt32(genrand_int32(d));
             p3[j] = DoubleFromUInt32(genrand_int32(d));
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_double ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; }
-
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         double *r = (double *)gOut_Ref;
         double *s = (double *)gIn;
         double *s2 = (double *)gIn2;
         double *s3 = (double *)gIn3;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
-            r[j] = (double) f->dfunc.f_fff( s[j], s2[j], s3[j] );
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+            r[j] = (double)f->dfunc.f_fff(s[j], s2[j], s3[j]);
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
-        //Verify data
+        // Verify data
         uint64_t *t = (uint64_t *)gOut_Ref;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(double); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint64_t *q = (uint64_t *)(gOut[k]);
 
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] )
+                if (t[j] != q[j])
                 {
-                    double test = ((double*) q)[j];
-                    long double correct = f->dfunc.f_fff( s[j], s2[j], s3[j] );
-                    float err = Bruteforce_Ulp_Error_Double( test, correct );
-                    int fail = ! (fabsf(err) <= f->double_ulps);
+                    double test = ((double *)q)[j];
+                    long double correct = f->dfunc.f_fff(s[j], s2[j], s3[j]);
+                    float err = Bruteforce_Ulp_Error_Double(test, correct);
+                    int fail = !(fabsf(err) <= f->double_ulps);
 
-                    if( fail && ftz )
+                    if (fail && ftz)
                     {
                         // retry per section 6.5.3.2
-                        if( IsDoubleSubnormal(correct) )
+                        if (IsDoubleSubnormal(correct))
                         { // look at me,
-                            fail = fail && ( test != 0.0f );
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
                         }
 
                         // retry per section 6.5.3.3
-                        if( fail && IsDoubleSubnormal( s[j] ) )
+                        if (fail && IsDoubleSubnormal(s[j]))
                         { // look at me,
-                            long double correct2 = f->dfunc.f_fff( 0.0, s2[j], s3[j] );
-                            long double correct3 = f->dfunc.f_fff( -0.0, s2[j], s3[j] );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                            long double correct2 =
+                                f->dfunc.f_fff(0.0, s2[j], s3[j]);
+                            long double correct3 =
+                                f->dfunc.f_fff(-0.0, s2[j], s3[j]);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= f->double_ulps))
+                                    && (!(fabsf(err3) <= f->double_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) )
+                            if (IsDoubleResultSubnormal(correct2,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correct3,
+                                                           f->double_ulps))
                             { // look at me now,
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
                             }
 
-                            //try with first two args as zero
-                            if( IsDoubleSubnormal( s2[j] ) )
+                            // try with first two args as zero
+                            if (IsDoubleSubnormal(s2[j]))
                             { // its fun to have fun,
-                                correct2 = f->dfunc.f_fff( 0.0, 0.0, s3[j] );
-                                correct3 = f->dfunc.f_fff( -0.0, 0.0, s3[j] );
-                                long double correct4 = f->dfunc.f_fff( 0.0, -0.0, s3[j] );
-                                long double correct5 = f->dfunc.f_fff( -0.0, -0.0, s3[j] );
-                                err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                                err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                                float err4 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                                float err5 = Bruteforce_Ulp_Error_Double( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) &&
-                                                 (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
+                                correct2 = f->dfunc.f_fff(0.0, 0.0, s3[j]);
+                                correct3 = f->dfunc.f_fff(-0.0, 0.0, s3[j]);
+                                long double correct4 =
+                                    f->dfunc.f_fff(0.0, -0.0, s3[j]);
+                                long double correct5 =
+                                    f->dfunc.f_fff(-0.0, -0.0, s3[j]);
+                                err2 =
+                                    Bruteforce_Ulp_Error_Double(test, correct2);
+                                err3 =
+                                    Bruteforce_Ulp_Error_Double(test, correct3);
+                                float err4 =
+                                    Bruteforce_Ulp_Error_Double(test, correct4);
+                                float err5 =
+                                    Bruteforce_Ulp_Error_Double(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= f->double_ulps))
+                                        && (!(fabsf(err3) <= f->double_ulps))
+                                        && (!(fabsf(err4) <= f->double_ulps))
+                                        && (!(fabsf(err5) <= f->double_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
 
                                 // retry per section 6.5.3.4
-                                if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) ||
-                                   IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps ) )
+                                if (IsDoubleResultSubnormal(correct2,
+                                                            f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct3,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct4,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct5,
+                                                               f->double_ulps))
                                 {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
+                                    fail = fail && (test != 0.0f);
+                                    if (!fail) err = 0.0f;
                                 }
 
-                                if( IsDoubleSubnormal( s3[j] )  )
+                                if (IsDoubleSubnormal(s3[j]))
                                 { // but you have to know how!
-                                    correct2 = f->dfunc.f_fff( 0.0, 0.0, 0.0f );
-                                    correct3 = f->dfunc.f_fff( -0.0, 0.0, 0.0f );
-                                    correct4 = f->dfunc.f_fff( 0.0, -0.0, 0.0f );
-                                    correct5 = f->dfunc.f_fff( -0.0, -0.0, 0.0f );
-                                    long double correct6 = f->dfunc.f_fff( 0.0, 0.0, -0.0f );
-                                    long double correct7 = f->dfunc.f_fff( -0.0, 0.0, -0.0f );
-                                    long double correct8 = f->dfunc.f_fff( 0.0, -0.0, -0.0f );
-                                    long double correct9 = f->dfunc.f_fff( -0.0, -0.0, -0.0f );
-                                    err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                                    err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                                    err4 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                                    err5 = Bruteforce_Ulp_Error_Double( test, correct5  );
-                                    float err6 = Bruteforce_Ulp_Error_Double( test, correct6  );
-                                    float err7 = Bruteforce_Ulp_Error_Double( test, correct7  );
-                                    float err8 = Bruteforce_Ulp_Error_Double( test, correct8  );
-                                    float err9 = Bruteforce_Ulp_Error_Double( test, correct9  );
-                                    fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) &&
-                                                     (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)) &&
-                                                     (!(fabsf(err5) <= f->double_ulps)) && (!(fabsf(err6) <= f->double_ulps)) &&
-                                                     (!(fabsf(err7) <= f->double_ulps)) && (!(fabsf(err8) <= f->double_ulps)));
-                                    if( fabsf( err2 ) < fabsf(err ) )
-                                        err = err2;
-                                    if( fabsf( err3 ) < fabsf(err ) )
-                                        err = err3;
-                                    if( fabsf( err4 ) < fabsf(err ) )
-                                        err = err4;
-                                    if( fabsf( err5 ) < fabsf(err ) )
-                                        err = err5;
-                                    if( fabsf( err6 ) < fabsf(err ) )
-                                        err = err6;
-                                    if( fabsf( err7 ) < fabsf(err ) )
-                                        err = err7;
-                                    if( fabsf( err8 ) < fabsf(err ) )
-                                        err = err8;
-                                    if( fabsf( err9 ) < fabsf(err ) )
-                                        err = err9;
+                                    correct2 = f->dfunc.f_fff(0.0, 0.0, 0.0f);
+                                    correct3 = f->dfunc.f_fff(-0.0, 0.0, 0.0f);
+                                    correct4 = f->dfunc.f_fff(0.0, -0.0, 0.0f);
+                                    correct5 = f->dfunc.f_fff(-0.0, -0.0, 0.0f);
+                                    long double correct6 =
+                                        f->dfunc.f_fff(0.0, 0.0, -0.0f);
+                                    long double correct7 =
+                                        f->dfunc.f_fff(-0.0, 0.0, -0.0f);
+                                    long double correct8 =
+                                        f->dfunc.f_fff(0.0, -0.0, -0.0f);
+                                    long double correct9 =
+                                        f->dfunc.f_fff(-0.0, -0.0, -0.0f);
+                                    err2 = Bruteforce_Ulp_Error_Double(
+                                        test, correct2);
+                                    err3 = Bruteforce_Ulp_Error_Double(
+                                        test, correct3);
+                                    err4 = Bruteforce_Ulp_Error_Double(
+                                        test, correct4);
+                                    err5 = Bruteforce_Ulp_Error_Double(
+                                        test, correct5);
+                                    float err6 = Bruteforce_Ulp_Error_Double(
+                                        test, correct6);
+                                    float err7 = Bruteforce_Ulp_Error_Double(
+                                        test, correct7);
+                                    float err8 = Bruteforce_Ulp_Error_Double(
+                                        test, correct8);
+                                    float err9 = Bruteforce_Ulp_Error_Double(
+                                        test, correct9);
+                                    fail = fail // any of correct2..9 may match
+                                        && ((!(fabsf(err2) <= f->double_ulps))
+                                            && (!(fabsf(err3)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err4)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err5)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err6)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err7)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err8)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err9)
+                                                  <= f->double_ulps)));
+                                    if (fabsf(err2) < fabsf(err)) err = err2;
+                                    if (fabsf(err3) < fabsf(err)) err = err3;
+                                    if (fabsf(err4) < fabsf(err)) err = err4;
+                                    if (fabsf(err5) < fabsf(err)) err = err5;
+                                    if (fabsf(err6) < fabsf(err)) err = err6;
+                                    if (fabsf(err7) < fabsf(err)) err = err7;
+                                    if (fabsf(err8) < fabsf(err)) err = err8;
+                                    if (fabsf(err9) < fabsf(err)) err = err9;
 
                                     // retry per section 6.5.3.4
-                                    if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )  ||
-                                       IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps )  ||
-                                       IsDoubleResultSubnormal( correct6, f->double_ulps ) || IsDoubleResultSubnormal( correct7, f->double_ulps )  ||
-                                       IsDoubleResultSubnormal( correct8, f->double_ulps ) || IsDoubleResultSubnormal( correct9, f->double_ulps )  )
+                                    if (IsDoubleResultSubnormal(correct2,
+                                                                f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct3, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct4, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct5, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct6, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct7, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct8, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct9, f->double_ulps))
                                     {
-                                        fail = fail && ( test != 0.0f);
-                                        if( ! fail )
-                                            err = 0.0f;
+                                        fail = fail && (test != 0.0f);
+                                        if (!fail) err = 0.0f;
                                     }
                                 }
                             }
-                            else if( IsDoubleSubnormal( s3[j] ) )
+                            else if (IsDoubleSubnormal(s3[j]))
                             {
-                                correct2 = f->dfunc.f_fff( 0.0, s2[j], 0.0 );
-                                correct3 = f->dfunc.f_fff( -0.0, s2[j], 0.0 );
-                                long double correct4 = f->dfunc.f_fff( 0.0,  s2[j], -0.0 );
-                                long double correct5 = f->dfunc.f_fff( -0.0, s2[j], -0.0 );
-                                err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                                err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                                float err4 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                                float err5 = Bruteforce_Ulp_Error_Double( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) &&
-                                                 (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
+                                correct2 = f->dfunc.f_fff(0.0, s2[j], 0.0);
+                                correct3 = f->dfunc.f_fff(-0.0, s2[j], 0.0);
+                                long double correct4 =
+                                    f->dfunc.f_fff(0.0, s2[j], -0.0);
+                                long double correct5 =
+                                    f->dfunc.f_fff(-0.0, s2[j], -0.0);
+                                err2 =
+                                    Bruteforce_Ulp_Error_Double(test, correct2);
+                                err3 =
+                                    Bruteforce_Ulp_Error_Double(test, correct3);
+                                float err4 =
+                                    Bruteforce_Ulp_Error_Double(test, correct4);
+                                float err5 =
+                                    Bruteforce_Ulp_Error_Double(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= f->double_ulps))
+                                        && (!(fabsf(err3) <= f->double_ulps))
+                                        && (!(fabsf(err4) <= f->double_ulps))
+                                        && (!(fabsf(err5) <= f->double_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
 
                                 // retry per section 6.5.3.4
-                                if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )  ||
-                                   IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps ) )
+                                if (IsDoubleResultSubnormal(correct2,
+                                                            f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct3,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct4,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct5,
+                                                               f->double_ulps))
                                 {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
+                                    fail = fail && (test != 0.0f);
+                                    if (!fail) err = 0.0f;
                                 }
                             }
                         }
-                        else if( fail && IsDoubleSubnormal( s2[j] ) )
+                        else if (fail && IsDoubleSubnormal(s2[j]))
                         {
-                            long double correct2 = f->dfunc.f_fff( s[j], 0.0, s3[j] );
-                            long double correct3 = f->dfunc.f_fff( s[j], -0.0, s3[j] );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                            long double correct2 =
+                                f->dfunc.f_fff(s[j], 0.0, s3[j]);
+                            long double correct3 =
+                                f->dfunc.f_fff(s[j], -0.0, s3[j]);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= f->double_ulps))
+                                    && (!(fabsf(err3) <= f->double_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, f->double_ulps )  || IsDoubleResultSubnormal( correct3, f->double_ulps ) )
+                            if (IsDoubleResultSubnormal(correct2,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correct3,
+                                                           f->double_ulps))
                             {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
                             }
 
-                            //try with second two args as zero
-                            if( IsDoubleSubnormal( s3[j] ) )
+                            // try with second two args as zero
+                            if (IsDoubleSubnormal(s3[j]))
                             {
-                                correct2 = f->dfunc.f_fff( s[j], 0.0, 0.0 );
-                                correct3 = f->dfunc.f_fff( s[j], -0.0, 0.0 );
-                                long double correct4 = f->dfunc.f_fff( s[j], 0.0, -0.0 );
-                                long double correct5 = f->dfunc.f_fff( s[j], -0.0, -0.0 );
-                                err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                                err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                                float err4 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                                float err5 = Bruteforce_Ulp_Error_Double( test, correct5  );
-                                fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) &&
-                                                 (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)));
-                                if( fabsf( err2 ) < fabsf(err ) )
-                                    err = err2;
-                                if( fabsf( err3 ) < fabsf(err ) )
-                                    err = err3;
-                                if( fabsf( err4 ) < fabsf(err ) )
-                                    err = err4;
-                                if( fabsf( err5 ) < fabsf(err ) )
-                                    err = err5;
+                                correct2 = f->dfunc.f_fff(s[j], 0.0, 0.0);
+                                correct3 = f->dfunc.f_fff(s[j], -0.0, 0.0);
+                                long double correct4 =
+                                    f->dfunc.f_fff(s[j], 0.0, -0.0);
+                                long double correct5 =
+                                    f->dfunc.f_fff(s[j], -0.0, -0.0);
+                                err2 =
+                                    Bruteforce_Ulp_Error_Double(test, correct2);
+                                err3 =
+                                    Bruteforce_Ulp_Error_Double(test, correct3);
+                                float err4 =
+                                    Bruteforce_Ulp_Error_Double(test, correct4);
+                                float err5 =
+                                    Bruteforce_Ulp_Error_Double(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= f->double_ulps))
+                                        && (!(fabsf(err3) <= f->double_ulps))
+                                        && (!(fabsf(err4) <= f->double_ulps))
+                                        && (!(fabsf(err5) <= f->double_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
 
                                 // retry per section 6.5.3.4
-                                if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) ||
-                                   IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps ) )
+                                if (IsDoubleResultSubnormal(correct2,
+                                                            f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct3,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct4,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct5,
+                                                               f->double_ulps))
                                 {
-                                    fail = fail && ( test != 0.0f);
-                                    if( ! fail )
-                                        err = 0.0f;
+                                    fail = fail && (test != 0.0f);
+                                    if (!fail) err = 0.0f;
                                 }
                             }
                         }
-                        else if( fail && IsDoubleSubnormal(s3[j]) )
+                        else if (fail && IsDoubleSubnormal(s3[j]))
                         {
-                            long double correct2 = f->dfunc.f_fff( s[j], s2[j], 0.0 );
-                            long double correct3 = f->dfunc.f_fff( s[j], s2[j], -0.0 );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                            long double correct2 =
+                                f->dfunc.f_fff(s[j], s2[j], 0.0);
+                            long double correct3 =
+                                f->dfunc.f_fff(s[j], s2[j], -0.0);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= f->double_ulps))
+                                    && (!(fabsf(err3) <= f->double_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) )
+                            if (IsDoubleResultSubnormal(correct2,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correct3,
+                                                           f->double_ulps))
                             {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
                             }
                         }
                     }
 
-                    if( fabsf(err ) > maxError )
+                    if (fabsf(err) > maxError)
                     {
                         maxError = fabsf(err);
                         maxErrorVal = s[j];
@@ -1238,9 +1647,12 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
                         maxErrorVal3 = s3[j];
                     }
 
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\nERROR: %sD%s: %f ulp error at {%.13la, %.13la, %.13la}: *%.13la vs. %.13la\n", f->name, sizeNames[k], err, s[j], s2[j], s3[j], ((double*) gOut_Ref)[j], test );
+                        vlog_error("\nERROR: %sD%s: %f ulp error at {%.13la, "
+                                   "%.13la, %.13la}: *%.13la vs. %.13la\n",
+                                   f->name, sizeNames[k], err, s[j], s2[j],
+                                   s3[j], ((double *)gOut_Ref)[j], test);
                         error = -1;
                         goto exit;
                     }
@@ -1248,107 +1660,136 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-           } else
-           {
-              vlog("." );
-           }
-           fflush(stdout);
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
         double *p2 = (double *)gIn2;
         double *p3 = (double *)gIn3;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(double); j++)
         {
             p[j] = DoubleFromUInt32(genrand_int32(d));
             p2[j] = DoubleFromUInt32(genrand_int32(d));
             p3[j] = DoubleFromUInt32(genrand_int32(d));
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeof( cl_double ) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize  rounded up
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; }
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, maxErrorVal3 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
+             maxErrorVal3);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -1356,5 +1797,3 @@ exit:
 
     return error;
 }
-
-
diff --git a/test_conformance/math_brute_force/unary.cpp b/test_conformance/math_brute_force/unary.cpp
index 8ef33119..61a8546b 100644
--- a/test_conformance/math_brute_force/unary.cpp
+++ b/test_conformance/math_brute_force/unary.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -18,8 +18,8 @@
 #include <string.h>
 #include "FunctionList.h"
 
-#if defined( __APPLE__ )
-    #include <sys/time.h>
+#if defined(__APPLE__)
+#include <sys/time.h>
 #endif
 
 int TestFunc_Float_Float(const Func *f, MTdata, bool relaxedMode);
@@ -37,61 +37,77 @@ static int BuildKernelDouble(const char *name, int vectorSize,
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
 {
-    const char *c[] = {
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i] );\n"
-                            "}\n"
-                        };
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* in)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       float3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       f0 = ", name, "( f0 );\n"
-                            "       vstore3( f0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       float3 f0;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       f0 = ", name, "( f0 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = f0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = f0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in)\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global float* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       f0 = ",
+        name,
+        "( f0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -101,63 +117,80 @@ static int BuildKernelDouble(const char *name, int vectorSize,
                              cl_uint kernel_count, cl_kernel *k, cl_program *p,
                              bool relaxedMode)
 {
-    const char *c[] = {     "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i] );\n"
-                            "}\n"
-                        };
-
-    const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global double* in)\n"
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in)\n"
                         "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   if( i + 1 < get_global_size(0) )\n"
-                        "   {\n"
-                        "       double3 f0 = vload3( 0, in + 3 * i );\n"
-                        "       f0 = ", name, "( f0 );\n"
-                        "       vstore3( f0, 0, out + 3*i );\n"
-                        "   }\n"
-                        "   else\n"
-                        "   {\n"
-                        "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                        "       double3 f0;\n"
-                        "       switch( parity )\n"
-                        "       {\n"
-                        "           case 1:\n"
-                        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
-                        "               break;\n"
-                        "           case 0:\n"
-                        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-                        "               break;\n"
-                        "       }\n"
-                        "       f0 = ", name, "( f0 );\n"
-                        "       switch( parity )\n"
-                        "       {\n"
-                        "           case 0:\n"
-                        "               out[3*i+1] = f0.y; \n"
-                        "               // fall through\n"
-                        "           case 1:\n"
-                        "               out[3*i] = f0.x; \n"
-                        "               break;\n"
-                        "       }\n"
-                        "   }\n"
-                        "}\n"
-                    };
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 f0 = vload3( 0, in + 3 * i );\n"
+        "       f0 = ",
+        name,
+        "( f0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
                        relaxedMode);
@@ -165,91 +198,102 @@ static int BuildKernelDouble(const char *name, int vectorSize,
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_uint     kernel_count;
-    cl_kernel   **kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count;
+    cl_kernel **kernels;
+    cl_program *programs;
+    const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernel_count,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
                              info->kernels[i], info->programs + i,
                              info->relaxedMode);
 }
 
-//Thread specific data for a worker thread
+// Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
-    cl_mem      inBuf;                              // input buffer for the thread
-    cl_mem      outBuf[ VECTOR_SIZE_COUNT ];        // output buffers for the thread
-    float       maxError;                           // max error value. Init to 0.
-    double      maxErrorValue;                      // position of the max error value.  Init to 0.
-    cl_command_queue tQueue;                        // per thread command queue to improve performance
-}ThreadInfo;
+    cl_mem inBuf; // input buffer for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    float maxError; // max error value. Init to 0.
+    double maxErrorValue; // position of the max error value.  Init to 0.
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
 
 typedef struct TestInfo
 {
-    size_t      subBufferSize;                      // Size of the sub-buffer in elements
-    const Func  *f;                                 // A pointer to the function info
-    cl_program  programs[ VECTOR_SIZE_COUNT ];      // programs for various vector sizes
-    cl_kernel   *k[VECTOR_SIZE_COUNT ];             // arrays of thread-specific kernels for each worker thread:  k[vector_size][thread_id]
-    ThreadInfo  *tinfo;                             // An array of thread specific information for each worker thread
-    cl_uint     threadCount;                        // Number of worker threads
-    cl_uint     jobCount;                           // Number of jobs
-    cl_uint     step;                               // step between each chunk and the next.
-    cl_uint     scale;                              // stride between individual test values
-    float       ulps;                               // max_allowed ulps
-    int         ftz;                                // non-zero if running in flush to zero mode
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    float ulps; // max_allowed ulps
+    int ftz; // non-zero if running in flush to zero mode
 
-    int         isRangeLimited;                     // 1 if the function is only to be evaluated over a range
-    float       half_sin_cos_tan_limit;
+    int isRangeLimited; // 1 if the function is only to be evaluated over a
+                        // range
+    float half_sin_cos_tan_limit;
     bool relaxedMode; // True if test is to be run in relaxed mode, false
                       // otherwise.
-}TestInfo;
+} TestInfo;
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p );
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
-    float       maxError = 0.0f;
-    double      maxErrorVal = 0.0;
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
     int skipTestingRelaxed = (relaxedMode && strcmp(f->name, "tan") == 0);
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
 
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
     if (gWimpyMode)
     {
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_float)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -259,69 +303,89 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 
     test_info.f = f;
     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
-    test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     test_info.relaxedMode = relaxedMode;
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
-
     }
 
     // Check for special cases for unary float
     test_info.isRangeLimited = 0;
     test_info.half_sin_cos_tan_limit = 0;
-    if( 0 == strcmp( f->name, "half_sin") || 0 == strcmp( f->name, "half_cos") )
+    if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
     {
         test_info.isRangeLimited = 1;
-        test_info.half_sin_cos_tan_limit = 1.0f + test_info.ulps * (FLT_EPSILON/2.0f);             // out of range results from finite inputs must be in [-1,1]
+        test_info.half_sin_cos_tan_limit = 1.0f
+            + test_info.ulps
+                * (FLT_EPSILON / 2.0f); // out of range results from finite
+                                        // inputs must be in [-1,1]
     }
-    else if( 0 == strcmp( f->name, "half_tan"))
+    else if (0 == strcmp(f->name, "half_tan"))
     {
         test_info.isRangeLimited = 1;
-        test_info.half_sin_cos_tan_limit = INFINITY;             // out of range resut from finite inputs must be numeric
+        test_info.half_sin_cos_tan_limit =
+            INFINITY; // out of range result from finite inputs must be numeric
     }
 
     // Init the kernels
@@ -330,141 +394,156 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
             goto exit;
     }
 
-    if( !gSkipCorrectnessTesting || skipTestingRelaxed)
+    if (!gSkipCorrectnessTesting || skipTestingRelaxed)
     {
-        error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info );
+        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
-            if( test_info.tinfo[i].maxError > maxError )
+            if (test_info.tinfo[i].maxError > maxError)
             {
                 maxError = test_info.tinfo[i].maxError;
                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
             }
         }
 
-        if( error )
-            goto exit;
+        if (error) goto exit;
 
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
 
-        if( skipTestingRelaxed )
+        if (skipTestingRelaxed)
         {
-          vlog(" (rlx skip correctness testing)\n");
-          goto exit;
+            vlog(" (rlx skip correctness testing)\n");
+            goto exit;
         }
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
-        if( strstr( f->name, "exp" ) || strstr( f->name, "sin" ) || strstr( f->name, "cos" ) || strstr( f->name, "tan" ) )
-            for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ )
-                ((float*)p)[j] = (float) genrand_real1(d);
-        else if( strstr( f->name, "log" ) )
-            for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ )
+        if (strstr(f->name, "exp") || strstr(f->name, "sin")
+            || strstr(f->name, "cos") || strstr(f->name, "tan"))
+            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+                ((float *)p)[j] = (float)genrand_real1(d);
+        else if (strstr(f->name, "log"))
+            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
                 p[j] = genrand_int32(d) & 0x7fffffff;
         else
-            for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ )
+            for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
                 p[j] = genrand_int32(d);
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError( test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double current_time = SubtractTime( endTime, startTime );
+                double current_time = SubtractTime(endTime, startTime);
                 sum += current_time;
-                if( current_time < bestTime )
-                    bestTime = current_time;
+                if (current_time < bestTime) bestTime = current_time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ %a", maxError, maxErrorVal );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
+    vlog("\n");
 
 exit:
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
             clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
 
-static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t  buffer_elements = job->subBufferSize;
-    size_t  buffer_size = buffer_elements * sizeof( cl_float );
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_float);
     cl_uint scale = job->scale;
-    cl_uint base = job_id * (cl_uint) job->step;
+    cl_uint base = job_id * (cl_uint)job->step;
     ThreadInfo *tinfo = job->tinfo + thread_id;
-    fptr    func = job->f->func;
-    const char * fname = job->f->name;
+    fptr func = job->f->func;
+    const char *fname = job->f->name;
     bool relaxedMode = job->relaxedMode;
     float ulps = getAllowedUlpError(job->f, relaxedMode);
     if (relaxedMode)
@@ -480,153 +559,177 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
     int ftz = job->ftz;
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_uint  *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_uint *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (uint32_t *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
     // Write the new values to the input array
-    cl_uint *p = (cl_uint*) gIn + thread_id * buffer_elements;
-    for( j = 0; j < buffer_elements; j++ )
+    cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
     {
-      p[j] = base + j * scale;
-      if (relaxedMode)
-      {
-        float p_j = *(float *) &p[j];
-        if ( strcmp(fname,"sin")==0 || strcmp(fname,"cos")==0 )  //the domain of the function is [-pi,pi]
+        p[j] = base + j * scale;
+        if (relaxedMode)
         {
-            if (fabs(p_j) > M_PI) ((float *)p)[j] = NAN;
-        }
+            float p_j = *(float *)&p[j];
+            if (strcmp(fname, "sin") == 0
+                || strcmp(fname, "cos")
+                    == 0) // the domain of the function is [-pi,pi]
+            {
+                if (fabs(p_j) > M_PI) ((float *)p)[j] = NAN;
+            }
 
-        if ( strcmp( fname, "reciprocal" ) == 0 )
-        {
-            const float l_limit = HEX_FLT(+, 1, 0, -, 126);
-            const float u_limit = HEX_FLT(+, 1, 0, +, 126);
+            if (strcmp(fname, "reciprocal") == 0)
+            {
+                const float l_limit = HEX_FLT(+, 1, 0, -, 126);
+                const float u_limit = HEX_FLT(+, 1, 0, +, 126);
 
-            if (fabs(p_j) < l_limit
-                || fabs(p_j)
-                    > u_limit) // the domain of the function is [2^-126,2^126]
-                ((float *)p)[j] = NAN;
+                if (fabs(p_j) < l_limit
+                    || fabs(p_j) > u_limit) // the domain of the function is
+                                            // [2^-126,2^126]
+                    ((float *)p)[j] = NAN;
+            }
         }
-      }
     }
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         return error;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             return error;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             return error;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             return error;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
-
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
             return error;
         }
     }
 
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
-        return CL_SUCCESS;
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
 
-    //Calculate the correctly rounded reference result
+    // Calculate the correctly rounded reference result
     float *r = (float *)gOut_Ref + thread_id * buffer_elements;
     float *s = (float *)p;
-    for( j = 0; j < buffer_elements; j++ )
-        r[j] = (float) func.f_f( s[j] );
+    for (j = 0; j < buffer_elements; j++) r[j] = (float)func.f_f(s[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (uint32_t *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
     // Wait for the last buffer
-    out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (uint32_t *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         return error;
     }
 
-    //Verify data
+    // Verify data
     uint32_t *t = (uint32_t *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
-        for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             uint32_t *q = out[k];
 
             // If we aren't getting the correctly rounded result
-            if( t[j] != q[j] )
+            if (t[j] != q[j])
             {
-                float test = ((float*) q)[j];
-                double correct = func.f_f( s[j] );
-                float err = Ulp_Error( test, correct );
-                float abs_error = Abs_Error( test, correct );
+                float test = ((float *)q)[j];
+                double correct = func.f_f(s[j]);
+                float err = Ulp_Error(test, correct);
+                float abs_error = Abs_Error(test, correct);
                 int fail = 0;
                 int use_abs_error = 0;
 
-                // it is possible for the output to not match the reference result but for Ulp_Error
-                // to be zero, for example -1.#QNAN vs. 1.#QNAN. In such cases there is no failure
+                // it is possible for the output to not match the reference
+                // result but for Ulp_Error to be zero, for example -1.#QNAN
+                // vs. 1.#QNAN. In such cases there is no failure
                 if (err == 0.0f)
                 {
                     fail = 0;
                 }
                 else if (relaxedMode)
                 {
-                    if ( strcmp(fname,"sin")==0 || strcmp(fname,"cos")==0 )
+                    if (strcmp(fname, "sin") == 0 || strcmp(fname, "cos") == 0)
                     {
-                        fail = ! (fabsf(abs_error) <= ulps);
+                        fail = !(fabsf(abs_error) <= ulps);
                         use_abs_error = 1;
                     }
                     if (strcmp(fname, "sinpi") == 0
@@ -639,12 +742,12 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
                         }
                     }
 
-                    if ( strcmp(fname, "reciprocal") == 0 )
+                    if (strcmp(fname, "reciprocal") == 0)
                     {
-                        fail = ! (fabsf(err) <= ulps);
+                        fail = !(fabsf(err) <= ulps);
                     }
 
-                    if ( strcmp(fname, "exp") == 0 || strcmp(fname, "exp2") == 0 )
+                    if (strcmp(fname, "exp") == 0 || strcmp(fname, "exp2") == 0)
                     {
                         float exp_error = ulps;
 
@@ -653,153 +756,171 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
                             exp_error += floor(fabs(2 * s[j]));
                         }
 
-                        fail = ! (fabsf(err) <= exp_error);
+                        fail = !(fabsf(err) <= exp_error);
                         ulps = exp_error;
                     }
-                    if (strcmp(fname, "tan") == 0) {
+                    if (strcmp(fname, "tan") == 0)
+                    {
 
-                        if(  !gFastRelaxedDerived )
+                        if (!gFastRelaxedDerived)
                         {
-                            fail = ! (fabsf(err) <= ulps);
+                            fail = !(fabsf(err) <= ulps);
                         }
-                        // Else fast math derived implementation does not require ULP verification
+                        // Else fast math derived implementation does not
+                        // require ULP verification
                     }
                     if (strcmp(fname, "exp10") == 0)
                     {
-                        if(  !gFastRelaxedDerived )
+                        if (!gFastRelaxedDerived)
                         {
-                            fail = ! (fabsf(err) <= ulps);
+                            fail = !(fabsf(err) <= ulps);
                         }
-                        // Else fast math derived implementation does not require ULP verification
+                        // Else fast math derived implementation does not
+                        // require ULP verification
                     }
                     if (strcmp(fname, "log") == 0 || strcmp(fname, "log2") == 0
                         || strcmp(fname, "log10") == 0)
                     {
-                        if( s[j] >= 0.5 && s[j] <= 2 )
+                        if (s[j] >= 0.5 && s[j] <= 2)
                         {
-                            fail = ! (fabsf(abs_error) <= ulps );
+                            fail = !(fabsf(abs_error) <= ulps);
                         }
                         else
                         {
-                            ulps = gIsEmbedded ? job->f->float_embedded_ulps : job->f->float_ulps;
-                            fail = ! (fabsf(err) <= ulps);
+                            ulps = gIsEmbedded ? job->f->float_embedded_ulps
+                                               : job->f->float_ulps;
+                            fail = !(fabsf(err) <= ulps);
                         }
                     }
 
 
                     // fast-relaxed implies finite-only
-                    if( IsFloatInfinity(correct) || IsFloatNaN(correct)     ||
-                        IsFloatInfinity(s[j])    || IsFloatNaN(s[j])        ) {
+                    if (IsFloatInfinity(correct) || IsFloatNaN(correct)
+                        || IsFloatInfinity(s[j]) || IsFloatNaN(s[j]))
+                    {
                         fail = 0;
                         err = 0;
                     }
                 }
                 else
                 {
-                  fail = ! (fabsf(err) <= ulps);
+                    fail = !(fabsf(err) <= ulps);
                 }
 
                 // half_sin/cos/tan are only valid between +-2**16, Inf, NaN
-                if( isRangeLimited && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) && fabsf(s[j]) < INFINITY )
+                if (isRangeLimited
+                    && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16)
+                    && fabsf(s[j]) < INFINITY)
                 {
-                    if( fabsf( test ) <= half_sin_cos_tan_limit )
+                    if (fabsf(test) <= half_sin_cos_tan_limit)
                     {
                         err = 0;
                         fail = 0;
                     }
                 }
 
-                if( fail )
+                if (fail)
                 {
-                    if( ftz )
+                    if (ftz)
                     {
-                        typedef int (*CheckForSubnormal) (double,float); // If we are in fast relaxed math, we have a different calculation for the subnormal threshold.
+                        typedef int (*CheckForSubnormal)(
+                            double, float); // If we are in fast relaxed math,
+                                            // we have a different calculation
+                                            // for the subnormal threshold.
                         CheckForSubnormal isFloatResultSubnormalPtr;
 
                         if (relaxedMode)
                         {
-                          isFloatResultSubnormalPtr = &IsFloatResultSubnormalAbsError;
+                            isFloatResultSubnormalPtr =
+                                &IsFloatResultSubnormalAbsError;
                         }
                         else
                         {
-                          isFloatResultSubnormalPtr = &IsFloatResultSubnormal;
+                            isFloatResultSubnormalPtr = &IsFloatResultSubnormal;
                         }
                         // retry per section 6.5.3.2
-                        if( (*isFloatResultSubnormalPtr)(correct, ulps) )
+                        if ((*isFloatResultSubnormalPtr)(correct, ulps))
                         {
-                            fail = fail && ( test != 0.0f );
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
                         }
 
                         // retry per section 6.5.3.3
-                        if( IsFloatSubnormal( s[j] ) )
+                        if (IsFloatSubnormal(s[j]))
                         {
-                            double correct2 = func.f_f( 0.0 );
-                            double correct3 = func.f_f( -0.0 );
+                            double correct2 = func.f_f(0.0);
+                            double correct3 = func.f_f(-0.0);
                             float err2;
                             float err3;
-                            if( use_abs_error )
+                            if (use_abs_error)
                             {
-                              err2 = Abs_Error( test, correct2  );
-                              err3 = Abs_Error( test, correct3  );
+                                err2 = Abs_Error(test, correct2);
+                                err3 = Abs_Error(test, correct3);
                             }
                             else
                             {
-                              err2 = Ulp_Error( test, correct2  );
-                              err3 = Ulp_Error( test, correct3  );
+                                err2 = Ulp_Error(test, correct2);
+                                err3 = Ulp_Error(test, correct3);
                             }
-                            fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if( (*isFloatResultSubnormalPtr)(correct2, ulps ) || (*isFloatResultSubnormalPtr)(correct3, ulps ) )
+                            if ((*isFloatResultSubnormalPtr)(correct2, ulps)
+                                || (*isFloatResultSubnormalPtr)(correct3, ulps))
                             {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
                             }
                         }
                     }
                 }
-                if( fabsf(err ) > tinfo->maxError )
+                if (fabsf(err) > tinfo->maxError)
                 {
                     tinfo->maxError = fabsf(err);
                     tinfo->maxErrorValue = s[j];
                 }
-                if( fail )
+                if (fail)
                 {
-                    vlog_error( "\nERROR: %s%s: %f ulp error at %a (0x%8.8x): *%a vs. %a\n", job->f->name, sizeNames[k], err, ((float*) s)[j], ((uint32_t*) s)[j], ((float*) t)[j], test);
+                    vlog_error("\nERROR: %s%s: %f ulp error at %a (0x%8.8x): "
+                               "*%a vs. %a\n",
+                               job->f->name, sizeNames[k], err, ((float *)s)[j],
+                               ((uint32_t *)s)[j], ((float *)t)[j], test);
                     return -1;
                 }
             }
         }
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
         if (gVerboseBruteForce)
         {
-            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->ulps, job->threadCount);
-        } else
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
         {
-            vlog("." );
+            vlog(".");
         }
         fflush(stdout);
     }
@@ -808,17 +929,16 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
 }
 
 
-
-static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
 {
-    const TestInfo *job = (const TestInfo *) data;
-    size_t  buffer_elements = job->subBufferSize;
-    size_t  buffer_size = buffer_elements * sizeof( cl_double );
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_double);
     cl_uint scale = job->scale;
-    cl_uint base = job_id * (cl_uint) job->step;
+    cl_uint base = job_id * (cl_uint)job->step;
     ThreadInfo *tinfo = job->tinfo + thread_id;
-    float   ulps = job->ulps;
-    dptr    func = job->f->dfunc;
+    float ulps = job->ulps;
+    dptr func = job->f->dfunc;
     cl_uint j, k;
     cl_int error;
     int ftz = job->ftz;
@@ -826,190 +946,221 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
     Force64BitFPUPrecision();
 
     // start the map of the output arrays
-    cl_event e[ VECTOR_SIZE_COUNT ];
-    cl_ulong *out[ VECTOR_SIZE_COUNT ];
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_ulong *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
-        if( error || NULL == out[j])
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
     // Write the new values to the input array
-    cl_double *p = (cl_double*) gIn + thread_id * buffer_elements;
-    for( j = 0; j < buffer_elements; j++ )
-        p[j] = DoubleFromUInt32( base + j * scale);
+    cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+        p[j] = DoubleFromUInt32(base + j * scale);
 
-    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
     {
-        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
         return error;
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        //Wait for the map to finish
-        if( (error = clWaitForEvents(1, e + j) ))
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
         {
-            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
             return error;
         }
-        if( (error = clReleaseEvent( e[j] ) ))
+        if ((error = clReleaseEvent(e[j])))
         {
-            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
             return error;
         }
 
-        // Fill the result buffer with garbage, so that old results don't carry over
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
         uint32_t pattern = 0xffffdead;
         memset_pattern4(out[j], &pattern, buffer_size);
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error );
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
             return error;
         }
 
         // run the kernel
-        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
         cl_program program = job->programs[j];
 
-        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
-        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
-
-        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
         {
-            vlog_error( "FAILED -- could not execute kernel\n" );
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
             return error;
         }
     }
 
 
     // Get that moving
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 2 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
-    if( gSkipCorrectnessTesting )
-        return CL_SUCCESS;
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
 
-    //Calculate the correctly rounded reference result
+    // Calculate the correctly rounded reference result
     cl_double *r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
     cl_double *s = (cl_double *)p;
-    for( j = 0; j < buffer_elements; j++ )
-        r[j] = (cl_double) func.f_f( s[j] );
+    for (j = 0; j < buffer_elements; j++) r[j] = (cl_double)func.f_f(s[j]);
 
-    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
-    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-        if( error || NULL == out[j] )
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
         {
-            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
             return error;
         }
     }
     // Wait for the last buffer
-    out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
-    if( error || NULL == out[j] )
+    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
     {
-        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
         return error;
     }
 
 
-    //Verify data
+    // Verify data
     cl_ulong *t = (cl_ulong *)r;
-    for( j = 0; j < buffer_elements; j++ )
+    for (j = 0; j < buffer_elements; j++)
     {
-        for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_ulong *q = out[k];
 
             // If we aren't getting the correctly rounded result
-            if( t[j] != q[j] )
+            if (t[j] != q[j])
             {
-                cl_double test = ((cl_double*) q)[j];
-                long double correct = func.f_f( s[j] );
-                float err = Bruteforce_Ulp_Error_Double( test, correct );
-                int fail = ! (fabsf(err) <= ulps);
+                cl_double test = ((cl_double *)q)[j];
+                long double correct = func.f_f(s[j]);
+                float err = Bruteforce_Ulp_Error_Double(test, correct);
+                int fail = !(fabsf(err) <= ulps);
 
-                if( fail )
+                if (fail)
                 {
-                    if( ftz )
+                    if (ftz)
                     {
                         // retry per section 6.5.3.2
-                        if( IsDoubleResultSubnormal(correct, ulps) )
+                        if (IsDoubleResultSubnormal(correct, ulps))
                         {
-                            fail = fail && ( test != 0.0f );
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
                         }
 
                         // retry per section 6.5.3.3
-                        if( IsDoubleSubnormal( s[j] ) )
+                        if (IsDoubleSubnormal(s[j]))
                         {
-                            long double correct2 = func.f_f( 0.0L );
-                            long double correct3 = func.f_f( -0.0L );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct2  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            fail =  fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
-                            if( fabsf( err2 ) < fabsf(err ) )
-                                err = err2;
-                            if( fabsf( err3 ) < fabsf(err ) )
-                                err = err3;
+                            long double correct2 = func.f_f(0.0L);
+                            long double correct3 = func.f_f(-0.0L);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal(correct2, ulps ) || IsDoubleResultSubnormal(correct3, ulps ) )
+                            if (IsDoubleResultSubnormal(correct2, ulps)
+                                || IsDoubleResultSubnormal(correct3, ulps))
                             {
-                                fail = fail && ( test != 0.0f);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
                             }
                         }
                     }
                 }
-                if( fabsf(err ) > tinfo->maxError )
+                if (fabsf(err) > tinfo->maxError)
                 {
                     tinfo->maxError = fabsf(err);
                     tinfo->maxErrorValue = s[j];
                 }
-                if( fail )
+                if (fail)
                 {
-                    vlog_error( "\nERROR: %s%s: %f ulp error at %.13la (0x%16.16llx): *%.13la vs. %.13la\n", job->f->name, sizeNames[k], err, ((cl_double*) gIn)[j], ((cl_ulong*) gIn)[j], ((cl_double*) gOut_Ref)[j], test );
+                    vlog_error("\nERROR: %s%s: %f ulp error at %.13la "
+                               "(0x%16.16llx): *%.13la vs. %.13la\n",
+                               job->f->name, sizeNames[k], err,
+                               ((cl_double *)gIn)[j], ((cl_ulong *)gIn)[j],
+                               ((cl_double *)gOut_Ref)[j], test);
                     return -1;
                 }
             }
         }
     }
 
-    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
         {
-            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
             return error;
         }
     }
 
-    if( (error = clFlush(tinfo->tQueue) ))
-        vlog( "clFlush 3 failed\n" );
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
 
-    if( 0 == ( base & 0x0fffffff) )
+    if (0 == (base & 0x0fffffff))
     {
         if (gVerboseBruteForce)
         {
-            vlog("base:%14u step:%10u scale:%10zd buf_elements:%10u ulps:%5.3f ThreadCount:%2u\n", base, job->step, buffer_elements, job->scale, job->ulps, job->threadCount);
-        } else
+            vlog("base:%14u step:%10u scale:%10zd buf_elements:%10u ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, buffer_elements, job->scale, job->ulps,
+                 job->threadCount);
+        }
+        else
         {
-            vlog("." );
+            vlog(".");
         }
         fflush(stdout);
     }
@@ -1019,33 +1170,36 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
 
 int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 {
-    TestInfo    test_info;
-    cl_int      error;
-    size_t      i, j;
-    float       maxError = 0.0f;
-    double      maxErrorVal = 0.0;
-#if defined( __APPLE__ )
-    struct timeval  time_val;
-    gettimeofday( &time_val, NULL );
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+#if defined(__APPLE__)
+    struct timeval time_val;
+    gettimeofday(&time_val, NULL);
     double start_time = time_val.tv_sec + 1e-6 * time_val.tv_usec;
     double end_time;
 #endif
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
     // Init test_info
-    memset( &test_info, 0, sizeof( test_info ) );
+    memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
     if (gWimpyMode)
     {
-        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_double)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
-        //there was overflow
+        // there was overflow
         test_info.jobCount = 1;
     }
     else
@@ -1058,52 +1212,69 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     test_info.ftz = f->ftz || gForceFTZ;
     test_info.relaxedMode = relaxedMode;
 
-    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
-        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
-        test_info.k[i] = (cl_kernel*)malloc( array_size );
-        if( NULL == test_info.k[i] )
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
         {
-            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
             error = CL_OUT_OF_HOST_MEMORY;
             goto exit;
         }
-        memset( test_info.k[i], 0, array_size );
+        memset(test_info.k[i], 0, array_size);
     }
-    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
-    if( NULL == test_info.tinfo )
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
     {
-        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
         error = CL_OUT_OF_HOST_MEMORY;
         goto exit;
     }
-    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
-    for( i = 0; i < test_info.threadCount; i++ )
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
     {
-        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) };
-        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if( error || NULL == test_info.tinfo[i].inBuf)
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
         {
-            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
             goto exit;
         }
 
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            /* Qualcomm fix: 9461 read-write flags must be compatible with parent buffer */
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+            /* Qualcomm fix: 9461 read-write flags must be compatible with
+             * parent buffer */
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
             /* Qualcomm fix: end */
-            if( error || NULL == test_info.tinfo[i].outBuf[j] )
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
+                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
                 goto exit;
             }
         }
-        test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if( NULL == test_info.tinfo[i].tQueue || error )
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
         {
-            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
     }
@@ -1114,136 +1285,147 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             gMinVectorSizeIndex, test_info.threadCount, test_info.k,
             test_info.programs,  f->nameInCode,         relaxedMode
         };
-        if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
-           goto exit;
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
     }
 
-    if( !gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info );
+        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
 
         // Accumulate the arithmetic errors
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
-            if( test_info.tinfo[i].maxError > maxError )
+            if (test_info.tinfo[i].maxError > maxError)
             {
                 maxError = test_info.tinfo[i].maxError;
                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
             }
         }
 
-        if( error )
-            goto exit;
+        if (error) goto exit;
 
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
 
-#if defined( __APPLE__ )
-    gettimeofday( &time_val, NULL);
+#if defined(__APPLE__)
+    gettimeofday(&time_val, NULL);
     end_time = time_val.tv_sec + 1e-6 * time_val.tv_usec;
 #endif
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
 
-        if( strstr( f->name, "exp" ) )
-            for( j = 0; j < BUFFER_SIZE / sizeof( double ); j++ )
+        if (strstr(f->name, "exp"))
+            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
                 p[j] = (double)genrand_real1(d);
-        else if( strstr( f->name, "log" ) )
-            for( j = 0; j < BUFFER_SIZE / sizeof( double ); j++ )
-                p[j] = fabs(DoubleFromUInt32( genrand_int32(d)));
+        else if (strstr(f->name, "log"))
+            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
+                p[j] = fabs(DoubleFromUInt32(genrand_int32(d)));
         else
-            for( j = 0; j < BUFFER_SIZE / sizeof( double ); j++ )
-                p[j] = DoubleFromUInt32( genrand_int32(d) );
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
+            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
+                p[j] = DoubleFromUInt32(genrand_int32(d));
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( i = 0; i < PERF_LOOP_COUNT; i++ )
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double current_time = SubtractTime( endTime, startTime );
+                double current_time = SubtractTime(endTime, startTime);
                 sum += current_time;
-                if( current_time < bestTime )
-                    bestTime = current_time;
+                if (current_time < bestTime) bestTime = current_time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ %a", maxError, maxErrorVal );
+    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
 
-#if defined( __APPLE__ )
-    vlog( "\t(%2.2f seconds)", end_time - start_time );
+#if defined(__APPLE__)
+    vlog("\t(%2.2f seconds)", end_time - start_time);
 #endif
-    vlog( "\n" );
+    vlog("\n");
 
 exit:
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
-        if( test_info.k[i] )
+        if (test_info.k[i])
         {
-            for( j = 0; j < test_info.threadCount; j++ )
+            for (j = 0; j < test_info.threadCount; j++)
                 clReleaseKernel(test_info.k[i][j]);
 
-            free( test_info.k[i] );
+            free(test_info.k[i]);
         }
     }
-    if( test_info.tinfo )
+    if (test_info.tinfo)
     {
-        for( i = 0; i < test_info.threadCount; i++ )
+        for (i = 0; i < test_info.threadCount; i++)
         {
             clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
                 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
             clReleaseCommandQueue(test_info.tinfo[i].tQueue);
         }
 
-        free( test_info.tinfo );
+        free(test_info.tinfo);
     }
 
     return error;
 }
-
-
diff --git a/test_conformance/math_brute_force/unary_two_results.cpp b/test_conformance/math_brute_force/unary_two_results.cpp
index b170e095..d468d26d 100644
--- a/test_conformance/math_brute_force/unary_two_results.cpp
+++ b/test_conformance/math_brute_force/unary_two_results.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -32,64 +32,83 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
-    const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* out2, __global float", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i], out2 + i );\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* out2, __global float",
+                        sizeNames[vectorSize],
+                        "* in)\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i], out2 + i );\n"
+                        "}\n" };
 
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* out2, __global float* in)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       float3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       float3 iout = NAN;\n"
-                            "       f0 = ", name, "( f0, &iout );\n"
-                            "       vstore3( f0, 0, out + 3*i );\n"
-                            "       vstore3( iout, 0, out2 + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       float3 iout = NAN;\n"
-                            "       float3 f0;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       f0 = ", name, "( f0, &iout );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = f0.y; \n"
-                            "               out2[3*i+1] = iout.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = f0.x; \n"
-                            "               out2[3*i] = iout.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global float* out2, __global float* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       float3 iout = NAN;\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "       vstore3( iout, 0, out2 + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 iout = NAN;\n"
+        "       float3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               out2[3*i+1] = iout.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               out2[3*i] = iout.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
@@ -98,91 +117,114 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* out2, __global double", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i], out2 + i );\n"
-                            "}\n"
-                        };
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* out2, __global double",
+                        sizeNames[vectorSize],
+                        "* in)\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i], out2 + i );\n"
+                        "}\n" };
 
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global double* out2, __global double* in)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       double3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       double3 iout = NAN;\n"
-                            "       f0 = ", name, "( f0, &iout );\n"
-                            "       vstore3( f0, 0, out + 3*i );\n"
-                            "       vstore3( iout, 0, out2 + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       double3 iout = NAN;\n"
-                            "       double3 f0;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       f0 = ", name, "( f0, &iout );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = f0.y; \n"
-                            "               out2[3*i+1] = iout.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = f0.x; \n"
-                            "               out2[3*i] = iout.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* out2, __global double* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 f0 = vload3( 0, in + 3 * i );\n"
+        "       double3 iout = NAN;\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "       vstore3( iout, 0, out2 + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 iout = NAN;\n"
+        "       double3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               out2[3*i+1] = iout.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               out2[3*i] = iout.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_kernel   *kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels;
+    cl_program *programs;
+    const char *nameInCode;
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p);
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernels + i,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
                              info->programs + i, info->relaxedMode);
@@ -194,20 +236,20 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
     uint32_t j, k;
     uint32_t l;
     int error;
-    char const * testing_mode;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    char const *testing_mode;
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError0 = 0.0f;
     float maxError1 = 0.0f;
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     float maxErrorVal0 = 0.0f;
     float maxErrorVal1 = 0.0f;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(float), bufferSize);
-    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( float )) + 1);
-    cl_uchar overflow[BUFFER_SIZE / sizeof( float )];
-    int isFract = 0 == strcmp( "fract", f->nameInCode );
-    int skipNanInf = isFract  && ! gInfNanSupport;
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1);
+    cl_uchar overflow[BUFFER_SIZE / sizeof(float)];
+    int isFract = 0 == strcmp("fract", f->nameInCode);
+    int skipNanInf = isFract && !gInfNanSupport;
     float float_ulps = getAllowedUlpError(f, relaxedMode);
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
@@ -215,222 +257,256 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
         return error;
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i,
+       programs + i) ) ) return error;
+    */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
-        if( gWimpyMode )
+        if (gWimpyMode)
         {
-          for( j = 0; j < bufferSize / sizeof( float ); j++ )
-          {
-            p[j] = (uint32_t) i + j * scale;
-            if (relaxedMode && strcmp(f->name, "sincos") == 0)
+            for (j = 0; j < bufferSize / sizeof(float); j++)
             {
-              float pj = *(float *)&p[j];
-              if (fabs(pj) > M_PI) ((float *)p)[j] = NAN;
+                p[j] = (uint32_t)i + j * scale;
+                if (relaxedMode && strcmp(f->name, "sincos") == 0)
+                {
+                    float pj = *(float *)&p[j];
+                    if (fabs(pj) > M_PI) ((float *)p)[j] = NAN;
+                }
             }
-          }
         }
         else
         {
-          for( j = 0; j < bufferSize / sizeof( float ); j++ )
-          {
-            p[j] = (uint32_t) i + j;
-            if (relaxedMode && strcmp(f->name, "sincos") == 0)
+            for (j = 0; j < bufferSize / sizeof(float); j++)
             {
-              float pj = *(float *)&p[j];
-              if (fabs(pj) > M_PI) ((float *)p)[j] = NAN;
+                p[j] = (uint32_t)i + j;
+                if (relaxedMode && strcmp(f->name, "sincos") == 0)
+                {
+                    float pj = *(float *)&p[j];
+                    if (fabs(pj) > M_PI) ((float *)p)[j] = NAN;
+                }
             }
-          }
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
 
             memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL)))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
+                                          bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg(kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
         FPU_mode_type oldMode;
         RoundingMode oldRoundMode = kRoundToNearestEven;
-        if( isFract )
+        if (isFract)
         {
-            //Calculate the correctly rounded reference result
-            memset( &oldMode, 0, sizeof( oldMode ) );
-            if( ftz )
-                ForceFTZ( &oldMode );
+            // Calculate the correctly rounded reference result
+            memset(&oldMode, 0, sizeof(oldMode));
+            if (ftz) ForceFTZ(&oldMode);
 
             // Set the rounding mode to match the device
             if (gIsInRTZMode)
                 oldRoundMode = set_round(kRoundTowardZero, kfloat);
         }
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         float *r = (float *)gOut_Ref;
         float *r2 = (float *)gOut_Ref2;
         float *s = (float *)gIn;
 
-        if( skipNanInf )
+        if (skipNanInf)
         {
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
+            for (j = 0; j < bufferSize / sizeof(float); j++)
             {
                 double dd;
                 feclearexcept(FE_OVERFLOW);
 
                 if (relaxedMode)
-                    r[j] = (float) f->rfunc.f_fpf( s[j], &dd );
+                    r[j] = (float)f->rfunc.f_fpf(s[j], &dd);
                 else
-                    r[j] = (float) f->func.f_fpf( s[j], &dd );
+                    r[j] = (float)f->func.f_fpf(s[j], &dd);
 
-                r2[j] = (float) dd;
-                overflow[j] = FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
+                r2[j] = (float)dd;
+                overflow[j] =
+                    FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
             }
         }
         else
         {
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
+            for (j = 0; j < bufferSize / sizeof(float); j++)
             {
                 double dd;
                 if (relaxedMode)
                     r[j] = (float)f->rfunc.f_fpf(s[j], &dd);
                 else
-                  r[j] = (float) f->func.f_fpf( s[j], &dd );
+                    r[j] = (float)f->func.f_fpf(s[j], &dd);
 
-                r2[j] = (float) dd;
+                r2[j] = (float)dd;
             }
         }
 
-        if( isFract && ftz )
-            RestoreFPState( &oldMode );
+        if (isFract && ftz) RestoreFPState(&oldMode);
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray2 failed %d\n", error );
+                vlog_error("ReadArray2 failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
+        if (gSkipCorrectnessTesting)
         {
-            if (isFract && gIsInRTZMode)
-                (void)set_round(oldRoundMode, kfloat);
+            if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
             break;
         }
 
-        //Verify data
+        // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
         uint32_t *t2 = (uint32_t *)gOut_Ref2;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint32_t *q = (uint32_t *)gOut[k];
                 uint32_t *q2 = (uint32_t *)gOut2[k];
 
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] || t2[j] != q2[j]  )
+                if (t[j] != q[j] || t2[j] != q2[j])
                 {
                     double correct, correct2;
                     float err, err2;
-                    float test = ((float*) q)[j];
-                    float test2 = ((float*) q2)[j];
+                    float test = ((float *)q)[j];
+                    float test2 = ((float *)q2)[j];
 
                     if (relaxedMode)
                         correct = f->rfunc.f_fpf(s[j], &correct2);
                     else
-                      correct = f->func.f_fpf( s[j], &correct2 );
+                        correct = f->func.f_fpf(s[j], &correct2);
 
-                    // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
+                    // Per section 10 paragraph 6, accept any result if an input
+                    // or output is an infinity or NaN or overflow
                     if (relaxedMode || skipNanInf)
                     {
-                        if (skipNanInf && overflow[j])
-                            continue;
+                        if (skipNanInf && overflow[j]) continue;
 
-                        // Note: no double rounding here.  Reference functions calculate in single precision.
-                        if( IsFloatInfinity(correct) || IsFloatNaN(correct)     ||
-                            IsFloatInfinity(correct2)|| IsFloatNaN(correct2)    ||
-                            IsFloatInfinity(s[j])    || IsFloatNaN(s[j])        )
+                        // Note: no double rounding here.  Reference functions
+                        // calculate in single precision.
+                        if (IsFloatInfinity(correct) || IsFloatNaN(correct)
+                            || IsFloatInfinity(correct2) || IsFloatNaN(correct2)
+                            || IsFloatInfinity(s[j]) || IsFloatNaN(s[j]))
                             continue;
                     }
 
-                    typedef int (*CheckForSubnormal) (double,float); // If we are in fast relaxed math, we have a different calculation for the subnormal threshold.
+                    typedef int (*CheckForSubnormal)(
+                        double, float); // If we are in fast relaxed math, we
+                                        // have a different calculation for the
+                                        // subnormal threshold.
                     CheckForSubnormal isFloatResultSubnormalPtr;
                     if (relaxedMode)
                     {
-                      err = Abs_Error( test, correct);
-                      err2 = Abs_Error( test2, correct2);
-                      isFloatResultSubnormalPtr = &IsFloatResultSubnormalAbsError;
+                        err = Abs_Error(test, correct);
+                        err2 = Abs_Error(test2, correct2);
+                        isFloatResultSubnormalPtr =
+                            &IsFloatResultSubnormalAbsError;
                     }
                     else
                     {
-                        err = Ulp_Error( test, correct );
-                        err2 = Ulp_Error( test2, correct2 );
+                        err = Ulp_Error(test, correct);
+                        err2 = Ulp_Error(test2, correct2);
                         isFloatResultSubnormalPtr = &IsFloatResultSubnormal;
                     }
-                    int fail = ! (fabsf(err) <= float_ulps && fabsf(err2) <= float_ulps);
+                    int fail = !(fabsf(err) <= float_ulps
+                                 && fabsf(err2) <= float_ulps);
 
-                    if( ftz )
+                    if (ftz)
                     {
                         // retry per section 6.5.3.2
-                        if( (*isFloatResultSubnormalPtr)(correct, float_ulps) )
+                        if ((*isFloatResultSubnormalPtr)(correct, float_ulps))
                         {
-                            if( (*isFloatResultSubnormalPtr) (correct2, float_ulps ))
+                            if ((*isFloatResultSubnormalPtr)(correct2,
+                                                             float_ulps))
                             {
-                                fail = fail && ! ( test == 0.0f && test2 == 0.0f );
-                                if( ! fail )
+                                fail = fail && !(test == 0.0f && test2 == 0.0f);
+                                if (!fail)
                                 {
                                     err = 0.0f;
                                     err2 = 0.0f;
@@ -438,209 +514,251 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
                             }
                             else
                             {
-                                fail = fail && ! ( test == 0.0f && fabsf(err2) <= float_ulps);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && fabsf(err2) <= float_ulps);
+                                if (!fail) err = 0.0f;
                             }
                         }
-                        else if( (*isFloatResultSubnormalPtr)(correct2, float_ulps ) )
+                        else if ((*isFloatResultSubnormalPtr)(correct2,
+                                                              float_ulps))
                         {
-                            fail = fail && ! ( test2 == 0.0f && fabsf(err) <= float_ulps);
-                            if( ! fail )
-                                err2 = 0.0f;
+                            fail = fail
+                                && !(test2 == 0.0f && fabsf(err) <= float_ulps);
+                            if (!fail) err2 = 0.0f;
                         }
 
 
                         // retry per section 6.5.3.3
-                        if( IsFloatSubnormal( s[j] ) )
+                        if (IsFloatSubnormal(s[j]))
                         {
                             double correctp, correctn;
                             double correct2p, correct2n;
                             float errp, err2p, errn, err2n;
 
-                            if( skipNanInf )
-                                feclearexcept(FE_OVERFLOW);
+                            if (skipNanInf) feclearexcept(FE_OVERFLOW);
                             if (relaxedMode)
                             {
-                              correctp = f->rfunc.f_fpf( 0.0, &correct2p );
-                              correctn = f->rfunc.f_fpf( -0.0, &correct2n );
+                                correctp = f->rfunc.f_fpf(0.0, &correct2p);
+                                correctn = f->rfunc.f_fpf(-0.0, &correct2n);
                             }
                             else
                             {
-                              correctp = f->func.f_fpf( 0.0, &correct2p );
-                              correctn = f->func.f_fpf( -0.0, &correct2n );
+                                correctp = f->func.f_fpf(0.0, &correct2p);
+                                correctn = f->func.f_fpf(-0.0, &correct2n);
                             }
 
-                            // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
-                            if( skipNanInf )
+                            // Per section 10 paragraph 6, accept any result if
+                            // an input or output is an infinity or NaN or
+                            // overflow
+                            if (skipNanInf)
                             {
-                                if( fetestexcept(FE_OVERFLOW) )
-                                    continue;
+                                if (fetestexcept(FE_OVERFLOW)) continue;
 
-                                // Note: no double rounding here.  Reference functions calculate in single precision.
-                                if( IsFloatInfinity(correctp) || IsFloatNaN(correctp)   ||
-                                    IsFloatInfinity(correctn) || IsFloatNaN(correctn)   ||
-                                    IsFloatInfinity(correct2p) || IsFloatNaN(correct2p) ||
-                                    IsFloatInfinity(correct2n) || IsFloatNaN(correct2n) )
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsFloatInfinity(correctp)
+                                    || IsFloatNaN(correctp)
+                                    || IsFloatInfinity(correctn)
+                                    || IsFloatNaN(correctn)
+                                    || IsFloatInfinity(correct2p)
+                                    || IsFloatNaN(correct2p)
+                                    || IsFloatInfinity(correct2n)
+                                    || IsFloatNaN(correct2n))
                                     continue;
                             }
 
                             if (relaxedMode)
                             {
-                              errp = Abs_Error( test, correctp  );
-                              err2p = Abs_Error( test, correct2p  );
-                              errn = Abs_Error( test, correctn  );
-                              err2n = Abs_Error( test, correct2n  );
+                                errp = Abs_Error(test, correctp);
+                                err2p = Abs_Error(test, correct2p);
+                                errn = Abs_Error(test, correctn);
+                                err2n = Abs_Error(test, correct2n);
                             }
                             else
                             {
-                              errp = Ulp_Error( test, correctp  );
-                              err2p = Ulp_Error( test, correct2p  );
-                              errn = Ulp_Error( test, correctn  );
-                              err2n = Ulp_Error( test, correct2n  );
+                                errp = Ulp_Error(test, correctp);
+                                err2p = Ulp_Error(test, correct2p);
+                                errn = Ulp_Error(test, correctn);
+                                err2n = Ulp_Error(test, correct2n);
                             }
 
-                            fail =  fail && ((!(fabsf(errp) <= float_ulps)) && (!(fabsf(err2p) <= float_ulps))    &&
-                                            ((!(fabsf(errn) <= float_ulps)) && (!(fabsf(err2n) <= float_ulps))) );
-                            if( fabsf( errp ) < fabsf(err ) )
-                                err = errp;
-                            if( fabsf( errn ) < fabsf(err ) )
-                                err = errn;
-                            if( fabsf( err2p ) < fabsf(err2 ) )
-                                err2 = err2p;
-                            if( fabsf( err2n ) < fabsf(err2 ) )
-                                err2 = err2n;
+                            fail = fail
+                                && ((!(fabsf(errp) <= float_ulps))
+                                    && (!(fabsf(err2p) <= float_ulps))
+                                    && ((!(fabsf(errn) <= float_ulps))
+                                        && (!(fabsf(err2n) <= float_ulps))));
+                            if (fabsf(errp) < fabsf(err)) err = errp;
+                            if (fabsf(errn) < fabsf(err)) err = errn;
+                            if (fabsf(err2p) < fabsf(err2)) err2 = err2p;
+                            if (fabsf(err2n) < fabsf(err2)) err2 = err2n;
 
                             // retry per section 6.5.3.4
-                            if(  (*isFloatResultSubnormalPtr)( correctp, float_ulps ) || (*isFloatResultSubnormalPtr)( correctn, float_ulps )  )
+                            if ((*isFloatResultSubnormalPtr)(correctp,
+                                                             float_ulps)
+                                || (*isFloatResultSubnormalPtr)(correctn,
+                                                                float_ulps))
                             {
-                              if( (*isFloatResultSubnormalPtr)( correct2p, float_ulps ) || (*isFloatResultSubnormalPtr)( correct2n, float_ulps ) )
-                              {
-                                fail = fail && !( test == 0.0f && test2 == 0.0f);
-                                if( ! fail )
-                                  err = err2 = 0.0f;
-                              }
-                              else
-                              {
-                                fail = fail && ! (test == 0.0f && fabsf(err2) <= float_ulps);
-                                if( ! fail )
-                                  err = 0.0f;
-                              }
+                                if ((*isFloatResultSubnormalPtr)(correct2p,
+                                                                 float_ulps)
+                                    || (*isFloatResultSubnormalPtr)(correct2n,
+                                                                    float_ulps))
+                                {
+                                    fail = fail
+                                        && !(test == 0.0f && test2 == 0.0f);
+                                    if (!fail) err = err2 = 0.0f;
+                                }
+                                else
+                                {
+                                    fail = fail
+                                        && !(test == 0.0f
+                                             && fabsf(err2) <= float_ulps);
+                                    if (!fail) err = 0.0f;
+                                }
                             }
-                            else if( (*isFloatResultSubnormalPtr)( correct2p, float_ulps ) || (*isFloatResultSubnormalPtr)( correct2n, float_ulps ) )
+                            else if ((*isFloatResultSubnormalPtr)(correct2p,
+                                                                  float_ulps)
+                                     || (*isFloatResultSubnormalPtr)(
+                                         correct2n, float_ulps))
                             {
-                                fail = fail && ! (test2 == 0.0f && (fabsf(err) <= float_ulps));
-                                if( ! fail )
-                                    err2 = 0.0f;
+                                fail = fail
+                                    && !(test2 == 0.0f
+                                         && (fabsf(err) <= float_ulps));
+                                if (!fail) err2 = 0.0f;
                             }
                         }
                     }
-                    if( fabsf(err ) > maxError0 )
+                    if (fabsf(err) > maxError0)
                     {
                         maxError0 = fabsf(err);
                         maxErrorVal0 = s[j];
                     }
-                    if( fabsf(err2 ) > maxError1 )
+                    if (fabsf(err2) > maxError1)
                     {
                         maxError1 = fabsf(err2);
                         maxErrorVal1 = s[j];
                     }
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\nERROR: %s%s: {%f, %f} ulp error at %a: *{%a, %a} vs. {%a, %a}\n", f->name, sizeNames[k], err, err2, ((float*) gIn)[j], ((float*) gOut_Ref)[j], ((float*) gOut_Ref2)[j], test, test2 );
-                      error = -1;
-                      goto exit;
+                        vlog_error("\nERROR: %s%s: {%f, %f} ulp error at %a: "
+                                   "*{%a, %a} vs. {%a, %a}\n",
+                                   f->name, sizeNames[k], err, err2,
+                                   ((float *)gIn)[j], ((float *)gOut_Ref)[j],
+                                   ((float *)gOut_Ref2)[j], test, test2);
+                        error = -1;
+                        goto exit;
                     }
                 }
             }
         }
 
-        if (isFract && gIsInRTZMode)
-            (void)set_round(oldRoundMode, kfloat);
+        if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-           } else
-           {
-              vlog(".");
-           }
-           fflush(stdout);
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
             p[j] = genrand_int32(d);
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j]) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
 
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0, maxErrorVal1 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0,
+             maxErrorVal1);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -654,16 +772,16 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError0 = 0.0f;
     float maxError1 = 0.0f;
     int ftz = f->ftz || gForceFTZ;
     double maxErrorVal0 = 0.0f;
     double maxErrorVal1 = 0.0f;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
-    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( cl_double )) + 1);
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(cl_double)) + 1);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
@@ -672,135 +790,163 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
-                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
     {
         return error;
     }
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels +
+       i, programs + i) ) ) return error;
+    */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
-        if( gWimpyMode )
+        if (gWimpyMode)
         {
-            for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
-                p[j] = DoubleFromUInt32((uint32_t) i + j * scale);
+            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+                p[j] = DoubleFromUInt32((uint32_t)i + j * scale);
         }
         else
         {
-            for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
-                p[j] = DoubleFromUInt32((uint32_t) i + j);
+            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+                p[j] = DoubleFromUInt32((uint32_t)i + j);
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
 
             memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL)))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
+                                          bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg(kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         double *r = (double *)gOut_Ref;
         double *r2 = (double *)gOut_Ref2;
         double *s = (double *)gIn;
-        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
         {
             long double dd;
-            r[j] = (double) f->dfunc.f_fpf( s[j], &dd );
-            r2[j] = (double) dd;
+            r[j] = (double)f->dfunc.f_fpf(s[j], &dd);
+            r2[j] = (double)dd;
         }
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray2 failed %d\n", error );
+                vlog_error("ReadArray2 failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
-        //Verify data
+        // Verify data
         uint64_t *t = (uint64_t *)gOut_Ref;
         uint64_t *t2 = (uint64_t *)gOut_Ref2;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(double); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint64_t *q = (uint64_t *)(gOut[k]);
                 uint64_t *q2 = (uint64_t *)(gOut2[k]);
 
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] || t2[j] != q2[j]  )
+                if (t[j] != q[j] || t2[j] != q2[j])
                 {
-                    double test = ((double*) q)[j];
-                    double test2 = ((double*) q2)[j];
+                    double test = ((double *)q)[j];
+                    double test2 = ((double *)q2)[j];
                     long double correct2;
-                    long double correct = f->dfunc.f_fpf( s[j], &correct2 );
-                    float err = Bruteforce_Ulp_Error_Double( test, correct );
-                    float err2 = Bruteforce_Ulp_Error_Double( test2, correct2 );
-                    int fail = ! (fabsf(err) <= f->double_ulps && fabsf(err2) <= f->double_ulps);
-                    if( ftz )
+                    long double correct = f->dfunc.f_fpf(s[j], &correct2);
+                    float err = Bruteforce_Ulp_Error_Double(test, correct);
+                    float err2 = Bruteforce_Ulp_Error_Double(test2, correct2);
+                    int fail = !(fabsf(err) <= f->double_ulps
+                                 && fabsf(err2) <= f->double_ulps);
+                    if (ftz)
                     {
                         // retry per section 6.5.3.2
-                        if( IsDoubleResultSubnormal(correct, f->double_ulps ) )
+                        if (IsDoubleResultSubnormal(correct, f->double_ulps))
                         {
-                            if( IsDoubleResultSubnormal( correct2, f->double_ulps ) )
+                            if (IsDoubleResultSubnormal(correct2,
+                                                        f->double_ulps))
                             {
-                                fail = fail && ! ( test == 0.0f && test2 == 0.0f );
-                                if( ! fail )
+                                fail = fail && !(test == 0.0f && test2 == 0.0f);
+                                if (!fail)
                                 {
                                     err = 0.0f;
                                     err2 = 0.0f;
@@ -808,168 +954,214 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
                             }
                             else
                             {
-                                fail = fail && ! ( test == 0.0f && fabsf(err2) <= f->double_ulps);
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && fabsf(err2) <= f->double_ulps);
+                                if (!fail) err = 0.0f;
                             }
                         }
-                        else if( IsDoubleResultSubnormal( correct2, f->double_ulps ) )
+                        else if (IsDoubleResultSubnormal(correct2,
+                                                         f->double_ulps))
                         {
-                            fail = fail && ! ( test2 == 0.0f && fabsf(err) <= f->double_ulps);
-                            if( ! fail )
-                                err2 = 0.0f;
+                            fail = fail
+                                && !(test2 == 0.0f
+                                     && fabsf(err) <= f->double_ulps);
+                            if (!fail) err2 = 0.0f;
                         }
 
                         // retry per section 6.5.3.3
-                        if( IsDoubleSubnormal( s[j] ) )
+                        if (IsDoubleSubnormal(s[j]))
                         {
                             long double correct2p, correct2n;
-                            long double correctp = f->dfunc.f_fpf( 0.0, &correct2p );
-                            long double correctn = f->dfunc.f_fpf( -0.0, &correct2n );
-                            float errp = Bruteforce_Ulp_Error_Double( test, correctp  );
-                            float err2p = Bruteforce_Ulp_Error_Double( test, correct2p  );
-                            float errn = Bruteforce_Ulp_Error_Double( test, correctn  );
-                            float err2n = Bruteforce_Ulp_Error_Double( test, correct2n  );
-                            fail =  fail && ((!(fabsf(errp) <= f->double_ulps)) && (!(fabsf(err2p) <= f->double_ulps))    &&
-                                            ((!(fabsf(errn) <= f->double_ulps)) && (!(fabsf(err2n) <= f->double_ulps))) );
-                            if( fabsf( errp ) < fabsf(err ) )
-                                err = errp;
-                            if( fabsf( errn ) < fabsf(err ) )
-                                err = errn;
-                            if( fabsf( err2p ) < fabsf(err2 ) )
-                                err2 = err2p;
-                            if( fabsf( err2n ) < fabsf(err2 ) )
-                                err2 = err2n;
+                            long double correctp =
+                                f->dfunc.f_fpf(0.0, &correct2p);
+                            long double correctn =
+                                f->dfunc.f_fpf(-0.0, &correct2n);
+                            float errp =
+                                Bruteforce_Ulp_Error_Double(test, correctp);
+                            float err2p =
+                                Bruteforce_Ulp_Error_Double(test, correct2p);
+                            float errn =
+                                Bruteforce_Ulp_Error_Double(test, correctn);
+                            float err2n =
+                                Bruteforce_Ulp_Error_Double(test, correct2n);
+                            fail = fail
+                                && ((!(fabsf(errp) <= f->double_ulps))
+                                    && (!(fabsf(err2p) <= f->double_ulps))
+                                    && ((!(fabsf(errn) <= f->double_ulps))
+                                        && (!(fabsf(err2n)
+                                              <= f->double_ulps))));
+                            if (fabsf(errp) < fabsf(err)) err = errp;
+                            if (fabsf(errn) < fabsf(err)) err = errn;
+                            if (fabsf(err2p) < fabsf(err2)) err2 = err2p;
+                            if (fabsf(err2n) < fabsf(err2)) err2 = err2n;
 
                             // retry per section 6.5.3.4
-                            if( IsDoubleResultSubnormal( correctp, f->double_ulps ) || IsDoubleResultSubnormal( correctn, f->double_ulps ) )
+                            if (IsDoubleResultSubnormal(correctp,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correctn,
+                                                           f->double_ulps))
                             {
-                                if( IsDoubleResultSubnormal( correct2p, f->double_ulps ) || IsDoubleResultSubnormal( correct2n, f->double_ulps ) )
+                                if (IsDoubleResultSubnormal(correct2p,
+                                                            f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct2n,
+                                                               f->double_ulps))
                                 {
-                                    fail = fail && !( test == 0.0f && test2 == 0.0f);
-                                    if( ! fail )
-                                        err = err2 = 0.0f;
+                                    fail = fail
+                                        && !(test == 0.0f && test2 == 0.0f);
+                                    if (!fail) err = err2 = 0.0f;
                                 }
                                 else
                                 {
-                                    fail = fail && ! (test == 0.0f && fabsf(err2) <= f->double_ulps);
-                                    if( ! fail )
-                                        err = 0.0f;
+                                    fail = fail
+                                        && !(test == 0.0f
+                                             && fabsf(err2) <= f->double_ulps);
+                                    if (!fail) err = 0.0f;
                                 }
                             }
-                            else if( IsDoubleResultSubnormal( correct2p, f->double_ulps ) || IsDoubleResultSubnormal( correct2n, f->double_ulps ) )
+                            else if (IsDoubleResultSubnormal(correct2p,
+                                                             f->double_ulps)
+                                     || IsDoubleResultSubnormal(correct2n,
+                                                                f->double_ulps))
                             {
-                                fail = fail && ! (test2 == 0.0f && (fabsf(err) <= f->double_ulps));
-                                if( ! fail )
-                                    err2 = 0.0f;
+                                fail = fail
+                                    && !(test2 == 0.0f
+                                         && (fabsf(err) <= f->double_ulps));
+                                if (!fail) err2 = 0.0f;
                             }
                         }
                     }
-                    if( fabsf(err ) > maxError0 )
+                    if (fabsf(err) > maxError0)
                     {
                         maxError0 = fabsf(err);
                         maxErrorVal0 = s[j];
                     }
-                    if( fabsf(err2 ) > maxError1 )
+                    if (fabsf(err2) > maxError1)
                     {
                         maxError1 = fabsf(err2);
                         maxErrorVal1 = s[j];
                     }
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\nERROR: %sD%s: {%f, %f} ulp error at %.13la: *{%.13la, %.13la} vs. {%.13la, %.13la}\n", f->name, sizeNames[k], err, err2, ((double*) gIn)[j], ((double*) gOut_Ref)[j], ((double*) gOut_Ref2)[j], test, test2 );
-                      error = -1;
-                      goto exit;
+                        vlog_error(
+                            "\nERROR: %sD%s: {%f, %f} ulp error at %.13la: "
+                            "*{%.13la, %.13la} vs. {%.13la, %.13la}\n",
+                            f->name, sizeNames[k], err, err2,
+                            ((double *)gIn)[j], ((double *)gOut_Ref)[j],
+                            ((double *)gOut_Ref2)[j], test, test2);
+                        error = -1;
+                        goto exit;
                     }
                 }
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-           } else
-           {
-              vlog("." );
-           }
-           fflush(stdout);
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
-        double *p = (double*) gIn;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
-            p[j] = DoubleFromUInt32(genrand_int32(d) );
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        // Init input array
+        double *p = (double *)gIn;
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j]) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
 
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0, maxErrorVal1 );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0,
+             maxErrorVal1);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -977,6 +1169,3 @@ exit:
 
     return error;
 }
-
-
-
diff --git a/test_conformance/math_brute_force/unary_two_results_i.cpp b/test_conformance/math_brute_force/unary_two_results_i.cpp
index 15326882..c71de0ed 100644
--- a/test_conformance/math_brute_force/unary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -34,63 +34,82 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
-    const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global int", sizeNames[vectorSize], "* out2, __global float", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i], out2 + i );\n"
-                            "}\n"
-                        };
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global int* out2, __global float* in)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       float3 f0 = vload3( 0, in + 3 * i );\n"
-                            "       int3 iout = INT_MIN;\n"
-                            "       f0 = ", name, "( f0, &iout );\n"
-                            "       vstore3( f0, 0, out + 3*i );\n"
-                            "       vstore3( iout, 0, out2 + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       int3 iout = INT_MIN;\n"
-                            "       float3 f0;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       f0 = ", name, "( f0, &iout );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = f0.y; \n"
-                            "               out2[3*i+1] = iout.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = f0.x; \n"
-                            "               out2[3*i] = iout.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global int",
+                        sizeNames[vectorSize],
+                        "* out2, __global float",
+                        sizeNames[vectorSize],
+                        "* in)\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i], out2 + i );\n"
+                        "}\n" };
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global int* out2, __global float* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       int3 iout = INT_MIN;\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "       vstore3( iout, 0, out2 + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       int3 iout = INT_MIN;\n"
+        "       float3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               out2[3*i+1] = iout.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               out2[3*i] = iout.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
@@ -99,97 +118,120 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global int", sizeNames[vectorSize], "* out2, __global double", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i], out2 + i );\n"
-                            "}\n"
-                        };
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global int* out2, __global double* in)\n"
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global int",
+                        sizeNames[vectorSize],
+                        "* out2, __global double",
+                        sizeNames[vectorSize],
+                        "* in)\n"
                         "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   if( i + 1 < get_global_size(0) )\n"
-                        "   {\n"
-                        "       double3 f0 = vload3( 0, in + 3 * i );\n"
-                        "       int3 iout = INT_MIN;\n"
-                        "       f0 = ", name, "( f0, &iout );\n"
-                        "       vstore3( f0, 0, out + 3*i );\n"
-                        "       vstore3( iout, 0, out2 + 3*i );\n"
-                        "   }\n"
-                        "   else\n"
-                        "   {\n"
-                        "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                        "       int3 iout = INT_MIN;\n"
-                        "       double3 f0;\n"
-                        "       switch( parity )\n"
-                        "       {\n"
-                        "           case 1:\n"
-                        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
-                        "               break;\n"
-                        "           case 0:\n"
-                        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-                        "               break;\n"
-                        "       }\n"
-                        "       f0 = ", name, "( f0, &iout );\n"
-                        "       switch( parity )\n"
-                        "       {\n"
-                        "           case 0:\n"
-                        "               out[3*i+1] = f0.y; \n"
-                        "               out2[3*i+1] = iout.y; \n"
-                        "               // fall through\n"
-                        "           case 1:\n"
-                        "               out[3*i] = f0.x; \n"
-                        "               out2[3*i] = iout.x; \n"
-                        "               break;\n"
-                        "       }\n"
-                        "   }\n"
-                        "}\n"
-                    };
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i], out2 + i );\n"
+                        "}\n" };
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global int* out2, __global double* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 f0 = vload3( 0, in + 3 * i );\n"
+        "       int3 iout = INT_MIN;\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "       vstore3( iout, 0, out2 + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       int3 iout = INT_MIN;\n"
+        "       double3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               out2[3*i+1] = iout.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               out2[3*i] = iout.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_kernel   *kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels; // slot (offset + job_id) is handed to the builder
+    cl_program *programs; // indexed the same way as kernels
+    const char *nameInCode; // function name spliced into the kernel source
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p); // prototype; definition follows
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p) // ThreadPool_Do job callback
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p; // p is a BuildKernelInfo
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernels + i,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p); // prototype; definition follows
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p) // ThreadPool_Do job callback
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p; // p is a BuildKernelInfo
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
                              info->programs + i, info->relaxedMode);
 }
 
-cl_ulong  abs_cl_long( cl_long i );
-cl_ulong  abs_cl_long( cl_long i )
+cl_ulong abs_cl_long(cl_long i);
+cl_ulong abs_cl_long(cl_long i)
 {
     cl_long mask = i >> 63;
     return (i ^ mask) - mask;
@@ -200,22 +242,22 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int64_t maxError2 = 0;
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     float maxErrorVal = 0.0f;
     float maxErrorVal2 = 0.0f;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     float float_ulps;
     uint64_t step = getTestStep(sizeof(float), bufferSize);
-    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( float )) + 1);
-    cl_ulong  maxiError;
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1);
+    cl_ulong maxiError;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
-    if( gIsEmbedded )
+    if (gIsEmbedded)
         float_ulps = f->float_embedded_ulps;
     else
         float_ulps = f->float_ulps;
@@ -225,147 +267,179 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
         return error;
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i,
+       programs + i) ) ) return error;
+    */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
-        if( gWimpyMode )
+        if (gWimpyMode)
         {
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
-                p[j] = (uint32_t) i + j * scale;
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                p[j] = (uint32_t)i + j * scale;
         }
         else
         {
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
-                p[j] = (uint32_t) i + j;
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                p[j] = (uint32_t)i + j;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
 
             memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
+                                          bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         float *r = (float *)gOut_Ref;
         int *r2 = (int *)gOut_Ref2;
         float *s = (float *)gIn;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
-            r[j] = (float) f->func.f_fpI( s[j], r2+j );
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+            r[j] = (float)f->func.f_fpI(s[j], r2 + j);
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray2 failed %d\n", error );
+                vlog_error("ReadArray2 failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
-        //Verify data
+        // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
         int32_t *t2 = (int32_t *)gOut_Ref2;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint32_t *q = (uint32_t *)(gOut[k]);
                 int32_t *q2 = (int32_t *)(gOut2[k]);
 
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] || t2[j] != q2[j] )
+                if (t[j] != q[j] || t2[j] != q2[j])
                 {
-                    float test = ((float*) q)[j];
+                    float test = ((float *)q)[j];
                     int correct2 = INT_MIN;
-                    double correct = f->func.f_fpI( s[j], &correct2 );
-                    float err = Ulp_Error( test, correct );
-                    cl_long iErr = (int64_t) q2[j] - (int64_t) correct2;
-                    int fail = ! (fabsf(err) <= float_ulps && abs_cl_long( iErr ) <= maxiError );
-                    if( ftz )
+                    double correct = f->func.f_fpI(s[j], &correct2);
+                    float err = Ulp_Error(test, correct);
+                    cl_long iErr = (int64_t)q2[j] - (int64_t)correct2;
+                    int fail = !(fabsf(err) <= float_ulps
+                                 && abs_cl_long(iErr) <= maxiError);
+                    if (ftz)
                     {
                         // retry per section 6.5.3.2
-                        if( IsFloatResultSubnormal(correct, float_ulps ) )
+                        if (IsFloatResultSubnormal(correct, float_ulps))
                         {
-                            fail = fail && ! ( test == 0.0f && iErr == 0 );
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && !(test == 0.0f && iErr == 0);
+                            if (!fail) err = 0.0f;
                         }
 
                         // retry per section 6.5.3.3
-                        if( IsFloatSubnormal( s[j] ) )
+                        if (IsFloatSubnormal(s[j]))
                         {
                             int correct5, correct6;
-                            double correct3 = f->func.f_fpI( 0.0, &correct5 );
-                            double correct4 = f->func.f_fpI( -0.0, &correct6 );
-                            float err2 = Ulp_Error( test, correct3  );
-                            float err3 = Ulp_Error( test, correct4  );
-                            cl_long iErr2 = (long long) q2[j] - (long long) correct5;
-                            cl_long iErr3 = (long long) q2[j] - (long long) correct6;
+                            double correct3 = f->func.f_fpI(0.0, &correct5);
+                            double correct4 = f->func.f_fpI(-0.0, &correct6);
+                            float err2 = Ulp_Error(test, correct3);
+                            float err3 = Ulp_Error(test, correct4);
+                            cl_long iErr2 =
+                                (long long)q2[j] - (long long)correct5;
+                            cl_long iErr3 =
+                                (long long)q2[j] - (long long)correct6;
 
                             // Did +0 work?
-                            if( fabsf(err2) <= float_ulps && abs_cl_long( iErr2 ) <= maxiError )
+                            if (fabsf(err2) <= float_ulps
+                                && abs_cl_long(iErr2) <= maxiError)
                             {
                                 err = err2;
                                 iErr = iErr2;
                                 fail = 0;
                             }
                             // Did -0 work?
-                            else if(fabsf(err3) <= float_ulps && abs_cl_long( iErr3 ) <= maxiError)
+                            else if (fabsf(err3) <= float_ulps
+                                     && abs_cl_long(iErr3) <= maxiError)
                             {
                                 err = err3;
                                 iErr = iErr3;
@@ -373,10 +447,17 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
                             }
 
                             // retry per section 6.5.3.4
-                            if( fail && (IsFloatResultSubnormal(correct2, float_ulps ) || IsFloatResultSubnormal(correct3, float_ulps )) )
+                            if (fail
+                                && (IsFloatResultSubnormal(correct2, float_ulps)
+                                    || IsFloatResultSubnormal(correct3,
+                                                              float_ulps)))
                             {
-                                fail = fail && ! ( test == 0.0f && (abs_cl_long( iErr2 ) <= maxiError || abs_cl_long( iErr3 ) <= maxiError) );
-                                if( ! fail )
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && (abs_cl_long(iErr2) <= maxiError
+                                             || abs_cl_long(iErr3)
+                                                 <= maxiError));
+                                if (!fail)
                                 {
                                     err = 0.0f;
                                     iErr = 0;
@@ -384,20 +465,24 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
                             }
                         }
                     }
-                    if( fabsf(err ) > maxError )
+                    if (fabsf(err) > maxError)
                     {
                         maxError = fabsf(err);
                         maxErrorVal = s[j];
                     }
-                    if( llabs(iErr) > maxError2 )
+                    if (llabs(iErr) > maxError2)
                     {
-                        maxError2 = llabs(iErr );
+                        maxError2 = llabs(iErr);
                         maxErrorVal2 = s[j];
                     }
 
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\nERROR: %s%s: {%f, %d} ulp error at %a: *{%a, %d} vs. {%a, %d}\n", f->name, sizeNames[k], err, (int) iErr, ((float*) gIn)[j], ((float*) gOut_Ref)[j], ((int*) gOut_Ref2)[j], test, q2[j] );
+                        vlog_error("\nERROR: %s%s: {%f, %d} ulp error at %a: "
+                                   "*{%a, %d} vs. {%a, %d}\n",
+                                   f->name, sizeNames[k], err, (int)iErr,
+                                   ((float *)gIn)[j], ((float *)gOut_Ref)[j],
+                                   ((int *)gOut_Ref2)[j], test, q2[j]);
                         error = -1;
                         goto exit;
                     }
@@ -405,88 +490,109 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-           } else
-           {
-              vlog("." );
-           }
-           fflush(stdout);
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        for (j = 0; j < bufferSize / sizeof(float); j++)
             p[j] = genrand_int32(d);
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -500,18 +606,18 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int64_t maxError2 = 0;
     int ftz = f->ftz || gForceFTZ;
     double maxErrorVal = 0.0f;
     double maxErrorVal2 = 0.0f;
-    cl_ulong  maxiError = f->double_ulps == INFINITY ? CL_ULONG_MAX : 0;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    cl_ulong maxiError = f->double_ulps == INFINITY ? CL_ULONG_MAX : 0;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
 
     uint64_t step = getTestStep(sizeof(double), bufferSize);
-    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( double )) + 1);
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(double)) + 1);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
@@ -520,151 +626,185 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
-                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
     {
         return error;
     }
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels +
+       i, programs + i) ) ) return error;
+    */
 
-    for( i = 0; i < (1ULL<<32); i += step )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
-        if( gWimpyMode )
+        if (gWimpyMode)
         {
-            for( j = 0; j < bufferSize / sizeof( double ); j++ )
-                p[j] = DoubleFromUInt32((uint32_t) i + j * scale);
+            for (j = 0; j < bufferSize / sizeof(double); j++)
+                p[j] = DoubleFromUInt32((uint32_t)i + j * scale);
         }
         else
         {
-            for( j = 0; j < bufferSize / sizeof( double ); j++ )
-                p[j] = DoubleFromUInt32((uint32_t) i + j);
+            for (j = 0; j < bufferSize / sizeof(double); j++)
+                p[j] = DoubleFromUInt32((uint32_t)i + j);
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
 
             memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL) ))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
+                                          bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
             {
-                vlog_error( "FAILED -- could not execute kernel\n" );
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
+        // Calculate the correctly rounded reference result
         double *r = (double *)gOut_Ref;
         int *r2 = (int *)gOut_Ref2;
         double *s = (double *)gIn;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
-            r[j] = (double) f->dfunc.f_fpI( s[j], r2+j );
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+            r[j] = (double)f->dfunc.f_fpI(s[j], r2 + j);
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray2 failed %d\n", error );
+                vlog_error("ReadArray2 failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
-        //Verify data
+        // Verify data
         uint64_t *t = (uint64_t *)gOut_Ref;
         int32_t *t2 = (int32_t *)gOut_Ref2;
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(double); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint64_t *q = (uint64_t *)(gOut[k]);
                 int32_t *q2 = (int32_t *)(gOut2[k]);
 
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] || t2[j] != q2[j] )
+                if (t[j] != q[j] || t2[j] != q2[j])
                 {
-                    double test = ((double*) q)[j];
+                    double test = ((double *)q)[j];
                     int correct2 = INT_MIN;
-                    long double correct = f->dfunc.f_fpI( s[j], &correct2 );
-                    float err = Bruteforce_Ulp_Error_Double( test, correct );
-                    cl_long iErr = (long long) q2[j] - (long long) correct2;
-                    int fail = ! (fabsf(err) <= f->double_ulps && abs_cl_long( iErr ) <= maxiError );
-                    if( ftz )
+                    long double correct = f->dfunc.f_fpI(s[j], &correct2);
+                    float err = Bruteforce_Ulp_Error_Double(test, correct);
+                    cl_long iErr = (long long)q2[j] - (long long)correct2;
+                    int fail = !(fabsf(err) <= f->double_ulps
+                                 && abs_cl_long(iErr) <= maxiError);
+                    if (ftz)
                     {
                         // retry per section 6.5.3.2
-                        if( IsDoubleResultSubnormal(correct, f->double_ulps ) )
+                        if (IsDoubleResultSubnormal(correct, f->double_ulps))
                         {
-                            fail = fail && ! ( test == 0.0f && iErr == 0 );
-                            if( ! fail )
-                                err = 0.0f;
+                            fail = fail && !(test == 0.0f && iErr == 0);
+                            if (!fail) err = 0.0f;
                         }
 
                         // retry per section 6.5.3.3
-                        if( IsDoubleSubnormal( s[j] ) )
+                        if (IsDoubleSubnormal(s[j]))
                         {
                             int correct5, correct6;
-                            long double correct3 = f->dfunc.f_fpI( 0.0, &correct5 );
-                            long double correct4 = f->dfunc.f_fpI( -0.0, &correct6 );
-                            float err2 = Bruteforce_Ulp_Error_Double( test, correct3  );
-                            float err3 = Bruteforce_Ulp_Error_Double( test, correct4  );
-                            cl_long iErr2 = (long long) q2[j] - (long long) correct5;
-                            cl_long iErr3 = (long long) q2[j] - (long long) correct6;
+                            long double correct3 =
+                                f->dfunc.f_fpI(0.0, &correct5);
+                            long double correct4 =
+                                f->dfunc.f_fpI(-0.0, &correct6);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct4);
+                            cl_long iErr2 =
+                                (long long)q2[j] - (long long)correct5;
+                            cl_long iErr3 =
+                                (long long)q2[j] - (long long)correct6;
 
                             // Did +0 work?
-                            if( fabsf(err2) <= f->double_ulps && abs_cl_long( iErr2 ) <= maxiError )
+                            if (fabsf(err2) <= f->double_ulps
+                                && abs_cl_long(iErr2) <= maxiError)
                             {
                                 err = err2;
                                 iErr = iErr2;
                                 fail = 0;
                             }
                             // Did -0 work?
-                            else if(fabsf(err3) <= f->double_ulps && abs_cl_long( iErr3 ) <= maxiError)
+                            else if (fabsf(err3) <= f->double_ulps
+                                     && abs_cl_long(iErr3) <= maxiError)
                             {
                                 err = err3;
                                 iErr = iErr3;
@@ -672,10 +812,18 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
                             }
 
                             // retry per section 6.5.3.4
-                            if( fail && (IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )) )
+                            if (fail
+                                && (IsDoubleResultSubnormal(correct2,
+                                                            f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct3,
+                                                               f->double_ulps)))
                             {
-                                fail = fail && ! ( test == 0.0f && (abs_cl_long( iErr2 ) <= maxiError || abs_cl_long( iErr3 ) <= maxiError) );
-                                if( ! fail )
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && (abs_cl_long(iErr2) <= maxiError
+                                             || abs_cl_long(iErr3)
+                                                 <= maxiError));
+                                if (!fail)
                                 {
                                     err = 0.0f;
                                     iErr = 0;
@@ -683,20 +831,24 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
                             }
                         }
                     }
-                    if( fabsf(err ) > maxError )
+                    if (fabsf(err) > maxError)
                     {
                         maxError = fabsf(err);
                         maxErrorVal = s[j];
                     }
-                    if( llabs(iErr) > maxError2 )
+                    if (llabs(iErr) > maxError2)
                     {
-                        maxError2 = llabs(iErr );
+                        maxError2 = llabs(iErr);
                         maxErrorVal2 = s[j];
                     }
 
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\nERROR: %sD%s: {%f, %d} ulp error at %.13la: *{%.13la, %d} vs. {%.13la, %d}\n", f->name, sizeNames[k], err, (int) iErr, ((double*) gIn)[j], ((double*) gOut_Ref)[j], ((int*) gOut_Ref2)[j], test, q2[j] );
+                        vlog_error("\nERROR: %sD%s: {%f, %d} ulp error at "
+                                   "%.13la: *{%.13la, %d} vs. {%.13la, %d}\n",
+                                   f->name, sizeNames[k], err, (int)iErr,
+                                   ((double *)gIn)[j], ((double *)gOut_Ref)[j],
+                                   ((int *)gOut_Ref2)[j], test, q2[j]);
                         error = -1;
                         goto exit;
                     }
@@ -704,91 +856,111 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-           } else
-           {
-              vlog("." );
-           }
-           fflush(stdout);
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
+        // Init input array
         double *p = (double *)gIn;
 
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
+        for (j = 0; j < bufferSize / sizeof(double); j++)
             p[j] = DoubleFromUInt32(genrand_int32(d));
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILED -- could not execute kernel\n" );
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sd%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sd%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting)
+        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -796,6 +968,3 @@ exit:
 
     return error;
 }
-
-
-
diff --git a/test_conformance/math_brute_force/unary_u.cpp b/test_conformance/math_brute_force/unary_u.cpp
index 97fd25f9..397ff877 100644
--- a/test_conformance/math_brute_force/unary_u.cpp
+++ b/test_conformance/math_brute_force/unary_u.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -33,61 +33,77 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
 static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                        cl_program *p, bool relaxedMode)
 {
-    const char *c[] = {
-                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global uint", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i] );\n"
-                            "}\n"
-                        };
-    const char *c3[] = {    "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global uint* in)\n"
-                            "{\n"
-                            "   size_t i = get_global_id(0);\n"
-                            "   if( i + 1 < get_global_size(0) )\n"
-                            "   {\n"
-                            "       uint3 u0 = vload3( 0, in + 3 * i );\n"
-                            "       float3 f0 = ", name, "( u0 );\n"
-                            "       vstore3( f0, 0, out + 3*i );\n"
-                            "   }\n"
-                            "   else\n"
-                            "   {\n"
-                            "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                            "       uint3 u0;\n"
-                            "       float3 f0;\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 1:\n"
-                            "               u0 = (uint3)( in[3*i], 0xdead, 0xdead ); \n"
-                            "               break;\n"
-                            "           case 0:\n"
-                            "               u0 = (uint3)( in[3*i], in[3*i+1], 0xdead ); \n"
-                            "               break;\n"
-                            "       }\n"
-                            "       f0 = ", name, "( u0 );\n"
-                            "       switch( parity )\n"
-                            "       {\n"
-                            "           case 0:\n"
-                            "               out[3*i+1] = f0.y; \n"
-                            "               // fall through\n"
-                            "           case 1:\n"
-                            "               out[3*i] = f0.x; \n"
-                            "               break;\n"
-                            "       }\n"
-                            "   }\n"
-                            "}\n"
-                        };
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global uint",
+                        sizeNames[vectorSize],
+                        "* in)\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global uint* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       uint3 u0 = vload3( 0, in + 3 * i );\n"
+        "       float3 f0 = ",
+        name,
+        "( u0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       uint3 u0;\n"
+        "       float3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               u0 = (uint3)( in[3*i], 0xdead, 0xdead ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               u0 = (uint3)( in[3*i], in[3*i+1], 0xdead ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( u0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3)
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
@@ -95,90 +111,110 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
 static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
-    const char *c[] = {
-                            "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global ulong", sizeNames[vectorSize], "* in)\n"
-                            "{\n"
-                            "   int i = get_global_id(0);\n"
-                            "   out[i] = ", name, "( in[i] );\n"
-                            "}\n"
-                        };
-
-    const char *c3[] = {    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global ulong* in)\n"
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", // source fragments used for every vector size except 3
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global ulong",
+                        sizeNames[vectorSize],
+                        "* in)\n"
                         "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   if( i + 1 < get_global_size(0) )\n"
-                        "   {\n"
-                        "       ulong3 u0 = vload3( 0, in + 3 * i );\n"
-                        "       double3 f0 = ", name, "( u0 );\n"
-                        "       vstore3( f0, 0, out + 3*i );\n"
-                        "   }\n"
-                        "   else\n"
-                        "   {\n"
-                        "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
-                        "       ulong3 u0;\n"
-                        "       switch( parity )\n"
-                        "       {\n"
-                        "           case 1:\n"
-                        "               u0 = (ulong3)( in[3*i], 0xdeaddeaddeaddeadUL, 0xdeaddeaddeaddeadUL ); \n"
-                        "               break;\n"
-                        "           case 0:\n"
-                        "               u0 = (ulong3)( in[3*i], in[3*i+1], 0xdeaddeaddeaddeadUL ); \n"
-                        "               break;\n"
-                        "       }\n"
-                        "       double3 f0 = ", name, "( u0 );\n"
-                        "       switch( parity )\n"
-                        "       {\n"
-                        "           case 0:\n"
-                        "               out[3*i+1] = f0.y; \n"
-                        "               // fall through\n"
-                        "           case 1:\n"
-                        "               out[3*i] = f0.x; \n"
-                        "               break;\n"
-                        "       }\n"
-                        "   }\n"
-                        "}\n"
-                    };
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name, // the function under test is spliced into the kernel text here
+                        "( in[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", // 3-element variant: scalar pointers with vload3/vstore3
+                         "__kernel void math_kernel",
+                         sizeNames[vectorSize],
+                         "( __global double* out, __global ulong* in)\n"
+                         "{\n"
+                         "   size_t i = get_global_id(0);\n"
+                         "   if( i + 1 < get_global_size(0) )\n" // every work-item except the last takes the full-vector path
+                         "   {\n"
+                         "       ulong3 u0 = vload3( 0, in + 3 * i );\n"
+                         "       double3 f0 = ",
+                         name,
+                         "( u0 );\n"
+                         "       vstore3( f0, 0, out + 3*i );\n"
+                         "   }\n"
+                         "   else\n"
+                         "   {\n"
+                         "       size_t parity = i & 1;   // Figure out how "
+                         "many elements are left over after BUFFER_SIZE % "
+                         "(3*sizeof(float)). Assume power of two buffer size \n" // NOTE(review): kernel text says sizeof(float) but this kernel handles double/ulong
+                         "       ulong3 u0;\n"
+                         "       switch( parity )\n"
+                         "       {\n"
+                         "           case 1:\n"
+                         "               u0 = (ulong3)( in[3*i], "
+                         "0xdeaddeaddeaddeadUL, 0xdeaddeaddeaddeadUL ); \n"
+                         "               break;\n"
+                         "           case 0:\n"
+                         "               u0 = (ulong3)( in[3*i], in[3*i+1], "
+                         "0xdeaddeaddeaddeadUL ); \n"
+                         "               break;\n"
+                         "       }\n"
+                         "       double3 f0 = ",
+                         name,
+                         "( u0 );\n"
+                         "       switch( parity )\n"
+                         "       {\n"
+                         "           case 0:\n"
+                         "               out[3*i+1] = f0.y; \n"
+                         "               // fall through\n"
+                         "           case 1:\n"
+                         "               out[3*i] = f0.x; \n"
+                         "               break;\n"
+                         "       }\n"
+                         "   }\n"
+                         "}\n" };
 
     const char **kern = c;
-    size_t kernSize = sizeof(c)/sizeof(c[0]);
+    size_t kernSize = sizeof(c) / sizeof(c[0]); // count of source fragments, not bytes
 
-    if( sizeValues[vectorSize] == 3 )
+    if (sizeValues[vectorSize] == 3) // only the 3-element size switches to the c3 fragments
     {
         kern = c3;
-        kernSize = sizeof(c3)/sizeof(c3[0]);
+        kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
 
     char testName[32];
-    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s", // NOTE(review): snprintf already NUL-terminates within the size it is given, so the "- 1" is redundant
+             sizeNames[vectorSize]); // same "math_kernel" + sizeNames[vectorSize] name as the __kernel emitted above
 
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
 typedef struct BuildKernelInfo
 {
-    cl_uint     offset;            // the first vector size to build
-    cl_kernel   *kernels;
-    cl_program  *programs;
-    const char  *nameInCode;
+    cl_uint offset; // index of the first vector size to build; job_id is added to it
+    cl_kernel *kernels; // base of the kernel array; element offset + job_id is passed to the builder
+    cl_program *programs; // base of the program array, indexed the same way as kernels
+    const char *nameInCode; // function name forwarded to BuildKernel/BuildKernelDouble as `name`
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-}BuildKernelInfo;
+} BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p); // forward declaration of the definition that follows
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p) // callback handed to ThreadPool_Do; builds the float kernel for one vector size
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p; // p is the BuildKernelInfo the caller passed to ThreadPool_Do
     cl_uint i = info->offset + job_id;
     return BuildKernel(info->nameInCode, i, info->kernels + i,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
-static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p);
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
 {
-    BuildKernelInfo *info = (BuildKernelInfo*) p;
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
     cl_uint i = info->offset + job_id;
     return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
                              info->programs + i, info->relaxedMode);
@@ -189,22 +225,22 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     float maxErrorVal = 0.0f;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
 
     uint64_t step = getTestStep(sizeof(float), bufferSize);
-    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( double )) + 1);
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(double)) + 1);
     int isRangeLimited = 0;
     float float_ulps;
     float half_sin_cos_tan_limit = 0;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
-    if( gIsEmbedded)
+    if (gIsEmbedded)
         float_ulps = f->float_embedded_ulps;
     else
         float_ulps = f->float_ulps;
@@ -212,240 +248,282 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
         return error;
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernel( f->nameInCode, (int) i, kernels + i,
+       programs + i) ) ) return error;
+    */
 
-    if( 0 == strcmp( f->name, "half_sin") || 0 == strcmp( f->name, "half_cos") )
+    if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
     {
         isRangeLimited = 1;
-        half_sin_cos_tan_limit = 1.0f + float_ulps * (FLT_EPSILON/2.0f);             // out of range results from finite inputs must be in [-1,1]
+        half_sin_cos_tan_limit = 1.0f
+            + float_ulps
+                * (FLT_EPSILON / 2.0f); // out of range results from finite
+                                        // inputs must be in [-1,1]
     }
-    else if( 0 == strcmp( f->name, "half_tan"))
+    else if (0 == strcmp(f->name, "half_tan"))
     {
         isRangeLimited = 1;
-        half_sin_cos_tan_limit = INFINITY;             // out of range resut from finite inputs must be numeric
+        half_sin_cos_tan_limit =
+            INFINITY; // out of range result from finite inputs must be numeric
     }
 
 
-    for( i = 0; i < (1ULL<<32); i += step  )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         uint32_t *p = (uint32_t *)gIn;
-        if( gWimpyMode )
+        if (gWimpyMode)
         {
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
-                p[j] = (uint32_t) i + j * scale;
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                p[j] = (uint32_t)i + j * scale;
         }
         else
         {
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
-                p[j] = (uint32_t) i + j;
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                p[j] = (uint32_t)i + j;
         }
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL)))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL)))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ))){ LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)))
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
             {
-                vlog_error( "FAILURE -- could not execute kernel\n" );
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILURE -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
-        float *r = (float*) gOut_Ref;
-        cl_uint *s = (cl_uint*) gIn;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
-            r[j] = (float) f->func.f_u( s[j] );
+        // Calculate the correctly rounded reference result
+        float *r = (float *)gOut_Ref;
+        cl_uint *s = (cl_uint *)gIn;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+            r[j] = (float)f->func.f_u(s[j]);
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)))
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
 
-        //Verify data
-        uint32_t *t = (uint32_t*) gOut_Ref;
-        for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        // Verify data
+        uint32_t *t = (uint32_t *)gOut_Ref;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
-                uint32_t *q = (uint32_t*)(gOut[k]);
+                uint32_t *q = (uint32_t *)(gOut[k]);
 
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] )
+                if (t[j] != q[j])
                 {
-                    float test = ((float*) q)[j];
-                    double correct = f->func.f_u( s[j] );
-                    float err = Ulp_Error( test, correct );
-                    int fail = ! (fabsf(err) <= float_ulps);
+                    float test = ((float *)q)[j];
+                    double correct = f->func.f_u(s[j]);
+                    float err = Ulp_Error(test, correct);
+                    int fail = !(fabsf(err) <= float_ulps);
 
                     // half_sin/cos/tan are only valid between +-2**16, Inf, NaN
-                    if( isRangeLimited && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) && fabsf(s[j]) < INFINITY )
+                    if (isRangeLimited
+                        && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16)
+                        && fabsf(s[j]) < INFINITY)
                     {
-                        if( fabsf( test ) <= half_sin_cos_tan_limit )
+                        if (fabsf(test) <= half_sin_cos_tan_limit)
                         {
                             err = 0;
                             fail = 0;
                         }
                     }
 
-                     if( fail )
+                    if (fail)
                     {
-                        if( ftz )
+                        if (ftz)
                         {
                             // retry per section 6.5.3.2
-                            if( IsFloatResultSubnormal(correct, float_ulps) )
+                            if (IsFloatResultSubnormal(correct, float_ulps))
                             {
-                                fail = fail && ( test != 0.0f );
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
                             }
                         }
                     }
-                    if( fabsf(err ) > maxError )
+                    if (fabsf(err) > maxError)
                     {
                         maxError = fabsf(err);
                         maxErrorVal = s[j];
                     }
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\n%s%s: %f ulp error at 0x%8.8x: *%a vs. %a\n", f->name, sizeNames[k], err, ((uint32_t*) gIn)[j], ((float*) gOut_Ref)[j], test );
-                      error = -1;
+                        vlog_error(
+                            "\n%s%s: %f ulp error at 0x%8.8x: *%a vs. %a\n",
+                            f->name, sizeNames[k], err, ((uint32_t *)gIn)[j],
+                            ((float *)gOut_Ref)[j], test);
+                        error = -1;
                         goto exit;
                     }
                 }
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-           } else
-           {
-              vlog("." );
-           }
-           fflush(stdout);
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
         }
     }
 
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
-        uint32_t *p = (uint32_t*)gIn;
-        if( strstr( f->name, "exp" ) || strstr( f->name, "sin" ) || strstr( f->name, "cos" ) || strstr( f->name, "tan" ) )
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
-                ((float*)p)[j] = (float) genrand_real1(d);
-        else if( strstr( f->name, "log" ) )
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
+        // Init input array
+        uint32_t *p = (uint32_t *)gIn;
+        if (strstr(f->name, "exp") || strstr(f->name, "sin")
+            || strstr(f->name, "cos") || strstr(f->name, "tan"))
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                ((float *)p)[j] = (float)genrand_real1(d);
+        else if (strstr(f->name, "log"))
+            for (j = 0; j < bufferSize / sizeof(float); j++)
                 p[j] = genrand_int32(d) & 0x7fffffff;
         else
-            for( j = 0; j < bufferSize / sizeof( float ); j++ )
+            for (j = 0; j < bufferSize / sizeof(float); j++)
                 p[j] = genrand_int32(d);
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILURE -- could not execute kernel\n" );
+                    vlog_error("FAILURE -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ %a", maxError, maxErrorVal );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -454,9 +532,9 @@ exit:
     return error;
 }
 
-static cl_ulong random64( MTdata d )
+static cl_ulong random64(MTdata d) // ORs one genrand_int32 result with a second one shifted left by 32
 {
-    return (cl_ulong) genrand_int32(d) | ((cl_ulong) genrand_int32(d) << 32);
+    return (cl_ulong)genrand_int32(d) | ((cl_ulong)genrand_int32(d) << 32); // NOTE(review): the order of the two calls is unspecified, so which draw becomes the high half may vary by compiler
 }
 
 int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
@@ -464,12 +542,12 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
-    cl_program programs[ VECTOR_SIZE_COUNT ];
-    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
     int ftz = f->ftz || gForceFTZ;
     double maxErrorVal = 0.0f;
-    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
@@ -479,211 +557,243 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
-    if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
-                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                &build_info ) ))
+    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
     {
         return error;
     }
-/*
-    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
-        if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) )
-            return error;
-*/
+    /*
+        for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
+            if( (error =  BuildKernelDouble( f->nameInCode, (int) i, kernels +
+       i, programs + i) ) ) return error;
+    */
 
-    for( i = 0; i < (1ULL<<32); i += step  )
+    for (i = 0; i < (1ULL << 32); i += step)
     {
-        //Init input array
+        // Init input array
         cl_ulong *p = (cl_ulong *)gIn;
-        for( j = 0; j < bufferSize / sizeof( cl_ulong ); j++ )
-            p[j] = random64(d);
+        for (j = 0; j < bufferSize / sizeof(cl_ulong); j++) p[j] = random64(d);
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL)))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
         // write garbage into output arrays
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
-            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL)))
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
                 goto exit;
             }
         }
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ))){ LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
-
-            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)))
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
             {
-                vlog_error( "FAILURE -- could not execute kernel\n" );
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILURE -- could not execute kernel\n");
                 goto exit;
             }
         }
 
         // Get that moving
-        if( (error = clFlush(gQueue) ))
-            vlog( "clFlush failed\n" );
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
-        //Calculate the correctly rounded reference result
-        double *r = (double*) gOut_Ref;
-        cl_ulong *s = (cl_ulong*) gIn;
-        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
-            r[j] = (double) f->dfunc.f_u( s[j] );
+        // Calculate the correctly rounded reference result
+        double *r = (double *)gOut_Ref;
+        cl_ulong *s = (cl_ulong *)gIn;
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+            r[j] = (double)f->dfunc.f_u(s[j]);
 
         // Read the data back
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)))
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
-                vlog_error( "ReadArray failed %d\n", error );
+                vlog_error("ReadArray failed %d\n", error);
                 goto exit;
             }
         }
 
-        if( gSkipCorrectnessTesting )
-            break;
+        if (gSkipCorrectnessTesting) break;
 
 
-        //Verify data
-        uint64_t *t = (uint64_t*) gOut_Ref;
-        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
+        // Verify data
+        uint64_t *t = (uint64_t *)gOut_Ref;
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
         {
-            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
-                uint64_t *q = (uint64_t*)(gOut[k]);
+                uint64_t *q = (uint64_t *)(gOut[k]);
 
                 // If we aren't getting the correctly rounded result
-                if( t[j] != q[j] )
+                if (t[j] != q[j])
                 {
-                    double test = ((double*) q)[j];
-                    long double correct = f->dfunc.f_u( s[j] );
+                    double test = ((double *)q)[j];
+                    long double correct = f->dfunc.f_u(s[j]);
                     float err = Bruteforce_Ulp_Error_Double(test, correct);
-                    int fail = ! (fabsf(err) <= f->double_ulps);
+                    int fail = !(fabsf(err) <= f->double_ulps);
 
                     // half_sin/cos/tan are only valid between +-2**16, Inf, NaN
-                    if( fail )
+                    if (fail)
                     {
-                        if( ftz )
+                        if (ftz)
                         {
                             // retry per section 6.5.3.2
-                            if( IsDoubleResultSubnormal(correct, f->double_ulps) )
+                            if (IsDoubleResultSubnormal(correct,
+                                                        f->double_ulps))
                             {
-                                fail = fail && ( test != 0.0 );
-                                if( ! fail )
-                                    err = 0.0f;
+                                fail = fail && (test != 0.0);
+                                if (!fail) err = 0.0f;
                             }
                         }
                     }
-                    if( fabsf(err ) > maxError )
+                    if (fabsf(err) > maxError)
                     {
                         maxError = fabsf(err);
                         maxErrorVal = s[j];
                     }
-                    if( fail )
+                    if (fail)
                     {
-                        vlog_error( "\n%s%sD: %f ulp error at 0x%16.16llx: *%.13la vs. %.13la\n", f->name, sizeNames[k], err, ((uint64_t*) gIn)[j], ((double*) gOut_Ref)[j], test );
-                      error = -1;
+                        vlog_error("\n%s%sD: %f ulp error at 0x%16.16llx: "
+                                   "*%.13la vs. %.13la\n",
+                                   f->name, sizeNames[k], err,
+                                   ((uint64_t *)gIn)[j],
+                                   ((double *)gOut_Ref)[j], test);
+                        error = -1;
                         goto exit;
                     }
                 }
             }
         }
 
-        if( 0 == (i & 0x0fffffff) )
+        if (0 == (i & 0x0fffffff))
         {
-           if (gVerboseBruteForce)
-           {
-               vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step, bufferSize);
-           } else
-           {
-              vlog("." );
-           }
-           fflush(stdout);
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
         }
     }
 
-    if( ! gSkipCorrectnessTesting )
+    if (!gSkipCorrectnessTesting)
     {
-        if( gWimpyMode )
-            vlog( "Wimp pass" );
+        if (gWimpyMode)
+            vlog("Wimp pass");
         else
-            vlog( "passed" );
+            vlog("passed");
     }
 
-    if( gMeasureTimes )
+    if (gMeasureTimes)
     {
-        //Init input array
-        double *p = (double*) gIn;
+        // Init input array
+        double *p = (double *)gIn;
 
-        for( j = 0; j < bufferSize / sizeof( double ); j++ )
-            p[j] = random64(d);
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
+        for (j = 0; j < bufferSize / sizeof(double); j++) p[j] = random64(d);
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
-            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
 
 
         // Run the kernels
-        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_double);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
-            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
 
             double sum = 0.0;
             double bestTime = INFINITY;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
                 {
-                    vlog_error( "FAILURE -- could not execute kernel\n" );
+                    vlog_error("FAILURE -- could not execute kernel\n");
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
-            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
         }
-        for( ; j < gMaxVectorSizeIndex; j++ )
-            vlog( "\t     -- " );
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
     }
 
-    if( ! gSkipCorrectnessTesting )
-        vlog( "\t%8.2f @ %a", maxError, maxErrorVal );
-    vlog( "\n" );
+    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
+    vlog("\n");
 
 exit:
     // Release
-    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
     {
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
@@ -691,4 +801,3 @@ exit:
 
     return error;
 }
-