From ea49084976274951d6108dc54638981837f7cce1 Mon Sep 17 00:00:00 2001
From: Steven Winston <gpx1000@users.noreply.github.com>
Date: Tue, 24 Jan 2023 08:53:18 -0800
Subject: [PATCH] Conversions (#1555)

* grab latest from upstream OpenCL

* Removed events for host to device data transfers

* grab latest from upstream OpenCL

* 1.) revert changes to CMakeLists.txt and run_batch script in hopes this will solve the CI issues for the PR.
2.) resolve the merge conflict in test_conversions.cpp

* 1.) resolve 2 additional merge conflicts

* 1.) resolve 1 additional merge conflicts

* locally this fails clang-format but CI version seems to require it.

* fix the warning.

* Remove now-unused event

Also, cut down a comment that is no longer accurate.

Co-authored-by: Joshua Luceno <joshualuceno@gmail.com>
Co-authored-by: Chip Davis <chip@holochip.com>
Co-authored-by: Chip Davis <cdavis5x@gmail.com>
---
 .../conversions/test_conversions.cpp          | 1422 +++++++++--------
 1 file changed, 753 insertions(+), 669 deletions(-)

diff --git a/test_conformance/conversions/test_conversions.cpp b/test_conformance/conversions/test_conversions.cpp
index f7a21f21..63940455 100644
--- a/test_conformance/conversions/test_conversions.cpp
+++ b/test_conformance/conversions/test_conversions.cpp
@@ -23,7 +23,7 @@
 #include <sys/sysctl.h>
 #endif
 
-#if defined( __linux__ )
+#if defined(__linux__)
 #include <unistd.h>
 #include <sys/syscall.h>
 #include <linux/sysctl.h>
@@ -53,7 +53,7 @@
 #include "Sleep.h"
 #include "basic_test_conversions.h"
 
-#if (defined(_WIN32) && defined (_MSC_VER))
+#if (defined(_WIN32) && defined(_MSC_VER))
 // need for _controlfp_s and rouinding modes in RoundingMode
 #include "harness/testHarness.h"
 #endif
@@ -61,68 +61,73 @@
 #pragma mark -
 #pragma mark globals
 
-#define BUFFER_SIZE     (1024*1024)
-#define kPageSize       4096
+#define BUFFER_SIZE (1024 * 1024)
+#define kPageSize 4096
 #define EMBEDDED_REDUCTION_FACTOR 16
 #define PERF_LOOP_COUNT 100
 
-#define      kCallStyleCount (kVectorSizeCount + 1 /* for implicit scalar */)
+#define kCallStyleCount (kVectorSizeCount + 1 /* for implicit scalar */)
 
 #if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
 #include "fplib.h"
-    extern bool            qcom_sat;
-    extern roundingMode    qcom_rm;
+extern bool qcom_sat;
+extern roundingMode qcom_rm;
 #endif
 
-const char **   argList = NULL;
-int             argCount = 0;
-cl_context      gContext = NULL;
-cl_command_queue      gQueue = NULL;
-char            appName[64] = "ctest";
-int             gStartTestNumber = -1;
-int             gEndTestNumber = 0;
-#if defined( __APPLE__ )
-int             gTimeResults = 1;
+const char **argList = NULL;
+int argCount = 0;
+cl_context gContext = NULL;
+cl_command_queue gQueue = NULL;
+char appName[64] = "ctest";
+int gStartTestNumber = -1;
+int gEndTestNumber = 0;
+#if defined(__APPLE__)
+int gTimeResults = 1;
 #else
-int             gTimeResults = 0;
+int gTimeResults = 0;
 #endif
-int             gReportAverageTimes = 0;
-void            *gIn = NULL;
-void            *gRef = NULL;
-void        *gAllowZ = NULL;
-void            *gOut[ kCallStyleCount ] = { NULL };
-cl_mem          gInBuffer;
-cl_mem          gOutBuffers[ kCallStyleCount ];
-size_t          gComputeDevices = 0;
-uint32_t        gDeviceFrequency = 0;
-int             gWimpyMode = 0;
-int             gWimpyReductionFactor = 128;
-int             gSkipTesting = 0;
-int             gForceFTZ = 0;
-int             gMultithread = 1;
-int             gIsRTZ = 0;
-uint32_t        gSimdSize = 1;
-int             gHasDouble = 0;
-int             gTestDouble = 1;
-const char *    sizeNames[] = { "", "", "2", "3", "4", "8", "16" };
-const int       vectorSizes[] = { 1, 1, 2, 3, 4, 8, 16 };
-int             gMinVectorSize = 0;
-int             gMaxVectorSize = sizeof(vectorSizes) / sizeof( vectorSizes[0] );
-static MTdata   gMTdata;
+int gReportAverageTimes = 0;
+void *gIn = NULL;
+void *gRef = NULL;
+void *gAllowZ = NULL;
+void *gOut[kCallStyleCount] = { NULL };
+cl_mem gInBuffer;
+cl_mem gOutBuffers[kCallStyleCount];
+size_t gComputeDevices = 0;
+uint32_t gDeviceFrequency = 0;
+int gWimpyMode = 0;
+int gWimpyReductionFactor = 128;
+int gSkipTesting = 0;
+int gForceFTZ = 0;
+int gMultithread = 1;
+int gIsRTZ = 0;
+uint32_t gSimdSize = 1;
+int gHasDouble = 0;
+int gTestDouble = 1;
+const char *sizeNames[] = { "", "", "2", "3", "4", "8", "16" };
+const int vectorSizes[] = { 1, 1, 2, 3, 4, 8, 16 };
+int gMinVectorSize = 0;
+int gMaxVectorSize = sizeof(vectorSizes) / sizeof(vectorSizes[0]);
+static MTdata gMTdata;
 
 #pragma mark -
 #pragma mark Declarations
 
-static int ParseArgs( int argc, const char **argv );
-static void PrintUsage( void );
-test_status InitCL( cl_device_id device );
-static int GetTestCase( const char *name, Type *outType, Type *inType, SaturationMode *sat, RoundingMode *round );
-static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMode sat, RoundingMode round, MTdata d );
-static cl_program   MakeProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, cl_kernel *outKernel );
-static int RunKernel( cl_kernel kernel, void *inBuf, void *outBuf, size_t blockCount );
+static int ParseArgs(int argc, const char **argv);
+static void PrintUsage(void);
+test_status InitCL(cl_device_id device);
+static int GetTestCase(const char *name, Type *outType, Type *inType,
+                       SaturationMode *sat, RoundingMode *round);
+static int DoTest(cl_device_id device, Type outType, Type inType,
+                  SaturationMode sat, RoundingMode round, MTdata d);
+static cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
+                              RoundingMode round, int vectorSize,
+                              cl_kernel *outKernel);
+static int RunKernel(cl_kernel kernel, void *inBuf, void *outBuf,
+                     size_t blockCount);
 
-void *FlushToZero( void );
-void UnFlushToZero( void *);
+void *FlushToZero(void);
+void UnFlushToZero(void *);
 
 // Windows (since long double got deprecated) sets the x87 to 53-bit precision
 // (that's x87 default state).  This causes problems with the tests that
@@ -139,15 +144,16 @@ static inline void Force64BitFPUPrecision(void)
     // divergent code just use inline assembly which works for both.
     unsigned short int orig_cw = 0;
     unsigned short int new_cw = 0;
-    __asm__ __volatile__ ("fstcw %0":"=m" (orig_cw));
-    new_cw = orig_cw | 0x0300;   // set precision to 64-bit
-    __asm__ __volatile__ ("fldcw  %0"::"m" (new_cw));
+    __asm__ __volatile__("fstcw %0" : "=m"(orig_cw));
+    new_cw = orig_cw | 0x0300; // set precision to 64-bit
+    __asm__ __volatile__("fldcw  %0" ::"m"(new_cw));
 #else
     /* Implement for other platforms if needed */
 #endif
 }
 
-int test_conversions( cl_device_id device, cl_context context, cl_command_queue queue, int num_elements )
+int test_conversions(cl_device_id device, cl_context context,
+                     cl_command_queue queue, int num_elements)
 {
     int error, i, testNumber = -1;
     int startMinVectorSize = gMinVectorSize;
@@ -155,109 +161,148 @@ int test_conversions( cl_device_id device, cl_context context, cl_command_queue
     RoundingMode round;
     SaturationMode sat;
 
-    if( argCount )
+    if (argCount)
     {
-        for( i = 0; i < argCount; i++ )
+        for (i = 0; i < argCount; i++)
         {
-            if( GetTestCase( argList[i], &outType, &inType, &sat, &round ) )
+            if (GetTestCase(argList[i], &outType, &inType, &sat, &round))
             {
-                vlog_error( "\n\t\t**** ERROR:  Unable to parse function name %s.  Skipping....  *****\n\n", argList[i] );
+                vlog_error("\n\t\t**** ERROR:  Unable to parse function name "
+                           "%s.  Skipping....  *****\n\n",
+                           argList[i]);
                 continue;
             }
 
             // skip double if we don't have it
-            if( !gTestDouble && (inType == kdouble || outType == kdouble ) )
+            if (!gTestDouble && (inType == kdouble || outType == kdouble))
             {
-                if( gHasDouble )
+                if (gHasDouble)
                 {
-                    vlog_error( "\t *** convert_%sn%s%s( %sn ) FAILED ** \n", gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
-                    vlog( "\t\tcl_khr_fp64 enabled, but double testing turned off.\n" );
+                    vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
+                               gTypeNames[outType], gSaturationNames[sat],
+                               gRoundingModeNames[round], gTypeNames[inType]);
+                    vlog("\t\tcl_khr_fp64 enabled, but double testing turned "
+                         "off.\n");
                 }
 
                 continue;
             }
 
             // skip longs on embedded
-            if( !gHasLong && (inType == klong || outType == klong || inType == kulong || outType == kulong) )
+            if (!gHasLong
+                && (inType == klong || outType == klong || inType == kulong
+                    || outType == kulong))
             {
                 continue;
             }
 
-            // Skip the implicit converts if the rounding mode is not default or test is saturated
-            if( 0 == startMinVectorSize )
+            // Skip the implicit converts if the rounding mode is not default or
+            // test is saturated
+            if (0 == startMinVectorSize)
             {
-                if( sat || round != kDefaultRoundingMode )
+                if (sat || round != kDefaultRoundingMode)
                     gMinVectorSize = 1;
                 else
                     gMinVectorSize = 0;
             }
 
-            if( ( error = DoTest( device, outType, inType, sat, round, gMTdata ) ) )
+            if ((error = DoTest(device, outType, inType, sat, round, gMTdata)))
             {
-                vlog_error( "\t *** convert_%sn%s%s( %sn ) FAILED ** \n", gTypeNames[outType], gSaturationNames[sat], gRoundingModeNames[round], gTypeNames[inType] );
+                vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
+                           gTypeNames[outType], gSaturationNames[sat],
+                           gRoundingModeNames[round], gTypeNames[inType]);
             }
         }
     }
     else
     {
-        for( outType = (Type)0; outType < kTypeCount; outType = (Type)(outType+1) )
+        for (outType = (Type)0; outType < kTypeCount;
+             outType = (Type)(outType + 1))
         {
-            for( inType = (Type)0; inType < kTypeCount; inType = (Type)(inType+1) )
+            for (inType = (Type)0; inType < kTypeCount;
+                 inType = (Type)(inType + 1))
             {
                 // skip longs on embedded
-                if( !gHasLong && (inType == klong || outType == klong || inType == kulong || outType == kulong) )
+                if (!gHasLong
+                    && (inType == klong || outType == klong || inType == kulong
+                        || outType == kulong))
                 {
                     continue;
                 }
 
-                for( sat = (SaturationMode)0; sat < kSaturationModeCount; sat = (SaturationMode)(sat+1) )
+                for (sat = (SaturationMode)0; sat < kSaturationModeCount;
+                     sat = (SaturationMode)(sat + 1))
                 {
-                    //skip illegal saturated conversions to float type
-                    if( kSaturated == sat && ( outType == kfloat || outType == kdouble ) )
+                    // skip illegal saturated conversions to float type
+                    if (kSaturated == sat
+                        && (outType == kfloat || outType == kdouble))
                     {
                         continue;
                     }
 
-                    for( round = (RoundingMode)0; round < kRoundingModeCount; round = (RoundingMode)(round+1) )
+                    for (round = (RoundingMode)0; round < kRoundingModeCount;
+                         round = (RoundingMode)(round + 1))
                     {
-                        if( ++testNumber < gStartTestNumber )
+                        if (++testNumber < gStartTestNumber)
                         {
-                            //     vlog( "%d) skipping convert_%sn%s%s( %sn )\n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
+                            //     vlog( "%d) skipping convert_%sn%s%s( %sn
+                            //     )\n", testNumber, gTypeNames[ outType ],
+                            //     gSaturationNames[ sat ],
+                            //     gRoundingModeNames[round], gTypeNames[inType]
+                            //     );
                             continue;
                         }
                         else
                         {
-                            if( gEndTestNumber > 0 && testNumber >= gEndTestNumber  )
+                            if (gEndTestNumber > 0
+                                && testNumber >= gEndTestNumber)
                             {
                                 goto exit;
                             }
                         }
 
-                        vlog( "%d) Testing convert_%sn%s%s( %sn ):\n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
+                        vlog("%d) Testing convert_%sn%s%s( %sn ):\n",
+                             testNumber, gTypeNames[outType],
+                             gSaturationNames[sat], gRoundingModeNames[round],
+                             gTypeNames[inType]);
 
                         // skip double if we don't have it
-                        if( ! gTestDouble && (inType == kdouble || outType == kdouble ) )
+                        if (!gTestDouble
+                            && (inType == kdouble || outType == kdouble))
                         {
-                            if( gHasDouble )
+                            if (gHasDouble)
                             {
-                                vlog_error( "\t *** %d) convert_%sn%s%s( %sn ) FAILED ** \n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
-                                vlog( "\t\tcl_khr_fp64 enabled, but double testing turned off.\n" );
+                                vlog_error("\t *** %d) convert_%sn%s%s( %sn ) "
+                                           "FAILED ** \n",
+                                           testNumber, gTypeNames[outType],
+                                           gSaturationNames[sat],
+                                           gRoundingModeNames[round],
+                                           gTypeNames[inType]);
+                                vlog("\t\tcl_khr_fp64 enabled, but double "
+                                     "testing turned off.\n");
                             }
                             continue;
                         }
 
-                        // Skip the implicit converts if the rounding mode is not default or test is saturated
-                        if( 0 == startMinVectorSize )
+                        // Skip the implicit converts if the rounding mode is
+                        // not default or test is saturated
+                        if (0 == startMinVectorSize)
                         {
-                            if( sat || round != kDefaultRoundingMode )
+                            if (sat || round != kDefaultRoundingMode)
                                 gMinVectorSize = 1;
                             else
                                 gMinVectorSize = 0;
                         }
 
-                        if( ( error = DoTest( device, outType, inType, sat, round, gMTdata ) ) )
+                        if ((error = DoTest(device, outType, inType, sat, round,
+                                            gMTdata)))
                         {
-                            vlog_error( "\t *** %d) convert_%sn%s%s( %sn ) FAILED ** \n", testNumber, gTypeNames[outType], gSaturationNames[sat], gRoundingModeNames[round], gTypeNames[inType] );
+                            vlog_error("\t *** %d) convert_%sn%s%s( %sn ) "
+                                       "FAILED ** \n",
+                                       testNumber, gTypeNames[outType],
+                                       gSaturationNames[sat],
+                                       gRoundingModeNames[round],
+                                       gTypeNames[inType]);
                         }
                     }
                 }
@@ -270,17 +315,17 @@ exit:
 }
 
 test_definition test_list[] = {
-    ADD_TEST( conversions ),
+    ADD_TEST(conversions),
 };
 
-const int test_num = ARRAY_SIZE( test_list );
+const int test_num = ARRAY_SIZE(test_list);
 
 #pragma mark -
 
-int main (int argc, const char **argv )
+int main(int argc, const char **argv)
 {
     int error;
-    cl_uint seed = (cl_uint) time( NULL );
+    cl_uint seed = (cl_uint)time(NULL);
 
     argc = parseCustomParam(argc, argv);
     if (argc == -1)
@@ -288,15 +333,13 @@ int main (int argc, const char **argv )
         return 1;
     }
 
-    if( (error = ParseArgs( argc, argv )) )
-        return error;
+    if ((error = ParseArgs(argc, argv))) return error;
 
-    //Turn off sleep so our tests run to completion
+    // Turn off sleep so our tests run to completion
     PreventSleep();
-    atexit( ResumeSleep );
+    atexit(ResumeSleep);
 
-    if(!gMultithread)
-        SetThreadCount(1);
+    if (!gMultithread) SetThreadCount(1);
 
 #if defined(_MSC_VER) && defined(_M_IX86)
     // VS2005 (and probably others, since long double got deprecated) sets
@@ -308,14 +351,15 @@ int main (int argc, const char **argv )
     _controlfp_s(&ignored, _PC_64, _MCW_PC);
 #endif
 
-    vlog( "===========================================================\n" );
-    vlog( "Random seed: %u\n", seed );
-    gMTdata = init_genrand( seed );
+    vlog("===========================================================\n");
+    vlog("Random seed: %u\n", seed);
+    gMTdata = init_genrand(seed);
 
-    const char* arg[] = {argv[0]};
-    int ret = runTestHarnessWithCheck( 1, arg, test_num, test_list, true, 0, InitCL );
+    const char *arg[] = { argv[0] };
+    int ret =
+        runTestHarnessWithCheck(1, arg, test_num, test_list, true, 0, InitCL);
 
-    free_mtdata( gMTdata );
+    free_mtdata(gMTdata);
     if (gQueue)
     {
         error = clFinish(gQueue);
@@ -324,7 +368,8 @@ int main (int argc, const char **argv )
 
     clReleaseMemObject(gInBuffer);
 
-    for( int i = 0; i < kCallStyleCount; i++ ) {
+    for (int i = 0; i < kCallStyleCount; i++)
+    {
         clReleaseMemObject(gOutBuffers[i]);
     }
     clReleaseCommandQueue(gQueue);
@@ -336,82 +381,67 @@ int main (int argc, const char **argv )
 #pragma mark -
 #pragma mark setup
 
-static int ParseArgs( int argc, const char **argv )
+static int ParseArgs(int argc, const char **argv)
 {
     int i;
     argList = (const char **)calloc(argc, sizeof(char *));
     argCount = 0;
 
-    if( NULL == argList && argc > 1 )
-        return -1;
+    if (NULL == argList && argc > 1) return -1;
 
-#if (defined( __APPLE__ ) || defined(__linux__) || defined (__MINGW32__))
+#if (defined(__APPLE__) || defined(__linux__) || defined(__MINGW32__))
     { // Extract the app name
-        char baseName[ MAXPATHLEN ];
-        strncpy( baseName, argv[0], MAXPATHLEN );
-        char *base = basename( baseName );
-        if( NULL != base )
+        char baseName[MAXPATHLEN];
+        strncpy(baseName, argv[0], MAXPATHLEN);
+        char *base = basename(baseName);
+        if (NULL != base)
         {
-            strncpy( appName, base, sizeof( appName )  );
-            appName[ sizeof( appName ) -1 ] = '\0';
+            strncpy(appName, base, sizeof(appName));
+            appName[sizeof(appName) - 1] = '\0';
         }
     }
-#elif defined (_WIN32)
+#elif defined(_WIN32)
     {
         char fname[_MAX_FNAME + _MAX_EXT + 1];
         char ext[_MAX_EXT];
 
-        errno_t err = _splitpath_s( argv[0], NULL, 0, NULL, 0,
-                                   fname, _MAX_FNAME, ext, _MAX_EXT );
-        if (err == 0) { // no error
-            strcat (fname, ext); //just cat them, size of frame can keep both
-            strncpy (appName, fname, sizeof(appName));
-            appName[ sizeof( appName ) -1 ] = '\0';
+        errno_t err = _splitpath_s(argv[0], NULL, 0, NULL, 0, fname, _MAX_FNAME,
+                                   ext, _MAX_EXT);
+        if (err == 0)
+        { // no error
+            strcat(fname, ext); // just cat them, size of frame can keep both
+            strncpy(appName, fname, sizeof(appName));
+            appName[sizeof(appName) - 1] = '\0';
         }
     }
 #endif
 
-    vlog( "\n%s", appName );
-    for( i = 1; i < argc; i++ )
+    vlog("\n%s", appName);
+    for (i = 1; i < argc; i++)
     {
         const char *arg = argv[i];
-        if( NULL == arg )
-            break;
+        if (NULL == arg) break;
 
-        vlog( "\t%s", arg );
-        if( arg[0] == '-' )
+        vlog("\t%s", arg);
+        if (arg[0] == '-')
         {
             arg++;
-            while( *arg != '\0' )
+            while (*arg != '\0')
             {
-                switch( *arg )
+                switch (*arg)
                 {
-                    case 'd':
-                        gTestDouble ^= 1;
-                        break;
-                    case 'l':
-                        gSkipTesting ^= 1;
-                        break;
-                    case 'm':
-                        gMultithread ^= 1;
-                        break;
-                    case 'w':
-                        gWimpyMode ^= 1;
-                        break;
+                    case 'd': gTestDouble ^= 1; break;
+                    case 'l': gSkipTesting ^= 1; break;
+                    case 'm': gMultithread ^= 1; break;
+                    case 'w': gWimpyMode ^= 1; break;
                     case '[':
                         parseWimpyReductionFactor(arg, gWimpyReductionFactor);
                         break;
-                    case 'z':
-                        gForceFTZ ^= 1;
-                        break;
-                    case 't':
-                        gTimeResults ^= 1;
-                        break;
-                    case 'a':
-                        gReportAverageTimes ^= 1;
-                        break;
+                    case 'z': gForceFTZ ^= 1; break;
+                    case 't': gTimeResults ^= 1; break;
+                    case 'a': gReportAverageTimes ^= 1; break;
                     case '1':
-                        if( arg[1] == '6' )
+                        if (arg[1] == '6')
                         {
                             gMinVectorSize = 6;
                             gMaxVectorSize = 7;
@@ -445,7 +475,7 @@ static int ParseArgs( int argc, const char **argv )
                         break;
 
                     default:
-                        vlog( " <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg );
+                        vlog(" <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg);
                         PrintUsage();
                         return -1;
                 }
@@ -455,128 +485,136 @@ static int ParseArgs( int argc, const char **argv )
         else
         {
             char *t = NULL;
-            long number = strtol( arg, &t, 0 );
-            if( t != arg )
+            long number = strtol(arg, &t, 0);
+            if (t != arg)
             {
-                if( gStartTestNumber != -1 )
-                    gEndTestNumber = gStartTestNumber + (int) number;
+                if (gStartTestNumber != -1)
+                    gEndTestNumber = gStartTestNumber + (int)number;
                 else
-                    gStartTestNumber = (int) number;
+                    gStartTestNumber = (int)number;
             }
             else
             {
-                argList[ argCount ] = arg;
+                argList[argCount] = arg;
                 argCount++;
             }
         }
     }
 
     // Check for the wimpy mode environment variable
-    if (getenv("CL_WIMPY_MODE")) {
-      vlog( "\n" );
-      vlog( "*** Detected CL_WIMPY_MODE env                          ***\n" );
-      gWimpyMode = 1;
+    if (getenv("CL_WIMPY_MODE"))
+    {
+        vlog("\n");
+        vlog("*** Detected CL_WIMPY_MODE env                          ***\n");
+        gWimpyMode = 1;
     }
 
     vlog( "\n" );
 
     PrintArch();
 
-    if( gWimpyMode )
+    if (gWimpyMode)
     {
-        vlog( "\n" );
-        vlog( "*** WARNING: Testing in Wimpy mode!                     ***\n" );
-        vlog( "*** Wimpy mode is not sufficient to verify correctness. ***\n" );
-        vlog( "*** It gives warm fuzzy feelings and then nevers calls. ***\n\n" );
-        vlog("*** Wimpy Reduction Factor: %-27u ***\n\n", gWimpyReductionFactor);
+        vlog("\n");
+        vlog("*** WARNING: Testing in Wimpy mode!                     ***\n");
+        vlog("*** Wimpy mode is not sufficient to verify correctness. ***\n");
+        vlog("*** It gives warm fuzzy feelings and then nevers calls. ***\n\n");
+        vlog("*** Wimpy Reduction Factor: %-27u ***\n\n",
+             gWimpyReductionFactor);
     }
 
     return 0;
 }
 
-static void PrintUsage( void )
+static void PrintUsage(void)
 {
     int i;
-    vlog( "%s [-wz#]: <optional: test names>\n", appName );
-    vlog( "\ttest names:\n" );
-    vlog( "\t\tdestFormat<_sat><_round>_sourceFormat\n" );
-    vlog( "\t\t\tPossible format types are:\n\t\t\t\t" );
-    for( i = 0; i < kTypeCount; i++ )
-        vlog( "%s, ", gTypeNames[i] );
-    vlog( "\n\n\t\t\tPossible saturation values are: (empty) and _sat\n" );
-    vlog( "\t\t\tPossible rounding values are:\n\t\t\t\t(empty), " );
-    for( i = 1; i < kRoundingModeCount; i++ )
-        vlog( "%s, ", gRoundingModeNames[i] );
-    vlog( "\n\t\t\tExamples:\n" );
-    vlog( "\t\t\t\tulong_short   converts short to ulong\n" );
-    vlog( "\t\t\t\tchar_sat_rte_float   converts float to char with saturated clipping in round to nearest rounding mode\n\n" );
-    vlog( "\toptions:\n" );
-    vlog( "\t\t-d\tToggle testing of double precision.  On by default if cl_khr_fp64 is enabled, ignored otherwise.\n" );
-    vlog( "\t\t-l\tToggle link check mode. When on, testing is skipped, and we just check to see that the kernels build. (Off by default.)\n" );
-    vlog( "\t\t-m\tToggle Multithreading. (On by default.)\n" );
-    vlog( "\t\t-w\tToggle wimpy mode. When wimpy mode is on, we run a very small subset of the tests for each fn. NOT A VALID TEST! (Off by default.)\n" );
-    vlog(" \t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is 1-12, default factor(%u)\n", gWimpyReductionFactor);
-    vlog( "\t\t-z\tToggle flush to zero mode  (Default: per device)\n" );
-    vlog( "\t\t-#\tTest just vector size given by #, where # is an element of the set {1,2,3,4,8,16}\n" );
-    vlog( "\n" );
-    vlog( "You may also pass the number of the test on which to start.\nA second number can be then passed to indicate how many tests to run\n\n" );
+    vlog("%s [-wz#]: <optional: test names>\n", appName);
+    vlog("\ttest names:\n");
+    vlog("\t\tdestFormat<_sat><_round>_sourceFormat\n");
+    vlog("\t\t\tPossible format types are:\n\t\t\t\t");
+    for (i = 0; i < kTypeCount; i++) vlog("%s, ", gTypeNames[i]);
+    vlog("\n\n\t\t\tPossible saturation values are: (empty) and _sat\n");
+    vlog("\t\t\tPossible rounding values are:\n\t\t\t\t(empty), ");
+    for (i = 1; i < kRoundingModeCount; i++)
+        vlog("%s, ", gRoundingModeNames[i]);
+    vlog("\n\t\t\tExamples:\n");
+    vlog("\t\t\t\tulong_short   converts short to ulong\n");
+    vlog("\t\t\t\tchar_sat_rte_float   converts float to char with saturated "
+         "clipping in round to nearest rounding mode\n\n");
+    vlog("\toptions:\n");
+    vlog("\t\t-d\tToggle testing of double precision.  On by default if "
+         "cl_khr_fp64 is enabled, ignored otherwise.\n");
+    vlog("\t\t-l\tToggle link check mode. When on, testing is skipped, and we "
+         "just check to see that the kernels build. (Off by default.)\n");
+    vlog("\t\t-m\tToggle Multithreading. (On by default.)\n");
+    vlog("\t\t-w\tToggle wimpy mode. When wimpy mode is on, we run a very "
+         "small subset of the tests for each fn. NOT A VALID TEST! (Off by "
+         "default.)\n");
+    vlog(" \t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is "
+         "1-12, default factor(%u)\n",
+         gWimpyReductionFactor);
+    vlog("\t\t-z\tToggle flush to zero mode  (Default: per device)\n");
+    vlog("\t\t-#\tTest just vector size given by #, where # is an element of "
+         "the set {1,2,3,4,8,16}\n");
+    vlog("\n");
+    vlog(
+        "You may also pass the number of the test on which to start.\nA second "
+        "number can be then passed to indicate how many tests to run\n\n");
 }
 
 
-static int GetTestCase( const char *name, Type *outType, Type *inType, SaturationMode *sat, RoundingMode *round )
+static int GetTestCase(const char *name, Type *outType, Type *inType,
+                       SaturationMode *sat, RoundingMode *round)
 {
     int i;
 
-    //Find the return type
-    for( i = 0; i < kTypeCount; i++ )
-        if( name == strstr( name, gTypeNames[i] ) )
+    // Find the return type
+    for (i = 0; i < kTypeCount; i++)
+        if (name == strstr(name, gTypeNames[i]))
         {
             *outType = (Type)i;
-            name += strlen( gTypeNames[i] );
+            name += strlen(gTypeNames[i]);
 
             break;
         }
 
-    if( i == kTypeCount )
-        return -1;
+    if (i == kTypeCount) return -1;
 
     // Check to see if _sat appears next
     *sat = (SaturationMode)0;
-    for( i = 1; i < kSaturationModeCount; i++ )
-        if( name == strstr( name, gSaturationNames[i] ) )
+    for (i = 1; i < kSaturationModeCount; i++)
+        if (name == strstr(name, gSaturationNames[i]))
         {
             *sat = (SaturationMode)i;
-            name += strlen( gSaturationNames[i] );
+            name += strlen(gSaturationNames[i]);
             break;
         }
 
     *round = (RoundingMode)0;
-    for( i = 1; i < kRoundingModeCount; i++ )
-        if( name == strstr( name, gRoundingModeNames[i] ) )
+    for (i = 1; i < kRoundingModeCount; i++)
+        if (name == strstr(name, gRoundingModeNames[i]))
         {
             *round = (RoundingMode)i;
-            name += strlen( gRoundingModeNames[i] );
+            name += strlen(gRoundingModeNames[i]);
             break;
         }
 
-    if( *name != '_' )
-        return -2;
+    if (*name != '_') return -2;
     name++;
 
-    for( i = 0; i < kTypeCount; i++ )
-        if( name == strstr( name, gTypeNames[i] ) )
+    for (i = 0; i < kTypeCount; i++)
+        if (name == strstr(name, gTypeNames[i]))
         {
             *inType = (Type)i;
-            name += strlen( gTypeNames[i] );
+            name += strlen(gTypeNames[i]);
 
             break;
         }
 
-    if( i == kTypeCount )
-        return -3;
+    if (i == kTypeCount) return -3;
 
-    if( *name != '\0' )
-        return -4;
+    if (*name != '\0') return -4;
 
     return 0;
 }
@@ -584,270 +622,292 @@ static int GetTestCase( const char *name, Type *outType, Type *inType, Saturatio
 #pragma mark -
 #pragma mark OpenCL
 
-test_status InitCL( cl_device_id device )
+test_status InitCL(cl_device_id device)
 {
     int error, i;
-    size_t configSize = sizeof( gComputeDevices );
+    size_t configSize = sizeof(gComputeDevices);
 
-    if( (error = clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, configSize, &gComputeDevices, NULL )) )
+    if ((error = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
+                                 configSize, &gComputeDevices, NULL)))
         gComputeDevices = 1;
 
-    configSize = sizeof( gDeviceFrequency );
-    if( (error = clGetDeviceInfo( device, CL_DEVICE_MAX_CLOCK_FREQUENCY, configSize, &gDeviceFrequency, NULL )) )
+    configSize = sizeof(gDeviceFrequency);
+    if ((error = clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY,
+                                 configSize, &gDeviceFrequency, NULL)))
         gDeviceFrequency = 0;
 
     cl_device_fp_config floatCapabilities = 0;
-    if( (error = clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(floatCapabilities), &floatCapabilities,  NULL)))
+    if ((error = clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG,
+                                 sizeof(floatCapabilities), &floatCapabilities,
+                                 NULL)))
         floatCapabilities = 0;
-    if(0 == (CL_FP_DENORM & floatCapabilities) )
-        gForceFTZ ^= 1;
+    if (0 == (CL_FP_DENORM & floatCapabilities)) gForceFTZ ^= 1;
 
-    if( 0 == (floatCapabilities & CL_FP_ROUND_TO_NEAREST ) )
+    if (0 == (floatCapabilities & CL_FP_ROUND_TO_NEAREST))
     {
         char profileStr[128] = "";
         // Verify that we are an embedded profile device
-        if( (error = clGetDeviceInfo( device, CL_DEVICE_PROFILE, sizeof( profileStr ), profileStr, NULL ) ) )
+        if ((error = clGetDeviceInfo(device, CL_DEVICE_PROFILE,
+                                     sizeof(profileStr), profileStr, NULL)))
         {
-            vlog_error( "FAILURE: Could not get device profile: error %d\n", error );
+            vlog_error("FAILURE: Could not get device profile: error %d\n",
+                       error);
             return TEST_FAIL;
         }
 
-        if( strcmp( profileStr, "EMBEDDED_PROFILE" ) )
+        if (strcmp(profileStr, "EMBEDDED_PROFILE"))
         {
-            vlog_error( "FAILURE: non-embedded profile device does not support CL_FP_ROUND_TO_NEAREST\n" );
+            vlog_error("FAILURE: non-embedded profile device does not support "
+                       "CL_FP_ROUND_TO_NEAREST\n");
             return TEST_FAIL;
         }
 
-        if( 0 == (floatCapabilities & CL_FP_ROUND_TO_ZERO ) )
+        if (0 == (floatCapabilities & CL_FP_ROUND_TO_ZERO))
         {
-            vlog_error( "FAILURE: embedded profile device supports neither CL_FP_ROUND_TO_NEAREST or CL_FP_ROUND_TO_ZERO\n" );
+            vlog_error("FAILURE: embedded profile device supports neither "
+                       "CL_FP_ROUND_TO_NEAREST or CL_FP_ROUND_TO_ZERO\n");
             return TEST_FAIL;
         }
 
         gIsRTZ = 1;
     }
 
-    else if(is_extension_available(device, "cl_khr_fp64"))
+    else if (is_extension_available(device, "cl_khr_fp64"))
     {
         gHasDouble = 1;
     }
     gTestDouble &= gHasDouble;
 
-    //detect whether profile of the device is embedded
+    // detect whether profile of the device is embedded
     char profile[1024] = "";
-    if( (error = clGetDeviceInfo( device, CL_DEVICE_PROFILE, sizeof(profile), profile, NULL ) ) ){}
-    else if( strstr(profile, "EMBEDDED_PROFILE" ) )
+    if ((error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(profile),
+                                 profile, NULL)))
+    {
+    }
+    else if (strstr(profile, "EMBEDDED_PROFILE"))
     {
         gIsEmbedded = 1;
-        if( !is_extension_available(device, "cles_khr_int64" ) )
-            gHasLong = 0;
+        if (!is_extension_available(device, "cles_khr_int64")) gHasLong = 0;
     }
 
 
-    gContext = clCreateContext( NULL, 1, &device, notify_callback, NULL, &error );
-    if( NULL == gContext || error )
+    gContext = clCreateContext(NULL, 1, &device, notify_callback, NULL, &error);
+    if (NULL == gContext || error)
     {
-        vlog_error( "clCreateContext failed. (%d)\n", error );
+        vlog_error("clCreateContext failed. (%d)\n", error);
         return TEST_FAIL;
     }
 
     gQueue = clCreateCommandQueue(gContext, device, 0, &error);
-    if( NULL == gQueue || error )
+    if (NULL == gQueue || error)
     {
-        vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
+        vlog_error("clCreateCommandQueue failed. (%d)\n", error);
         return TEST_FAIL;
     }
 
-    //Allocate buffers
-    //FIXME: use clProtectedArray for guarded allocations?
-    gIn   = malloc( BUFFER_SIZE + 2 * kPageSize );
-    gAllowZ = malloc( BUFFER_SIZE + 2 * kPageSize );
-    gRef  = malloc( BUFFER_SIZE + 2 * kPageSize );
-    for( i = 0; i < kCallStyleCount; i++ )
+    // Allocate buffers
+    // FIXME: use clProtectedArray for guarded allocations?
+    gIn = malloc(BUFFER_SIZE + 2 * kPageSize);
+    gAllowZ = malloc(BUFFER_SIZE + 2 * kPageSize);
+    gRef = malloc(BUFFER_SIZE + 2 * kPageSize);
+    for (i = 0; i < kCallStyleCount; i++)
     {
-        gOut[i] = malloc( BUFFER_SIZE + 2 * kPageSize );
-        if( NULL == gOut[i] )
-            return TEST_FAIL;
+        gOut[i] = malloc(BUFFER_SIZE + 2 * kPageSize);
+        if (NULL == gOut[i]) return TEST_FAIL;
     }
 
     // setup input buffers
-    gInBuffer = clCreateBuffer(gContext, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, BUFFER_SIZE, NULL, &error);
-    if( gInBuffer == NULL || error)
+    gInBuffer =
+        clCreateBuffer(gContext, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+                       BUFFER_SIZE, NULL, &error);
+    if (gInBuffer == NULL || error)
     {
-        vlog_error( "clCreateBuffer failed for input (%d)\n", error );
+        vlog_error("clCreateBuffer failed for input (%d)\n", error);
         return TEST_FAIL;
     }
 
     // setup output buffers
-    for( i = 0; i < kCallStyleCount; i++ )
+    for (i = 0; i < kCallStyleCount; i++)
     {
-        gOutBuffers[i] = clCreateBuffer(  gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, BUFFER_SIZE, NULL, &error );
-        if( gOutBuffers[i] == NULL || error )
+        gOutBuffers[i] =
+            clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+                           BUFFER_SIZE, NULL, &error);
+        if (gOutBuffers[i] == NULL || error)
         {
-            vlog_error( "clCreateArray failed for output (%d)\n", error );
+            vlog_error("clCreateArray failed for output (%d)\n", error);
             return TEST_FAIL;
         }
     }
 
 
-    gMTdata = init_genrand( gRandomSeed );
+    gMTdata = init_genrand(gRandomSeed);
 
 
     char c[1024];
     static const char *no_yes[] = { "NO", "YES" };
-    vlog( "\nCompute Device info:\n" );
+    vlog("\nCompute Device info:\n");
     clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(c), c, NULL);
-    vlog( "\tDevice Name: %s\n", c );
+    vlog("\tDevice Name: %s\n", c);
     clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(c), c, NULL);
-    vlog( "\tVendor: %s\n", c );
+    vlog("\tVendor: %s\n", c);
     clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(c), c, NULL);
-    vlog( "\tDevice Version: %s\n", c );
+    vlog("\tDevice Version: %s\n", c);
     clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(c), &c, NULL);
-    vlog( "\tCL C Version: %s\n", c );
+    vlog("\tCL C Version: %s\n", c);
     clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(c), c, NULL);
-    vlog( "\tDriver Version: %s\n", c );
-    vlog( "\tProcessing with %ld devices\n", gComputeDevices );
-    vlog( "\tDevice Frequency: %d MHz\n", gDeviceFrequency );
-    vlog( "\tSubnormal values supported for floats? %s\n", no_yes[0 != (CL_FP_DENORM & floatCapabilities)] );
-    vlog( "\tTesting with FTZ mode ON for floats? %s\n", no_yes[0 != gForceFTZ] );
-    vlog( "\tTesting with default RTZ mode for floats? %s\n", no_yes[0 != gIsRTZ] );
-    vlog( "\tHas Double? %s\n", no_yes[0 != gHasDouble] );
-    if( gHasDouble )
-        vlog( "\tTest Double? %s\n", no_yes[0 != gTestDouble] );
-    vlog( "\tHas Long? %s\n", no_yes[0 != gHasLong] );
-    vlog( "\tTesting vector sizes: " );
-    for( i = gMinVectorSize; i < gMaxVectorSize; i++ )
+    vlog("\tDriver Version: %s\n", c);
+    vlog("\tProcessing with %ld devices\n", gComputeDevices);
+    vlog("\tDevice Frequency: %d MHz\n", gDeviceFrequency);
+    vlog("\tSubnormal values supported for floats? %s\n",
+         no_yes[0 != (CL_FP_DENORM & floatCapabilities)]);
+    vlog("\tTesting with FTZ mode ON for floats? %s\n", no_yes[0 != gForceFTZ]);
+    vlog("\tTesting with default RTZ mode for floats? %s\n",
+         no_yes[0 != gIsRTZ]);
+    vlog("\tHas Double? %s\n", no_yes[0 != gHasDouble]);
+    if (gHasDouble) vlog("\tTest Double? %s\n", no_yes[0 != gTestDouble]);
+    vlog("\tHas Long? %s\n", no_yes[0 != gHasLong]);
+    vlog("\tTesting vector sizes: ");
+    for (i = gMinVectorSize; i < gMaxVectorSize; i++)
         vlog("\t%d", vectorSizes[i]);
-    vlog( "\n" );
+    vlog("\n");
     return TEST_PASS;
 }
 
-static int RunKernel( cl_kernel kernel, void *inBuf, void *outBuf, size_t blockCount )
+static int RunKernel(cl_kernel kernel, void *inBuf, void *outBuf,
+                     size_t blockCount)
 {
-    // The global dimensions are just the blockCount to execute since we haven't set up multiple queues for multiple devices.
+    // The global dimensions are just the blockCount to execute since we haven't
+    // set up multiple queues for multiple devices.
     int error;
 
-    error = clSetKernelArg(kernel, 0, sizeof( inBuf ), &inBuf);
+    error = clSetKernelArg(kernel, 0, sizeof(inBuf), &inBuf);
     error |= clSetKernelArg(kernel, 1, sizeof(outBuf), &outBuf);
 
-    if( error )
+    if (error)
     {
-        vlog_error( "FAILED -- could not set kernel args (%d)\n", error );
+        vlog_error("FAILED -- could not set kernel args (%d)\n", error);
         return error;
     }
 
-    if( (error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &blockCount, NULL, 0, NULL, NULL)))
+    if ((error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &blockCount,
+                                        NULL, 0, NULL, NULL)))
     {
-        vlog_error( "FAILED -- could not execute kernel (%d)\n", error );
+        vlog_error("FAILED -- could not execute kernel (%d)\n", error);
         return error;
     }
 
     return 0;
 }
 
-#if ! defined( __APPLE__ )
-void memset_pattern4(void *dest, const void *src_pattern, size_t bytes );
+#if !defined(__APPLE__)
+void memset_pattern4(void *dest, const void *src_pattern, size_t bytes);
 #endif
 
-#if defined( __APPLE__ )
+#if defined(__APPLE__)
 #include <mach/mach_time.h>
 #endif
 
-uint64_t GetTime( void );
-uint64_t GetTime( void )
+uint64_t GetTime(void);
+uint64_t GetTime(void)
 {
-#if defined( __APPLE__ )
+#if defined(__APPLE__)
     return mach_absolute_time();
 #elif defined(_MSC_VER)
-    return  ReadTime();
+    return ReadTime();
 #else
-    //mach_absolute_time is a high precision timer with precision < 1 microsecond.
+    // mach_absolute_time is a high precision timer with precision < 1
+    // microsecond.
 #warning need accurate clock here.  Times are invalid.
     return 0;
 #endif
 }
 
 
-#if defined (_MSC_VER)
+#if defined(_MSC_VER)
 /* function is defined in "compat.h" */
 #else
-double SubtractTime( uint64_t endTime, uint64_t startTime );
-double SubtractTime( uint64_t endTime, uint64_t startTime )
+double SubtractTime(uint64_t endTime, uint64_t startTime);
+double SubtractTime(uint64_t endTime, uint64_t startTime)
 {
     uint64_t diff = endTime - startTime;
     static double conversion = 0.0;
 
-    if( 0.0 == conversion )
+    if (0.0 == conversion)
     {
-#if defined( __APPLE__ )
-        mach_timebase_info_data_t info = {0,0};
-        kern_return_t   err = mach_timebase_info( &info );
-        if( 0 == err )
-            conversion = 1e-9 * (double) info.numer / (double) info.denom;
+#if defined(__APPLE__)
+        mach_timebase_info_data_t info = { 0, 0 };
+        kern_return_t err = mach_timebase_info(&info);
+        if (0 == err)
+            conversion = 1e-9 * (double)info.numer / (double)info.denom;
 #else
-        // This function consumes output from GetTime() above, and converts the time to secionds.
+        // This function consumes output from GetTime() above, and converts the
+        // time to secionds.
 #warning need accurate ticks to seconds conversion factor here. Times are invalid.
 #endif
     }
 
     // strictly speaking we should also be subtracting out timer latency here
-    return conversion * (double) diff;
+    return conversion * (double)diff;
 }
 #endif
 
 typedef struct CalcReferenceValuesInfo
 {
-    struct WriteInputBufferInfo *parent;        // pointer back to the parent WriteInputBufferInfo struct
-    cl_kernel                   kernel;         // the kernel for this vector size
-    cl_program                  program;        // the program for this vector size
-    cl_uint                     vectorSize;     // the vector size for this callback chain
-    void                        *p;             // the pointer to mapped result data for this vector size
-    cl_int                      result;
-}CalcReferenceValuesInfo;
+    struct WriteInputBufferInfo
+        *parent; // pointer back to the parent WriteInputBufferInfo struct
+    cl_kernel kernel; // the kernel for this vector size
+    cl_program program; // the program for this vector size
+    cl_uint vectorSize; // the vector size for this callback chain
+    void *p; // the pointer to mapped result data for this vector size
+    cl_int result;
+} CalcReferenceValuesInfo;
 
 typedef struct WriteInputBufferInfo
 {
-    volatile cl_event           calcReferenceValues;   // user event which signals when main thread is done calculating reference values
-    volatile cl_event           doneBarrier;     // user event which signals when worker threads are done
-    cl_uint                     count;           // the number of elements in the array
-    Type                        outType;         // the data type of the conversion result
-    Type                        inType;          // the data type of the conversion input
-    volatile int                barrierCount;
-    CalcReferenceValuesInfo     calcInfo[kCallStyleCount];
-}WriteInputBufferInfo;
+    volatile cl_event
+        calcReferenceValues; // user event which signals when main thread is
+                             // done calculating reference values
+    volatile cl_event
+        doneBarrier; // user event which signals when worker threads are done
+    cl_uint count; // the number of elements in the array
+    Type outType; // the data type of the conversion result
+    Type inType; // the data type of the conversion input
+    volatile int barrierCount;
+    CalcReferenceValuesInfo calcInfo[kCallStyleCount];
+} WriteInputBufferInfo;
 
-cl_uint RoundUpToNextPowerOfTwo( cl_uint x );
-cl_uint RoundUpToNextPowerOfTwo( cl_uint x )
+cl_uint RoundUpToNextPowerOfTwo(cl_uint x);
+cl_uint RoundUpToNextPowerOfTwo(cl_uint x)
 {
-    if( 0 == (x & (x-1)))
-        return x;
+    if (0 == (x & (x - 1))) return x;
 
-    while( x & (x-1) )
-       x &= x-1;
+    while (x & (x - 1)) x &= x - 1;
 
     return x + x;
 }
 
-void CL_CALLBACK WriteInputBufferComplete( cl_event, cl_int, void * );
+void WriteInputBufferComplete(void *);
 
 typedef struct DataInitInfo
 {
-    cl_ulong        start;
-    cl_uint         size;
-    Type            outType;
-    Type            inType;
-    SaturationMode  sat;
-    RoundingMode    round;
-    MTdata          *d;
-}DataInitInfo;
+    cl_ulong start;
+    cl_uint size;
+    Type outType;
+    Type inType;
+    SaturationMode sat;
+    RoundingMode round;
+    MTdata *d;
+} DataInitInfo;
 
-cl_int InitData( cl_uint job_id, cl_uint thread_id, void *p );
-cl_int InitData( cl_uint job_id, cl_uint thread_id, void *p )
+cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p);
+cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p)
 {
-    DataInitInfo *info = (DataInitInfo*) p;
+    DataInitInfo *info = (DataInitInfo *)p;
 
-    gInitFunctions[ info->inType ]( (char*)gIn + job_id * info->size * gTypeSizes[info->inType], info->sat, info->round,
-                                   info->outType, info->start + job_id * info->size, info->size, info->d[thread_id] );
+    gInitFunctions[info->inType](
+        (char *)gIn + job_id * info->size * gTypeSizes[info->inType], info->sat,
+        info->round, info->outType, info->start + job_id * info->size,
+        info->size, info->d[thread_id]);
     return CL_SUCCESS;
 }
 
@@ -855,13 +915,13 @@ static void setAllowZ(uint8_t *allow, uint32_t *x, cl_uint count)
 {
     cl_uint i;
     for (i = 0; i < count; ++i)
-    allow[i] |= (uint8_t)((x[i] & 0x7f800000U) == 0);
+        allow[i] |= (uint8_t)((x[i] & 0x7f800000U) == 0);
 }
 
-cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p );
-cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p )
+cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p);
+cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)
 {
-    DataInitInfo *info = (DataInitInfo*) p;
+    DataInitInfo *info = (DataInitInfo *)p;
     cl_uint count = info->size;
     Type inType = info->inType;
     Type outType = info->outType;
@@ -870,16 +930,15 @@ cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p )
 
     Force64BitFPUPrecision();
 
-    void *s = (cl_uchar*) gIn + job_id * count * gTypeSizes[info->inType];
-    void *a = (cl_uchar*) gAllowZ + job_id * count;
-    void *d = (cl_uchar*) gRef + job_id * count * gTypeSizes[info->outType];
+    void *s = (cl_uchar *)gIn + job_id * count * gTypeSizes[info->inType];
+    void *a = (cl_uchar *)gAllowZ + job_id * count;
+    void *d = (cl_uchar *)gRef + job_id * count * gTypeSizes[info->outType];
 
     if (outType != inType)
     {
-        //create the reference while we wait
-        Convert f = gConversions[ outType ][ inType ];
-        if( info->sat )
-            f = gSaturatedConversions[ outType ][ inType ];
+        // create the reference while we wait
+        Convert f = gConversions[outType][inType];
+        if (info->sat) f = gSaturatedConversions[outType][inType];
 
 #if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
         /* ARM VFP doesn't have hardware instruction for converting from 64-bit
@@ -896,42 +955,34 @@ cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p )
         switch (round)
         {
             /* conversions to floating-point type use the current rounding mode.
-             * The only default floating-point rounding mode supported is round to nearest even
-             * i.e the current rounding mode will be _rte for floating-point types. */
-            case kDefaultRoundingMode:
-                    qcom_rm = qcomRTE;
-                    break;
-            case kRoundToNearestEven:
-                    qcom_rm = qcomRTE;
-                    break;
-            case kRoundUp:
-                    qcom_rm = qcomRTP;
-                    break;
-            case kRoundDown:
-                    qcom_rm = qcomRTN;
-                    break;
-            case kRoundTowardZero:
-                    qcom_rm = qcomRTZ;
-                    break;
+             * The only default floating-point rounding mode supported is round
+             * to nearest even i.e the current rounding mode will be _rte for
+             * floating-point types. */
+            case kDefaultRoundingMode: qcom_rm = qcomRTE; break;
+            case kRoundToNearestEven: qcom_rm = qcomRTE; break;
+            case kRoundUp: qcom_rm = qcomRTP; break;
+            case kRoundDown: qcom_rm = qcomRTN; break;
+            case kRoundTowardZero: qcom_rm = qcomRTZ; break;
             default:
-                    vlog_error("ERROR: undefined rounding mode %d\n", round);
-                    break;
+                vlog_error("ERROR: undefined rounding mode %d\n", round);
+                break;
         }
-        qcom_sat =  info->sat;
+        qcom_sat = info->sat;
 #endif
 
-        RoundingMode oldRound = set_round( round, outType );
-        f( d, s, count );
-        set_round( oldRound, outType );
+        RoundingMode oldRound = set_round(round, outType);
+        f(d, s, count);
+        set_round(oldRound, outType);
 
-    // Decide if we allow a zero result in addition to the correctly rounded one
+        // Decide if we allow a zero result in addition to the correctly rounded
+        // one
         memset(a, 0, count);
-    if (gForceFTZ) {
-        if (inType == kfloat)
-        setAllowZ((uint8_t*)a, (uint32_t*)s, count);
-        if (outType == kfloat)
-        setAllowZ((uint8_t*)a, (uint32_t*)d, count);
-    }
+        if (gForceFTZ)
+        {
+            if (inType == kfloat) setAllowZ((uint8_t *)a, (uint32_t *)s, count);
+            if (outType == kfloat)
+                setAllowZ((uint8_t *)a, (uint32_t *)d, count);
+        }
     }
     else
     {
@@ -939,46 +990,48 @@ cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p )
         memcpy(d, s, info->size * gTypeSizes[inType]);
     }
 
-    //Patch up NaNs conversions to integer to zero -- these can be converted to any integer
-    if( info->outType != kfloat && info->outType != kdouble )
+    // Patch up NaNs conversions to integer to zero -- these can be converted to
+    // any integer
+    if (info->outType != kfloat && info->outType != kdouble)
     {
-        if( inType == kfloat )
+        if (inType == kfloat)
         {
-            float *inp = (float*) s;
-            for( j = 0; j < count; j++ )
+            float *inp = (float *)s;
+            for (j = 0; j < count; j++)
             {
-                if( isnan( inp[j] ) )
-                    memset( (char*) d + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] );
+                if (isnan(inp[j]))
+                    memset((char *)d + j * gTypeSizes[outType], 0,
+                           gTypeSizes[outType]);
             }
         }
-        if( inType == kdouble )
+        if (inType == kdouble)
         {
-            double *inp = (double*) s;
-            for( j = 0; j < count; j++ )
+            double *inp = (double *)s;
+            for (j = 0; j < count; j++)
             {
-                if( isnan( inp[j] ) )
-                    memset( (char*) d + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] );
+                if (isnan(inp[j]))
+                    memset((char *)d + j * gTypeSizes[outType], 0,
+                           gTypeSizes[outType]);
             }
         }
     }
-    else if( inType == kfloat || inType == kdouble )
-    {  // outtype and intype is float or double.  NaN conversions for float <-> double can be any NaN
-        if( inType == kfloat && outType == kdouble )
+    else if (inType == kfloat || inType == kdouble)
+    { // outtype and intype is float or double.  NaN conversions for float <->
+      // double can be any NaN
+        if (inType == kfloat && outType == kdouble)
         {
-            float *inp = (float*) s;
-            for( j = 0; j < count; j++ )
+            float *inp = (float *)s;
+            for (j = 0; j < count; j++)
             {
-                if( isnan( inp[j] ) )
-                    ((double*) d)[j] = NAN;
+                if (isnan(inp[j])) ((double *)d)[j] = NAN;
             }
         }
-        if( inType == kdouble && outType == kfloat )
+        if (inType == kdouble && outType == kfloat)
         {
-            double *inp = (double*) s;
-            for( j = 0; j < count; j++ )
+            double *inp = (double *)s;
+            for (j = 0; j < count; j++)
             {
-                if( isnan( inp[j] ) )
-                    ((float*) d)[j] = NAN;
+                if (isnan(inp[j])) ((float *)d)[j] = NAN;
             }
         }
     }
@@ -986,13 +1039,14 @@ cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p )
     return CL_SUCCESS;
 }
 
-static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMode sat, RoundingMode round, MTdata d )
+static int DoTest(cl_device_id device, Type outType, Type inType,
+                  SaturationMode sat, RoundingMode round, MTdata d)
 {
 #ifdef __APPLE__
     cl_ulong wall_start = mach_absolute_time();
 #endif
 
-    DataInitInfo  init_info = { 0, 0, outType, inType, sat, round, NULL };
+    DataInitInfo init_info = { 0, 0, outType, inType, sat, round, NULL };
     WriteInputBufferInfo writeInputBufferInfo;
     int vectorSize;
     int error = 0;
@@ -1003,22 +1057,23 @@ static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMod
     size_t blockCount =
         BUFFER_SIZE / std::max(gTypeSizes[inType], gTypeSizes[outType]);
     size_t step = blockCount;
-    uint64_t lastCase = 1ULL << (8*gTypeSizes[ inType ]);
-    cl_event writeInputBuffer = NULL;
+    uint64_t lastCase = 1ULL << (8 * gTypeSizes[inType]);
 
-    memset( &writeInputBufferInfo, 0, sizeof( writeInputBufferInfo ) );
-    init_info.d = (MTdata*)malloc( threads * sizeof( MTdata ) );
-    if( NULL == init_info.d )
+    memset(&writeInputBufferInfo, 0, sizeof(writeInputBufferInfo));
+    init_info.d = (MTdata *)malloc(threads * sizeof(MTdata));
+    if (NULL == init_info.d)
     {
-        vlog_error( "ERROR: Unable to allocate storage for random number generator!\n" );
+        vlog_error(
+            "ERROR: Unable to allocate storage for random number generator!\n");
         return -1;
     }
-    for( i = 0; i < threads; i++ )
+    for (i = 0; i < threads; i++)
     {
-        init_info.d[i] = init_genrand( genrand_int32( d ) );
-        if( NULL == init_info.d[i] )
+        init_info.d[i] = init_genrand(genrand_int32(d));
+        if (NULL == init_info.d[i])
         {
-            vlog_error( "ERROR: Unable to allocate storage for random number generator!\n" );
+            vlog_error("ERROR: Unable to allocate storage for random number "
+                       "generator!\n");
             return -1;
         }
     }
@@ -1026,52 +1081,53 @@ static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMod
     writeInputBufferInfo.outType = outType;
     writeInputBufferInfo.inType = inType;
 
-    for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
+    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
     {
-        writeInputBufferInfo.calcInfo[vectorSize].program = MakeProgram( outType, inType, sat, round, vectorSize,
-                                                                        &writeInputBufferInfo.calcInfo[vectorSize].kernel );
-        if( NULL == writeInputBufferInfo.calcInfo[vectorSize].program )
+        writeInputBufferInfo.calcInfo[vectorSize].program =
+            MakeProgram(outType, inType, sat, round, vectorSize,
+                        &writeInputBufferInfo.calcInfo[vectorSize].kernel);
+        if (NULL == writeInputBufferInfo.calcInfo[vectorSize].program)
         {
             gFailCount++;
             return -1;
         }
-        if( NULL == writeInputBufferInfo.calcInfo[vectorSize].kernel )
+        if (NULL == writeInputBufferInfo.calcInfo[vectorSize].kernel)
         {
             gFailCount++;
-            vlog_error( "\t\tFAILED -- Failed to create kernel.\n" );
+            vlog_error("\t\tFAILED -- Failed to create kernel.\n");
             return -2;
         }
 
-        writeInputBufferInfo.calcInfo[vectorSize].parent = &writeInputBufferInfo;
+        writeInputBufferInfo.calcInfo[vectorSize].parent =
+            &writeInputBufferInfo;
         writeInputBufferInfo.calcInfo[vectorSize].vectorSize = vectorSize;
         writeInputBufferInfo.calcInfo[vectorSize].result = -1;
     }
 
-    if( gSkipTesting )
-        goto exit;
+    if (gSkipTesting) goto exit;
 
     // Patch up rounding mode if default is RTZ
-    // We leave the part above in default rounding mode so that the right kernel is compiled.
-    if( round == kDefaultRoundingMode && gIsRTZ && (outType == kfloat) )
+    // We leave the part above in default rounding mode so that the right kernel
+    // is compiled.
+    if (round == kDefaultRoundingMode && gIsRTZ && (outType == kfloat))
         init_info.round = round = kRoundTowardZero;
 
     // Figure out how many elements are in a work block
 
     // we handle 64-bit types a bit differently.
-    if( 8*gTypeSizes[ inType ] > 32 )
-        lastCase = 0x100000000ULL;
+    if (8 * gTypeSizes[inType] > 32) lastCase = 0x100000000ULL;
 
-    if ( !gWimpyMode && gIsEmbedded )
-      step = blockCount * EMBEDDED_REDUCTION_FACTOR;
+    if (!gWimpyMode && gIsEmbedded)
+        step = blockCount * EMBEDDED_REDUCTION_FACTOR;
 
-    if ( gWimpyMode )
-        step = (size_t)blockCount * (size_t)gWimpyReductionFactor;
-    vlog( "Testing... " );
+    if (gWimpyMode) step = (size_t)blockCount * (size_t)gWimpyReductionFactor;
+    vlog("Testing... ");
     fflush(stdout);
-    for( i = 0; i < (uint64_t)lastCase; i += step )
+    for (i = 0; i < (uint64_t)lastCase; i += step)
     {
 
-        if( 0 == ( i & ((lastCase >> 3) -1))) {
+        if (0 == (i & ((lastCase >> 3) - 1)))
+        {
             vlog(".");
             fflush(stdout);
         }
@@ -1079,53 +1135,61 @@ static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMod
         cl_uint count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i);
         writeInputBufferInfo.count = count;
 
-        // Crate a user event to represent the status of the reference value computation completion
-        writeInputBufferInfo.calcReferenceValues = clCreateUserEvent( gContext, &error);
-        if( error || NULL == writeInputBufferInfo.calcReferenceValues )
+        // Crate a user event to represent the status of the reference value
+        // computation completion
+        writeInputBufferInfo.calcReferenceValues =
+            clCreateUserEvent(gContext, &error);
+        if (error || NULL == writeInputBufferInfo.calcReferenceValues)
         {
-            vlog_error( "ERROR: Unable to create user event. (%d)\n", error );
+            vlog_error("ERROR: Unable to create user event. (%d)\n", error);
             gFailCount++;
             goto exit;
         }
 
         // retain for consumption by MapOutputBufferComplete
-        for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
+        for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
+             vectorSize++)
         {
-            if( (error = clRetainEvent(writeInputBufferInfo.calcReferenceValues) ))
+            if ((error =
+                     clRetainEvent(writeInputBufferInfo.calcReferenceValues)))
             {
-                vlog_error( "ERROR: Unable to retain user event. (%d)\n", error );
+                vlog_error("ERROR: Unable to retain user event. (%d)\n", error);
                 gFailCount++;
                 goto exit;
             }
         }
 
-        // Crate a user event to represent when the callbacks are done verifying correctness
-        writeInputBufferInfo.doneBarrier = clCreateUserEvent( gContext, &error);
-        if( error || NULL == writeInputBufferInfo.calcReferenceValues )
+        // Crate a user event to represent when the callbacks are done verifying
+        // correctness
+        writeInputBufferInfo.doneBarrier = clCreateUserEvent(gContext, &error);
+        if (error || NULL == writeInputBufferInfo.calcReferenceValues)
         {
-            vlog_error( "ERROR: Unable to create user event for barrier. (%d)\n", error );
+            vlog_error("ERROR: Unable to create user event for barrier. (%d)\n",
+                       error);
             gFailCount++;
             goto exit;
         }
 
         // retain for use by the callback that calls this
-        if( (error = clRetainEvent(writeInputBufferInfo.doneBarrier) ))
+        if ((error = clRetainEvent(writeInputBufferInfo.doneBarrier)))
         {
-            vlog_error( "ERROR: Unable to retain user event doneBarrier. (%d)\n", error );
+            vlog_error("ERROR: Unable to retain user event doneBarrier. (%d)\n",
+                       error);
             gFailCount++;
             goto exit;
         }
 
         //      Call this in a multithreaded manner
-        //      gInitFunctions[ inType ]( gIn, sat, round, outType, i, count, d );
+        //      gInitFunctions[ inType ]( gIn, sat, round, outType, i, count, d
+        //      );
         cl_uint chunks = RoundUpToNextPowerOfTwo(threads) * 2;
         init_info.start = i;
         init_info.size = count / chunks;
-        if( init_info.size < 16384 )
+        if (init_info.size < 16384)
         {
             chunks = RoundUpToNextPowerOfTwo(threads);
             init_info.size = count / chunks;
-            if( init_info.size < 16384 )
+            if (init_info.size < 16384)
             {
                 init_info.size = count;
                 chunks = 1;
@@ -1134,43 +1198,23 @@ static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMod
         ThreadPool_Do(InitData, chunks, &init_info);
 
         // Copy the results to the device
-        writeInputBuffer = NULL;
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, count * gTypeSizes[inType], gIn, 0, NULL, &writeInputBuffer )))
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
+                                          count * gTypeSizes[inType], gIn, 0,
+                                          NULL, NULL)))
         {
-            vlog_error( "ERROR: clEnqueueWriteBuffer failed. (%d)\n", error );
+            vlog_error("ERROR: clEnqueueWriteBuffer failed. (%d)\n", error);
             gFailCount++;
             goto exit;
         }
 
-        // Setup completion callback for the write, which will enqueue the rest of the work
-        // This is somewhat gratuitous.  Because this is an in order queue, we didn't really need to
-        // do this work in a callback. We could have done it from the main thread.  Here we are
-        // verifying that the implementation can enqueue work from a callback, while at the same time
-        // also checking to make sure that the conversions work.
-        //
-        // Because the verification code is also moved to a callback, it is hoped that implementations will
-        // achieve a test performance improvement because they can verify the results in parallel.  If the
-        // implementation serializes callbacks however, that won't happen.   Consider it some motivation
-        // to do the right thing! :-)
-        if( (error = clSetEventCallback( writeInputBuffer, CL_COMPLETE, WriteInputBufferComplete, &writeInputBufferInfo)) )
-        {
-            vlog_error( "ERROR: clSetEventCallback failed. (%d)\n", error );
-            gFailCount++;
-            goto exit;
-        }
-
-        // The event can't be destroyed until the callback is called, so we can release it now.
-        if( (error = clReleaseEvent(writeInputBuffer) ))
-        {
-            vlog_error( "ERROR: clReleaseEvent failed. (%d)\n", error );
-            gFailCount++;
-            goto exit;
-        }
+        // Call completion callback for the write, which will enqueue the rest
+        // of the work.
+        WriteInputBufferComplete((void *)&writeInputBufferInfo);
 
         // Make sure the work is actually running, so we don't deadlock
-        if( (error = clFlush( gQueue ) ) )
+        if ((error = clFlush(gQueue)))
         {
-            vlog_error( "clFlush failed with error %d\n", error );
+            vlog_error("clFlush failed with error %d\n", error);
             gFailCount++;
             goto exit;
         }
@@ -1178,77 +1222,91 @@ static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMod
         ThreadPool_Do(PrepareReference, chunks, &init_info);
 
         // signal we are done calculating the reference results
-        if( (error = clSetUserEventStatus( writeInputBufferInfo.calcReferenceValues, CL_COMPLETE ) ) )
+        if ((error = clSetUserEventStatus(
+                 writeInputBufferInfo.calcReferenceValues, CL_COMPLETE)))
         {
-            vlog_error( "Error:  Failed to set user event status to CL_COMPLETE:  %d\n", error );
+            vlog_error(
+                "Error:  Failed to set user event status to CL_COMPLETE:  %d\n",
+                error);
             gFailCount++;
             goto exit;
         }
 
         // Wait for the event callbacks to finish verifying correctness.
-        if( (error = clWaitForEvents( 1, (cl_event*) &writeInputBufferInfo.doneBarrier ) ))
+        if ((error = clWaitForEvents(
+                 1, (cl_event *)&writeInputBufferInfo.doneBarrier)))
         {
-            vlog_error( "Error:  Failed to wait for barrier:  %d\n", error );
+            vlog_error("Error:  Failed to wait for barrier:  %d\n", error);
             gFailCount++;
             goto exit;
         }
 
-        if( (error = clReleaseEvent(writeInputBufferInfo.calcReferenceValues ) ))
+        if ((error = clReleaseEvent(writeInputBufferInfo.calcReferenceValues)))
         {
-            vlog_error( "Error:  Failed to release calcReferenceValues:  %d\n", error );
+            vlog_error("Error:  Failed to release calcReferenceValues:  %d\n",
+                       error);
             gFailCount++;
             goto exit;
         }
 
-        if( (error = clReleaseEvent(writeInputBufferInfo.doneBarrier ) ))
+        if ((error = clReleaseEvent(writeInputBufferInfo.doneBarrier)))
         {
-            vlog_error( "Error:  Failed to release done barrier:  %d\n", error );
+            vlog_error("Error:  Failed to release done barrier:  %d\n", error);
             gFailCount++;
             goto exit;
         }
 
 
-        for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
+        for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
+             vectorSize++)
         {
-            if( ( error = writeInputBufferInfo.calcInfo[ vectorSize ].result ))
+            if ((error = writeInputBufferInfo.calcInfo[vectorSize].result))
             {
-                switch( inType )
+                switch (inType)
                 {
                     case kuchar:
                     case kchar:
-                        vlog( "Input value: 0x%2.2x ", ((unsigned char*)gIn)[error - 1] );
+                        vlog("Input value: 0x%2.2x ",
+                             ((unsigned char *)gIn)[error - 1]);
                         break;
                     case kushort:
                     case kshort:
-                        vlog( "Input value: 0x%4.4x ", ((unsigned short*)gIn)[error - 1] );
+                        vlog("Input value: 0x%4.4x ",
+                             ((unsigned short *)gIn)[error - 1]);
                         break;
                     case kuint:
                     case kint:
-                        vlog( "Input value: 0x%8.8x ", ((unsigned int*)gIn)[error - 1] );
+                        vlog("Input value: 0x%8.8x ",
+                             ((unsigned int *)gIn)[error - 1]);
                         break;
                     case kfloat:
-                        vlog( "Input value: %a ", ((float*)gIn)[error - 1] );
+                        vlog("Input value: %a ", ((float *)gIn)[error - 1]);
                         break;
                         break;
                     case kulong:
                     case klong:
-                        vlog( "Input value: 0x%16.16llx ", ((unsigned long long*)gIn)[error - 1] );
+                        vlog("Input value: 0x%16.16llx ",
+                             ((unsigned long long *)gIn)[error - 1]);
                         break;
                     case kdouble:
-                        vlog( "Input value: %a ", ((double*)gIn)[error - 1]);
+                        vlog("Input value: %a ", ((double *)gIn)[error - 1]);
                         break;
                     default:
-                        vlog_error( "Internal error at %s: %d\n", __FILE__, __LINE__ );
+                        vlog_error("Internal error at %s: %d\n", __FILE__,
+                                   __LINE__);
                         abort();
                         break;
                 }
 
                 // tell the user which conversion it was.
-                if( 0 == vectorSize )
-                    vlog( " (implicit scalar conversion from %s to %s)\n", gTypeNames[ inType ], gTypeNames[ outType ] );
+                if (0 == vectorSize)
+                    vlog(" (implicit scalar conversion from %s to %s)\n",
+                         gTypeNames[inType], gTypeNames[outType]);
                 else
-                    vlog( " (convert_%s%s%s%s( %s%s ))\n", gTypeNames[outType], sizeNames[vectorSize], gSaturationNames[ sat ],
-                                                            gRoundingModeNames[ round ], gTypeNames[inType], sizeNames[vectorSize] );
+                    vlog(" (convert_%s%s%s%s( %s%s ))\n", gTypeNames[outType],
+                         sizeNames[vectorSize], gSaturationNames[sat],
+                         gRoundingModeNames[round], gTypeNames[inType],
+                         sizeNames[vectorSize]);
 
                 gFailCount++;
                 goto exit;
@@ -1256,300 +1314,318 @@ static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMod
         }
     }
 
-    log_info( "done.\n" );
+    log_info("done.\n");
 
-    if( gTimeResults )
+    if (gTimeResults)
     {
-        //Kick off tests for the various vector lengths
-        for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
+        // Kick off tests for the various vector lengths
+        for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
+             vectorSize++)
         {
             size_t workItemCount = blockCount / vectorSizes[vectorSize];
-            if( vectorSizes[vectorSize] * gTypeSizes[outType] < 4 )
-                workItemCount /= 4 / (vectorSizes[vectorSize] * gTypeSizes[outType]);
+            if (vectorSizes[vectorSize] * gTypeSizes[outType] < 4)
+                workItemCount /=
+                    4 / (vectorSizes[vectorSize] * gTypeSizes[outType]);
 
             double sum = 0.0;
             double bestTime = INFINITY;
             cl_uint k;
-            for( k = 0; k < PERF_LOOP_COUNT; k++ )
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
                 uint64_t startTime = GetTime();
-                if( (error = RunKernel( writeInputBufferInfo.calcInfo[vectorSize].kernel, gInBuffer, gOutBuffers[ vectorSize ], workItemCount )) )
+                if ((error = RunKernel(
+                         writeInputBufferInfo.calcInfo[vectorSize].kernel,
+                         gInBuffer, gOutBuffers[vectorSize], workItemCount)))
                 {
                     gFailCount++;
                     goto exit;
                 }
 
                 // Make sure OpenCL is done
-                if( (error = clFinish(gQueue) ) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Error %d at clFinish\n", error );
+                    vlog_error("Error %d at clFinish\n", error);
                     goto exit;
                 }
 
                 uint64_t endTime = GetTime();
-                double time = SubtractTime( endTime, startTime );
+                double time = SubtractTime(endTime, startTime);
                 sum += time;
-                if( time < bestTime )
-                    bestTime = time;
-
+                if (time < bestTime) bestTime = time;
             }
 
-            if( gReportAverageTimes )
-                bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (workItemCount * vectorSizes[vectorSize]);
-            if( 0 == vectorSize )
-                vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "implicit convert %s -> %s", gTypeNames[ inType ], gTypeNames[ outType ] );
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (workItemCount * vectorSizes[vectorSize]);
+            if (0 == vectorSize)
+                vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element",
+                          "implicit convert %s -> %s", gTypeNames[inType],
+                          gTypeNames[outType]);
             else
-                vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "convert_%s%s%s%s( %s%s )", gTypeNames[ outType ], sizeNames[vectorSize], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType], sizeNames[vectorSize] );
+                vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element",
+                          "convert_%s%s%s%s( %s%s )", gTypeNames[outType],
+                          sizeNames[vectorSize], gSaturationNames[sat],
+                          gRoundingModeNames[round], gTypeNames[inType],
+                          sizeNames[vectorSize]);
         }
     }
 
-    if( gWimpyMode )
-        vlog( "\tWimp pass" );
+    if (gWimpyMode)
+        vlog("\tWimp pass");
     else
-        vlog( "\tpassed" );
+        vlog("\tpassed");
 
 #ifdef __APPLE__
     // record the run time
-    vlog( "\t(%f s)", 1e-9 * ( mach_absolute_time() - wall_start ) );
+    vlog("\t(%f s)", 1e-9 * (mach_absolute_time() - wall_start));
 #endif
-    vlog( "\n\n" );
-    fflush( stdout );
+    vlog("\n\n");
+    fflush(stdout);
 
 
 exit:
-    //clean up
-    for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
+    // clean up
+    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
     {
-        clReleaseProgram( writeInputBufferInfo.calcInfo[vectorSize].program );
-        clReleaseKernel( writeInputBufferInfo.calcInfo[vectorSize].kernel );
+        clReleaseProgram(writeInputBufferInfo.calcInfo[vectorSize].program);
+        clReleaseKernel(writeInputBufferInfo.calcInfo[vectorSize].kernel);
     }
 
-    if( init_info.d )
+    if (init_info.d)
     {
-        for( i = 0; i < threads; i++ )
-            free_mtdata(init_info.d[i]);
+        for (i = 0; i < threads; i++) free_mtdata(init_info.d[i]);
         free(init_info.d);
     }
 
     return error;
 }
 
-void CL_CALLBACK MapResultValuesComplete( cl_event e, cl_int status, void *data );
+void MapResultValuesComplete(void *data);
 
 // Note: not called reentrantly
-void CL_CALLBACK WriteInputBufferComplete( cl_event e, cl_int status, void *data )
+void WriteInputBufferComplete(void *data)
 {
-    WriteInputBufferInfo *info = (WriteInputBufferInfo*) data;
+    cl_int status;
+    WriteInputBufferInfo *info = (WriteInputBufferInfo *)data;
     cl_uint count = info->count;
     int vectorSize;
 
-    if( CL_SUCCESS != status )
-    {
-        vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status );
-        gFailCount++;
-        return;
-    }
-
     info->barrierCount = gMaxVectorSize - gMinVectorSize;
 
-    // now that we know that the write buffer is complete, enqueue callbacks to wait for the main thread to
-    // finish calculating the reference results.
-    for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
+    // now that we know that the write buffer is complete, enqueue callbacks to
+    // wait for the main thread to finish calculating the reference results.
+    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
     {
-        size_t workItemCount = (count + vectorSizes[vectorSize] - 1) / ( vectorSizes[vectorSize]);
-        cl_event mapComplete = NULL;
+        size_t workItemCount =
+            (count + vectorSizes[vectorSize] - 1) / (vectorSizes[vectorSize]);
 
-        if( (status = RunKernel( info->calcInfo[ vectorSize ].kernel, gInBuffer, gOutBuffers[ vectorSize ], workItemCount )) )
+        if ((status = RunKernel(info->calcInfo[vectorSize].kernel, gInBuffer,
+                                gOutBuffers[vectorSize], workItemCount)))
         {
             gFailCount++;
             return;
         }
 
-        info->calcInfo[vectorSize].p = clEnqueueMapBuffer( gQueue, gOutBuffers[ vectorSize ], CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
-                                                          0, count * gTypeSizes[ info->outType ], 0, NULL, &mapComplete, &status);
+        info->calcInfo[vectorSize].p = clEnqueueMapBuffer(
+            gQueue, gOutBuffers[vectorSize], CL_TRUE,
+            CL_MAP_READ | CL_MAP_WRITE, 0, count * gTypeSizes[info->outType], 0,
+            NULL, NULL, &status);
         {
-            if( status )
+            if (status)
             {
-                vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status );
+                vlog_error("ERROR: WriteInputBufferComplete calback failed "
+                           "with status: %d\n",
+                           status);
                 gFailCount++;
                 return;
             }
         }
+    }
 
-        if( (status = clSetEventCallback( mapComplete, CL_COMPLETE, MapResultValuesComplete, info->calcInfo + vectorSize)))
-        {
-            vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status );
-            gFailCount++;
-            return;
-        }
-
-        if( (status = clReleaseEvent(mapComplete)))
-        {
-            vlog_error( "ERROR: clReleaseEvent calback failed in WriteInputBufferComplete for vector size %d with status: %d\n", vectorSize, status );
-            gFailCount++;
-            return;
-        }
+    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
+    {
+        MapResultValuesComplete(info->calcInfo + vectorSize);
     }
 
     // Make sure the work starts moving -- otherwise we may deadlock
-    if( (status = clFlush(gQueue)))
+    if ((status = clFlush(gQueue)))
     {
-        vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status );
+        vlog_error(
+            "ERROR: WriteInputBufferComplete calback failed with status: %d\n",
+            status);
         gFailCount++;
         return;
     }
 
-    // e was already released by the main thread. It should be destroyed automatically soon after we exit.
+    // e was already released by the main thread. It should be destroyed
+    // automatically soon after we exit.
 }
 
-void CL_CALLBACK CalcReferenceValuesComplete( cl_event e, cl_int status, void *data );
+void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
+                                             void *data);
 
 // Note: May be called reentrantly
-void CL_CALLBACK MapResultValuesComplete( cl_event e, cl_int status, void *data )
+void MapResultValuesComplete(void *data)
 {
-    CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo*) data;
+    cl_int status;
+    CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo *)data;
     cl_event calcReferenceValues = info->parent->calcReferenceValues;
 
-    if( CL_SUCCESS != status )
+    // we know that the map is done, wait for the main thread to finish
+    // calculating the reference values
+    if ((status = clSetEventCallback(calcReferenceValues, CL_COMPLETE,
+                                     CalcReferenceValuesComplete, data)))
     {
-        vlog_error( "ERROR: MapResultValuesComplete calback failed with status: %d\n", status );
-        gFailCount++;       // not thread safe -- being lazy here
-        clReleaseEvent(calcReferenceValues);
-        return;
+        vlog_error("ERROR: clSetEventCallback failed in "
+                   "MapResultValuesComplete with status: %d\n",
+                   status);
+        gFailCount++; // not thread safe -- being lazy here
     }
 
-    // we know that the map is done, wait for the main thread to finish calculating the reference values
-    if( (status = clSetEventCallback( calcReferenceValues, CL_COMPLETE, CalcReferenceValuesComplete, data )))
+    // this thread no longer needs its reference to info->calcReferenceValues,
+    // so release it
+    if ((status = clReleaseEvent(calcReferenceValues)))
     {
-        vlog_error( "ERROR: clSetEventCallback failed in MapResultValuesComplete with status: %d\n", status );
-        gFailCount++;       // not thread safe -- being lazy here
-    }
-
-    // this thread no longer needs its reference to info->calcReferenceValues, so release it
-    if( (status = clReleaseEvent(calcReferenceValues) ))
-    {
-        vlog_error( "ERROR: clReleaseEvent(info->calcReferenceValues) failed with status: %d\n", status );
-        gFailCount++;       // not thread safe -- being lazy here
+        vlog_error("ERROR: clReleaseEvent(info->calcReferenceValues) failed "
+                   "with status: %d\n",
+                   status);
+        gFailCount++; // not thread safe -- being lazy here
     }
 
     // no need to flush since we didn't enqueue anything
 
-    // e was already released by WriteInputBufferComplete. It should be destroyed automatically soon after we exit.
+    // e was already released by WriteInputBufferComplete. It should be
+    // destroyed automatically soon after we exit.
 }
 
 
-void CL_CALLBACK CalcReferenceValuesComplete( cl_event e, cl_int status, void *data )
+void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
+                                             void *data)
 {
-    CalcReferenceValuesInfo     *info = (CalcReferenceValuesInfo*) data;
-    cl_uint                     vectorSize = info->vectorSize;
-    cl_uint                     count = info->parent->count;
-    Type                        outType = info->parent->outType;        // the data type of the conversion result
-    Type                        inType = info->parent->inType;          // the data type of the conversion input
-    size_t                      j;
-    cl_int                      error;
-    cl_event                    doneBarrier = info->parent->doneBarrier;
+    CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo *)data;
+    cl_uint vectorSize = info->vectorSize;
+    cl_uint count = info->parent->count;
+    Type outType =
+        info->parent->outType; // the data type of the conversion result
+    Type inType = info->parent->inType; // the data type of the conversion input
+    size_t j;
+    cl_int error;
+    cl_event doneBarrier = info->parent->doneBarrier;
 
     // report spurious error condition
-    if( CL_SUCCESS != status )
+    if (CL_SUCCESS != status)
     {
-        vlog_error( "ERROR: CalcReferenceValuesComplete did not succeed! (%d)\n", status );
-        gFailCount++;       // lazy about thread safety here
+        vlog_error("ERROR: CalcReferenceValuesComplete did not succeed! (%d)\n",
+                   status);
+        gFailCount++; // lazy about thread safety here
         return;
     }
 
-    // Now we know that both results have been mapped back from the device, and the
-    // main thread is done calculating the reference results. It is now time to check
-    // the results.
+    // Now we know that both results have been mapped back from the device, and
+    // the main thread is done calculating the reference results. It is now time
+    // to check the results.
 
     // verify results
     void *mapped = info->p;
 
-    //Patch up NaNs conversions to integer to zero -- these can be converted to any integer
-    if( outType != kfloat && outType != kdouble )
+    // Patch up NaNs conversions to integer to zero -- these can be converted to
+    // any integer
+    if (outType != kfloat && outType != kdouble)
     {
-        if( inType == kfloat )
+        if (inType == kfloat)
         {
-            float *inp = (float*) gIn;
-            for( j = 0; j < count; j++ )
+            float *inp = (float *)gIn;
+            for (j = 0; j < count; j++)
             {
-                if( isnan( inp[j] ) )
-                    memset( (char*) mapped + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] );
+                if (isnan(inp[j]))
+                    memset((char *)mapped + j * gTypeSizes[outType], 0,
+                           gTypeSizes[outType]);
             }
         }
-        if( inType == kdouble )
+        if (inType == kdouble)
         {
-            double *inp = (double*) gIn;
-            for( j = 0; j < count; j++ )
+            double *inp = (double *)gIn;
+            for (j = 0; j < count; j++)
             {
-                if( isnan( inp[j] ) )
-                    memset( (char*) mapped + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] );
+                if (isnan(inp[j]))
+                    memset((char *)mapped + j * gTypeSizes[outType], 0,
+                           gTypeSizes[outType]);
             }
         }
     }
-    else if( inType == kfloat || inType == kdouble )
-    {  // outtype and intype is float or double.  NaN conversions for float <-> double can be any NaN
-        if( inType == kfloat && outType == kdouble )
+    else if (inType == kfloat || inType == kdouble)
+    { // outtype and intype is float or double.  NaN conversions for float <->
+      // double can be any NaN
+        if (inType == kfloat && outType == kdouble)
         {
-            float *inp = (float*) gIn;
-            double *outp = (double*) mapped;
-            for( j = 0; j < count; j++ )
+            float *inp = (float *)gIn;
+            double *outp = (double *)mapped;
+            for (j = 0; j < count; j++)
             {
-                if( isnan( inp[j] ) && isnan(outp[j]) )
-                    outp[j] = NAN;
+                if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN;
             }
         }
-        if( inType == kdouble && outType == kfloat )
+        if (inType == kdouble && outType == kfloat)
         {
-            double *inp = (double*) gIn;
-            float *outp = (float*) mapped;
-            for( j = 0; j < count; j++ )
+            double *inp = (double *)gIn;
+            float *outp = (float *)mapped;
+            for (j = 0; j < count; j++)
             {
-                if( isnan( inp[j] ) && isnan(outp[j]) )
-                    outp[j] = NAN;
+                if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN;
             }
         }
     }
 
-    if( memcmp( mapped, gRef, count * gTypeSizes[ outType ] ) )
-        info->result = gCheckResults[outType]( mapped, gRef, gAllowZ, count, vectorSizes[vectorSize] );
+    if (memcmp(mapped, gRef, count * gTypeSizes[outType]))
+        info->result = gCheckResults[outType](mapped, gRef, gAllowZ, count,
+                                              vectorSizes[vectorSize]);
     else
         info->result = 0;
 
     // Fill the output buffer with junk and release it
     {
-        cl_uint pattern =  0xffffdead;
+        cl_uint pattern = 0xffffdead;
         memset_pattern4(mapped, &pattern, count * gTypeSizes[outType]);
-        if((error = clEnqueueUnmapMemObject(gQueue, gOutBuffers[ vectorSize ], mapped, 0, NULL, NULL)))
+        if ((error = clEnqueueUnmapMemObject(gQueue, gOutBuffers[vectorSize],
+                                             mapped, 0, NULL, NULL)))
         {
-            vlog_error( "ERROR: clEnqueueUnmapMemObject failed in CalcReferenceValuesComplete  (%d)\n", error );
+            vlog_error("ERROR: clEnqueueUnmapMemObject failed in "
+                       "CalcReferenceValuesComplete  (%d)\n",
+                       error);
             gFailCount++;
         }
     }
 
-    if( 1 == ThreadPool_AtomicAdd( &info->parent->barrierCount, -1) )
+    if (1 == ThreadPool_AtomicAdd(&info->parent->barrierCount, -1))
     {
-        if( (status = clSetUserEventStatus( doneBarrier, CL_COMPLETE) ))
+        if ((status = clSetUserEventStatus(doneBarrier, CL_COMPLETE)))
         {
-            vlog_error( "ERROR: clSetUserEventStatus failed in CalcReferenceValuesComplete (err: %d). We're probably going to deadlock.\n", status );
+            vlog_error("ERROR: clSetUserEventStatus failed in "
+                       "CalcReferenceValuesComplete (err: %d). We're probably "
+                       "going to deadlock.\n",
+                       status);
             gFailCount++;
             return;
         }
 
-        if( (status = clReleaseEvent( doneBarrier ) ) )
+        if ((status = clReleaseEvent(doneBarrier)))
         {
-            vlog_error( "ERROR: clReleaseEvent failed in CalcReferenceValuesComplete (err: %d).\n", status );
+            vlog_error("ERROR: clReleaseEvent failed in "
+                       "CalcReferenceValuesComplete (err: %d).\n",
+                       status);
             gFailCount++;
             return;
         }
     }
-
-
-    // e was already released by WriteInputBufferComplete. It should be destroyed automatically soon after
-    // all the calls to CalcReferenceValuesComplete exit.
+    // e was already released by WriteInputBufferComplete. It should be
+    // destroyed automatically soon after all the calls to
+    // CalcReferenceValuesComplete exit.
 }
 
-static cl_program   MakeProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, cl_kernel *outKernel )
+static cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
+                              RoundingMode round, int vectorSize,
+                              cl_kernel *outKernel)
 {
     cl_program program;
     char testName[256];
@@ -1559,7 +1635,8 @@ static cl_program   MakeProgram( Type outType, Type inType, SaturationMode sat,
     if (outType == kdouble || inType == kdouble)
         source << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
 
-    // Create the program. This is a bit complicated because we are trying to avoid byte and short stores.
+    // Create the program. This is a bit complicated because we are trying to
+    // avoid byte and short stores.
     if (0 == vectorSize)
     {
         // Create the type names.
@@ -1590,27 +1667,35 @@ static cl_program   MakeProgram( Type outType, Type inType, SaturationMode sat,
         char outName[32];
         switch (vectorSizetmp)
         {
-        case 1:
-            strncpy(inName, gTypeNames[inType], sizeof(inName));
-            strncpy(outName, gTypeNames[outType], sizeof(outName));
-            snprintf(convertString, sizeof(convertString), "convert_%s%s%s", outName, gSaturationNames[sat], gRoundingModeNames[round]);
-            snprintf(testName, 256, "test_%s_%s", convertString, inName);
-            vlog("Building %s( %s ) test\n", convertString, inName);
-            break;
-        case 3:
-            strncpy(inName, gTypeNames[inType], sizeof(inName));
-            strncpy(outName, gTypeNames[outType], sizeof(outName));
-            snprintf(convertString, sizeof(convertString), "convert_%s3%s%s", outName, gSaturationNames[sat], gRoundingModeNames[round]);
-            snprintf(testName, 256, "test_%s_%s3", convertString, inName);
-            vlog("Building %s( %s3 ) test\n", convertString, inName);
-            break;
-        default:
-            snprintf(inName, sizeof(inName), "%s%d", gTypeNames[inType], vectorSizetmp);
-            snprintf(outName, sizeof(outName), "%s%d", gTypeNames[outType], vectorSizetmp);
-            snprintf(convertString, sizeof(convertString), "convert_%s%s%s", outName, gSaturationNames[sat], gRoundingModeNames[round]);
-            snprintf(testName, 256, "test_%s_%s", convertString, inName);
-            vlog("Building %s( %s ) test\n", convertString, inName);
-            break;
+            case 1:
+                strncpy(inName, gTypeNames[inType], sizeof(inName));
+                strncpy(outName, gTypeNames[outType], sizeof(outName));
+                snprintf(convertString, sizeof(convertString), "convert_%s%s%s",
+                         outName, gSaturationNames[sat],
+                         gRoundingModeNames[round]);
+                snprintf(testName, 256, "test_%s_%s", convertString, inName);
+                vlog("Building %s( %s ) test\n", convertString, inName);
+                break;
+            case 3:
+                strncpy(inName, gTypeNames[inType], sizeof(inName));
+                strncpy(outName, gTypeNames[outType], sizeof(outName));
+                snprintf(convertString, sizeof(convertString),
+                         "convert_%s3%s%s", outName, gSaturationNames[sat],
+                         gRoundingModeNames[round]);
+                snprintf(testName, 256, "test_%s_%s3", convertString, inName);
+                vlog("Building %s( %s3 ) test\n", convertString, inName);
+                break;
+            default:
+                snprintf(inName, sizeof(inName), "%s%d", gTypeNames[inType],
+                         vectorSizetmp);
+                snprintf(outName, sizeof(outName), "%s%d", gTypeNames[outType],
+                         vectorSizetmp);
+                snprintf(convertString, sizeof(convertString), "convert_%s%s%s",
+                         outName, gSaturationNames[sat],
+                         gRoundingModeNames[round]);
+                snprintf(testName, 256, "test_%s_%s", convertString, inName);
+                vlog("Building %s( %s ) test\n", convertString, inName);
+                break;
         }
         fflush(stdout);
 
@@ -1650,8 +1735,7 @@ static cl_program   MakeProgram( Type outType, Type inType, SaturationMode sat,
     *outKernel = NULL;
 
     const char *flags = NULL;
-    if( gForceFTZ )
-        flags = "-cl-denorms-are-zero";
+    if (gForceFTZ) flags = "-cl-denorms-are-zero";
 
     // build it
     std::string sourceString = source.str();