diff --git a/test_conformance/half/Test_vStoreHalf.cpp b/test_conformance/half/Test_vStoreHalf.cpp
index 591470f0..efaceaf7 100644
--- a/test_conformance/half/Test_vStoreHalf.cpp
+++ b/test_conformance/half/Test_vStoreHalf.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -70,8 +70,7 @@ typedef struct CheckResultInfoD_
     int vsz;
 } CheckResultInfoD;
 
-static cl_int
-ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
+static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
 {
     ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
     cl_uint lim = cri->lim;
@@ -83,10 +82,10 @@ ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
     cl_ulong i = cri->i + off;
     cl_uint j;
 
-    if (off + count > lim)
-        count = lim - off;
+    if (off + count > lim) count = lim - off;
 
-    for (j = 0; j < count; ++j) {
+    for (j = 0; j < count; ++j)
+    {
         x[j] = as_float((cl_uint)(i + j));
         r[j] = f(x[j]);
     }
@@ -94,8 +93,7 @@ ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
     return 0;
 }
 
-static cl_int
-CheckF(cl_uint jid, cl_uint tid, void *userInfo)
+static cl_int CheckF(cl_uint jid, cl_uint tid, void *userInfo)
 {
     CheckResultInfoF *cri = (CheckResultInfoF *)userInfo;
     cl_uint lim = cri->lim;
@@ -106,33 +104,33 @@ CheckF(cl_uint jid, cl_uint tid, void *userInfo)
     const cl_ushort *s = cri->s + off;
     f2h f = cri->f;
     cl_uint j;
-    cl_ushort correct2 = f( 0.0f);
+    cl_ushort correct2 = f(0.0f);
     cl_ushort correct3 = f(-0.0f);
     cl_int ret = 0;
 
-    if (off + count > lim)
-        count = lim - off;
+    if (off + count > lim) count = lim - off;
 
-    if (!memcmp(r, s, count*sizeof(cl_ushort)))
-        return 0;
+    if (!memcmp(r, s, count * sizeof(cl_ushort))) return 0;
 
-    for (j = 0; j < count; j++) {
+    for (j = 0; j < count; j++)
+    {
         if (s[j] == r[j]) continue;
 
         // Pass any NaNs
-        if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00 )
-            continue;
+        if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00) continue;
 
         // retry per section 6.5.3.3
         if (IsFloatSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3))
             continue;
 
         // if reference result is subnormal, pass any zero
-        if (gIsEmbedded && IsHalfSubnormal(r[j]) && (s[j] == 0x0000 || s[j] == 0x8000))
+        if (gIsEmbedded && IsHalfSubnormal(r[j])
+            && (s[j] == 0x0000 || s[j] == 0x8000))
             continue;
 
-        vlog_error("\nFailure at [%u] with %.6a: *0x%04x vs 0x%04x,  vector_size = %d, address_space = %s\n",
-                   j+off, x[j], r[j], s[j], cri->vsz, cri->aspace);
+        vlog_error("\nFailure at [%u] with %.6a: *0x%04x vs 0x%04x,  "
+                   "vector_size = %d, address_space = %s\n",
+                   j + off, x[j], r[j], s[j], cri->vsz, cri->aspace);
 
         ret = 1;
         break;
@@ -141,8 +139,7 @@ CheckF(cl_uint jid, cl_uint tid, void *userInfo)
     return ret;
 }
 
-static cl_int
-ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
+static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
 {
     ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo;
     cl_uint lim = cri->lim;
@@ -154,10 +151,10 @@ ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
     cl_uint j;
     cl_ulong i = cri->i + off;
 
-    if (off + count > lim)
-        count = lim - off;
+    if (off + count > lim) count = lim - off;
 
-    for (j = 0; j < count; ++j) {
+    for (j = 0; j < count; ++j)
+    {
         x[j] = as_double(DoubleFromUInt((cl_uint)(i + j)));
         r[j] = f(x[j]);
     }
@@ -165,8 +162,7 @@ ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
     return 0;
 }
 
-static cl_int
-CheckD(cl_uint jid, cl_uint tid, void *userInfo)
+static cl_int CheckD(cl_uint jid, cl_uint tid, void *userInfo)
 {
     CheckResultInfoD *cri = (CheckResultInfoD *)userInfo;
     cl_uint lim = cri->lim;
@@ -177,35 +173,35 @@ CheckD(cl_uint jid, cl_uint tid, void *userInfo)
     const cl_ushort *s = cri->s + off;
     d2h f = cri->f;
     cl_uint j;
-    cl_ushort correct2 = f( 0.0);
+    cl_ushort correct2 = f(0.0);
     cl_ushort correct3 = f(-0.0);
     cl_int ret = 0;
 
-    if (off + count > lim)
-        count = lim - off;
+    if (off + count > lim) count = lim - off;
 
-    if (!memcmp(r, s, count*sizeof(cl_ushort)))
-        return 0;
+    if (!memcmp(r, s, count * sizeof(cl_ushort))) return 0;
 
-    for (j = 0; j < count; j++) {
+    for (j = 0; j < count; j++)
+    {
         if (s[j] == r[j]) continue;
 
         // Pass any NaNs
-        if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00)
-            continue;
+        if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00) continue;
 
         if (IsDoubleSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3))
             continue;
 
         // if reference result is subnormal, pass any zero result
-        if (gIsEmbedded && IsHalfSubnormal(r[j]) && (s[j] == 0x0000 || s[j] == 0x8000))
+        if (gIsEmbedded && IsHalfSubnormal(r[j])
+            && (s[j] == 0x0000 || s[j] == 0x8000))
             continue;
 
-        vlog_error("\nFailure at [%u] with %.13la: *0x%04x vs 0x%04x, vector_size = %d, address space = %s (double precision)\n",
-                   j+off, x[j], r[j], s[j], cri->vsz, cri->aspace);
+        vlog_error("\nFailure at [%u] with %.13la: *0x%04x vs 0x%04x, "
+                   "vector_size = %d, address space = %s (double precision)\n",
+                   j + off, x[j], r[j], s[j], cri->vsz, cri->aspace);
 
         ret = 1;
-    break;
+        break;
     }
 
     return ret;
@@ -251,100 +247,129 @@ static cl_half double2half_rtn(double f)
     return cl_half_from_double(f, CL_HALF_RTN);
 }
 
-int test_vstore_half( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_vstore_half(cl_device_id deviceID, cl_context context,
+                     cl_command_queue queue, int num_elements)
 {
     switch (get_default_rounding_mode(deviceID))
     {
         case CL_FP_ROUND_TO_ZERO:
-            return Test_vStoreHalf_private(deviceID, float2half_rtz, double2half_rte, "");
-        case 0:
-            return -1;
+            return Test_vStoreHalf_private(deviceID, float2half_rtz,
+                                           double2half_rte, "");
+        case 0: return -1;
         default:
-            return Test_vStoreHalf_private(deviceID, float2half_rte, double2half_rte, "");
+            return Test_vStoreHalf_private(deviceID, float2half_rte,
+                                           double2half_rte, "");
     }
 }
 
-int test_vstore_half_rte( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_vstore_half_rte(cl_device_id deviceID, cl_context context,
+                         cl_command_queue queue, int num_elements)
 {
-    return Test_vStoreHalf_private(deviceID, float2half_rte, double2half_rte, "_rte");
+    return Test_vStoreHalf_private(deviceID, float2half_rte, double2half_rte,
+                                   "_rte");
 }
 
-int test_vstore_half_rtz( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_vstore_half_rtz(cl_device_id deviceID, cl_context context,
+                         cl_command_queue queue, int num_elements)
 {
-    return Test_vStoreHalf_private(deviceID, float2half_rtz, double2half_rtz, "_rtz");
+    return Test_vStoreHalf_private(deviceID, float2half_rtz, double2half_rtz,
+                                   "_rtz");
 }
 
-int test_vstore_half_rtp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_vstore_half_rtp(cl_device_id deviceID, cl_context context,
+                         cl_command_queue queue, int num_elements)
 {
-    return Test_vStoreHalf_private(deviceID, float2half_rtp, double2half_rtp, "_rtp");
+    return Test_vStoreHalf_private(deviceID, float2half_rtp, double2half_rtp,
+                                   "_rtp");
 }
 
-int test_vstore_half_rtn( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_vstore_half_rtn(cl_device_id deviceID, cl_context context,
+                         cl_command_queue queue, int num_elements)
 {
-    return Test_vStoreHalf_private(deviceID, float2half_rtn, double2half_rtn, "_rtn");
+    return Test_vStoreHalf_private(deviceID, float2half_rtn, double2half_rtn,
+                                   "_rtn");
 }
 
-int test_vstorea_half( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_vstorea_half(cl_device_id deviceID, cl_context context,
+                      cl_command_queue queue, int num_elements)
 {
     switch (get_default_rounding_mode(deviceID))
     {
         case CL_FP_ROUND_TO_ZERO:
-            return Test_vStoreaHalf_private(deviceID,float2half_rtz, double2half_rte, "");
-        case 0:
-            return -1;
+            return Test_vStoreaHalf_private(deviceID, float2half_rtz,
+                                            double2half_rte, "");
+        case 0: return -1;
         default:
-            return Test_vStoreaHalf_private(deviceID, float2half_rte, double2half_rte, "");
+            return Test_vStoreaHalf_private(deviceID, float2half_rte,
+                                            double2half_rte, "");
     }
 }
 
-int test_vstorea_half_rte( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_vstorea_half_rte(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, int num_elements)
 {
-    return Test_vStoreaHalf_private(deviceID, float2half_rte, double2half_rte, "_rte");
+    return Test_vStoreaHalf_private(deviceID, float2half_rte, double2half_rte,
+                                    "_rte");
 }
 
-int test_vstorea_half_rtz( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_vstorea_half_rtz(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, int num_elements)
 {
-    return Test_vStoreaHalf_private(deviceID, float2half_rtz, double2half_rtz, "_rtz");
+    return Test_vStoreaHalf_private(deviceID, float2half_rtz, double2half_rtz,
+                                    "_rtz");
 }
 
-int test_vstorea_half_rtp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_vstorea_half_rtp(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, int num_elements)
 {
-    return Test_vStoreaHalf_private(deviceID, float2half_rtp, double2half_rtp, "_rtp");
+    return Test_vStoreaHalf_private(deviceID, float2half_rtp, double2half_rtp,
+                                    "_rtp");
 }
 
-int test_vstorea_half_rtn( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
+int test_vstorea_half_rtn(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, int num_elements)
 {
-    return Test_vStoreaHalf_private(deviceID, float2half_rtn, double2half_rtn, "_rtn");
+    return Test_vStoreaHalf_private(deviceID, float2half_rtn, double2half_rtn,
+                                    "_rtn");
 }
 
 #pragma mark -
 
-int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleReferenceFunc, const char *roundName )
+int Test_vStoreHalf_private(cl_device_id device, f2h referenceFunc,
+                            d2h doubleReferenceFunc, const char *roundName)
 {
     int vectorSize, error;
-    cl_program  programs[kVectorSizeCount+kStrangeVectorSizeCount][3];
-    cl_kernel   kernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
+    cl_program programs[kVectorSizeCount + kStrangeVectorSizeCount][3];
+    cl_kernel kernels[kVectorSizeCount + kStrangeVectorSizeCount][3];
+    cl_program resetProgram;
+    cl_kernel resetKernel;
 
-    uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
-    uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
-    memset( min_time, -1, sizeof( min_time ) );
-    cl_program  doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount][3];
-    cl_kernel   doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
-    uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
-    uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
-    memset( min_double_time, -1, sizeof( min_double_time ) );
+    uint64_t time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
+    uint64_t min_time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
+    memset(min_time, -1, sizeof(min_time));
+    cl_program doublePrograms[kVectorSizeCount + kStrangeVectorSizeCount][3];
+    cl_kernel doubleKernels[kVectorSizeCount + kStrangeVectorSizeCount][3];
+    uint64_t doubleTime[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
+    uint64_t min_double_time[kVectorSizeCount + kStrangeVectorSizeCount] = {
+        0
+    };
+    memset(min_double_time, -1, sizeof(min_double_time));
 
-    bool aligned= false;
+    bool aligned = false;
 
-    for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
+    for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
+         vectorSize++)
     {
-        const char *source[] = {
-            "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
-            "{\n"
-            "   size_t i = get_global_id(0);\n"
-            "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
-            "}\n"
-        };
+        const char *source[] = { "__kernel void test( __global float",
+                                 vector_size_name_extensions[vectorSize],
+                                 " *p, __global half *f )\n"
+                                 "{\n"
+                                 "   size_t i = get_global_id(0);\n"
+                                 "   vstore_half",
+                                 vector_size_name_extensions[vectorSize],
+                                 roundName,
+                                 "( p[i], i, f );\n"
+                                 "}\n" };
 
         const char *source_v3[] = {
             "__kernel void test( __global float *p, __global half *f,\n"
@@ -356,21 +381,29 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR
             "   if(last_i == i && extra_last_thread != 0) {\n"
             "     adjust = 3-extra_last_thread;\n"
             "   } "
-            "   vstore_half3",roundName,"( vload3(i, p-adjust), i, f-adjust );\n"
+            "   vstore_half3",
+            roundName,
+            "( vload3(i, p-adjust), i, f-adjust );\n"
             "}\n"
         };
 
         const char *source_private_store[] = {
-            "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
+            "__kernel void test( __global float",
+            vector_size_name_extensions[vectorSize],
+            " *p, __global half *f )\n"
             "{\n"
             "   __private ushort data[16];\n"
             "   size_t i = get_global_id(0);\n"
             "   size_t offset = 0;\n"
             "   size_t vecsize = vec_step(p[i]);\n"
-            "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], 0, (__private half *)(&data[0]) );\n"
+            "   vstore_half",
+            vector_size_name_extensions[vectorSize],
+            roundName,
+            "( p[i], 0, (__private half *)(&data[0]) );\n"
             "   for(offset = 0; offset < vecsize; offset++)\n"
             "   {\n"
-            "       vstore_half(vload_half(offset, (__private half *)data), 0, &f[vecsize*i+offset]);\n"
+            "       vstore_half(vload_half(offset, (__private half *)data), 0, "
+            "&f[vecsize*i+offset]);\n"
             "   }\n"
             "}\n"
         };
@@ -388,10 +421,13 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR
             "   if(last_i == i && extra_last_thread != 0) {\n"
             "     adjust = 3-extra_last_thread;\n"
             "   } "
-            "   vstore_half3",roundName,"( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
+            "   vstore_half3",
+            roundName,
+            "( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
             "   for(offset = 0; offset < 3; offset++)\n"
             "   {\n"
-            "       vstore_half(vload_half(offset, (__private half *) data), 0, &f[3*i+offset-adjust]);\n"
+            "       vstore_half(vload_half(offset, (__private half *) data), "
+            "0, &f[3*i+offset-adjust]);\n"
             "   }\n"
             "}\n"
         };
@@ -401,17 +437,26 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR
 
 
         const char *source_local_store[] = {
-            "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
+            "__kernel void test( __global float",
+            vector_size_name_extensions[vectorSize],
+            " *p, __global half *f )\n"
             "{\n"
-            "   __local ushort data[16*", local_buf_size, "];\n"
+            "   __local ushort data[16*",
+            local_buf_size,
+            "];\n"
             "   size_t i = get_global_id(0);\n"
             "   size_t lid = get_local_id(0);\n"
             "   size_t lsize = get_local_size(0);\n"
             "   size_t vecsize = vec_step(p[0]);\n"
             "   event_t async_event;\n"
-            "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], lid, (__local half *)(&data[0]) );\n"
+            "   vstore_half",
+            vector_size_name_extensions[vectorSize],
+            roundName,
+            "( p[i], lid, (__local half *)(&data[0]) );\n"
             "   barrier( CLK_LOCAL_MEM_FENCE ); \n"
-            "   async_event = async_work_group_copy((__global ushort *)f+vecsize*(i-lid), (__local ushort *)(&data[0]), vecsize*lsize, 0);\n" // investigate later
+            "   async_event = async_work_group_copy((__global ushort "
+            "*)f+vecsize*(i-lid), (__local ushort *)(&data[0]), vecsize*lsize, "
+            "0);\n" // investigate later
             "   wait_group_events(1, &async_event);\n"
             "}\n"
         };
@@ -450,25 +495,36 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR
 
         const char *double_source[] = {
             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
-            "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
+            "__kernel void test( __global double",
+            vector_size_name_extensions[vectorSize],
+            " *p, __global half *f )\n"
             "{\n"
             "   size_t i = get_global_id(0);\n"
-            "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
+            "   vstore_half",
+            vector_size_name_extensions[vectorSize],
+            roundName,
+            "( p[i], i, f );\n"
             "}\n"
         };
 
         const char *double_source_private_store[] = {
             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
-            "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
+            "__kernel void test( __global double",
+            vector_size_name_extensions[vectorSize],
+            " *p, __global half *f )\n"
             "{\n"
             "   __private ushort data[16];\n"
             "   size_t i = get_global_id(0);\n"
             "   size_t offset = 0;\n"
             "   size_t vecsize = vec_step(p[i]);\n"
-            "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], 0, (__private half *)(&data[0]) );\n"
+            "   vstore_half",
+            vector_size_name_extensions[vectorSize],
+            roundName,
+            "( p[i], 0, (__private half *)(&data[0]) );\n"
             "   for(offset = 0; offset < vecsize; offset++)\n"
             "   {\n"
-            "       vstore_half(vload_half(offset, (__private half *)data), 0, &f[vecsize*i+offset]);\n"
+            "       vstore_half(vload_half(offset, (__private half *)data), 0, "
+            "&f[vecsize*i+offset]);\n"
             "   }\n"
             "}\n"
         };
@@ -476,17 +532,26 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR
 
         const char *double_source_local_store[] = {
             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
-            "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
+            "__kernel void test( __global double",
+            vector_size_name_extensions[vectorSize],
+            " *p, __global half *f )\n"
             "{\n"
-            "   __local ushort data[16*", local_buf_size, "];\n"
+            "   __local ushort data[16*",
+            local_buf_size,
+            "];\n"
             "   size_t i = get_global_id(0);\n"
             "   size_t lid = get_local_id(0);\n"
             "   size_t vecsize = vec_step(p[0]);\n"
             "   size_t lsize = get_local_size(0);\n"
             "   event_t async_event;\n"
-            "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], lid, (__local half *)(&data[0]) );\n"
+            "   vstore_half",
+            vector_size_name_extensions[vectorSize],
+            roundName,
+            "( p[i], lid, (__local half *)(&data[0]) );\n"
             "   barrier( CLK_LOCAL_MEM_FENCE ); \n"
-            "   async_event = async_work_group_copy((__global ushort *)(f+vecsize*(i-lid)), (__local ushort *)(&data[0]), vecsize*lsize, 0);\n" // investigate later
+            "   async_event = async_work_group_copy((__global ushort "
+            "*)(f+vecsize*(i-lid)), (__local ushort *)(&data[0]), "
+            "vecsize*lsize, 0);\n" // investigate later
             "   wait_group_events(1, &async_event);\n"
             "}\n"
         };
@@ -503,7 +568,9 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR
             "   if(last_i == i && extra_last_thread != 0) {\n"
             "     adjust = 3-extra_last_thread;\n"
             "   } "
-            "   vstore_half3",roundName,"( vload3(i,p-adjust), i, f -adjust);\n"
+            "   vstore_half3",
+            roundName,
+            "( vload3(i,p-adjust), i, f -adjust);\n"
             "}\n"
         };
 
@@ -520,10 +587,13 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR
             "   if(last_i == i && extra_last_thread != 0) {\n"
             "     adjust = 3-extra_last_thread;\n"
             "   } "
-            "   vstore_half3",roundName,"( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
+            "   vstore_half3",
+            roundName,
+            "( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
             "   for(offset = 0; offset < 3; offset++)\n"
             "   {\n"
-            "       vstore_half(vload_half(offset, (__private half *)data), 0, &f[3*i+offset-adjust]);\n"
+            "       vstore_half(vload_half(offset, (__private half *)data), 0, "
+            "&f[3*i+offset-adjust]);\n"
             "   }\n"
             "}\n"
         };
@@ -562,151 +632,235 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR
         };
 
 
-        if(g_arrVecSizes[vectorSize] == 3) {
-            programs[vectorSize][0] = MakeProgram( device, source_v3, sizeof(source_v3) / sizeof( source_v3[0]) );
-        } else {
-            programs[vectorSize][0] = MakeProgram( device, source, sizeof(source) / sizeof( source[0]) );
+        if (g_arrVecSizes[vectorSize] == 3)
+        {
+            programs[vectorSize][0] = MakeProgram(
+                device, source_v3, sizeof(source_v3) / sizeof(source_v3[0]));
         }
-        if( NULL == programs[ vectorSize ][0] )
+        else
+        {
+            programs[vectorSize][0] =
+                MakeProgram(device, source, sizeof(source) / sizeof(source[0]));
+        }
+        if (NULL == programs[vectorSize][0])
         {
             gFailCount++;
             return -1;
         }
 
-        kernels[ vectorSize ][0] = clCreateKernel( programs[ vectorSize ][0], "test", &error );
-        if( NULL == kernels[vectorSize][0] )
+        kernels[vectorSize][0] =
+            clCreateKernel(programs[vectorSize][0], "test", &error);
+        if (NULL == kernels[vectorSize][0])
         {
             gFailCount++;
-            vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
+            vlog_error("\t\tFAILED -- Failed to create kernel. (%d)\n", error);
             return error;
         }
 
-        if(g_arrVecSizes[vectorSize] == 3) {
-            programs[vectorSize][1] = MakeProgram( device, source_private_store_v3, sizeof(source_private_store_v3) / sizeof( source_private_store_v3[0]) );
-        } else {
-            programs[vectorSize][1] = MakeProgram( device, source_private_store, sizeof(source_private_store) / sizeof( source_private_store[0]) );
+        if (g_arrVecSizes[vectorSize] == 3)
+        {
+            programs[vectorSize][1] =
+                MakeProgram(device, source_private_store_v3,
+                            sizeof(source_private_store_v3)
+                                / sizeof(source_private_store_v3[0]));
         }
-        if( NULL == programs[ vectorSize ][1] )
+        else
+        {
+            programs[vectorSize][1] = MakeProgram(
+                device, source_private_store,
+                sizeof(source_private_store) / sizeof(source_private_store[0]));
+        }
+        if (NULL == programs[vectorSize][1])
         {
             gFailCount++;
             return -1;
         }
 
-        kernels[ vectorSize ][1] = clCreateKernel( programs[ vectorSize ][1], "test", &error );
-        if( NULL == kernels[vectorSize][1] )
+        kernels[vectorSize][1] =
+            clCreateKernel(programs[vectorSize][1], "test", &error);
+        if (NULL == kernels[vectorSize][1])
         {
             gFailCount++;
-            vlog_error( "\t\tFAILED -- Failed to create private kernel. (%d)\n", error );
+            vlog_error("\t\tFAILED -- Failed to create private kernel. (%d)\n",
+                       error);
             return error;
         }
 
-        if(g_arrVecSizes[vectorSize] == 3) {
-            programs[vectorSize][2] = MakeProgram( device, source_local_store_v3, sizeof(source_local_store_v3) / sizeof( source_local_store_v3[0]) );
-            if(  NULL == programs[ vectorSize ][2] )
+        if (g_arrVecSizes[vectorSize] == 3)
+        {
+            programs[vectorSize][2] =
+                MakeProgram(device, source_local_store_v3,
+                            sizeof(source_local_store_v3)
+                                / sizeof(source_local_store_v3[0]));
+            if (NULL == programs[vectorSize][2])
             {
                 unsigned q;
-                for ( q= 0; q < sizeof( source_local_store_v3) / sizeof( source_local_store_v3[0]); q++)
+                for (q = 0; q < sizeof(source_local_store_v3)
+                         / sizeof(source_local_store_v3[0]);
+                     q++)
                     vlog_error("%s", source_local_store_v3[q]);
 
                 gFailCount++;
                 return -1;
-
             }
-        } else {
-            programs[vectorSize][2] = MakeProgram( device, source_local_store, sizeof(source_local_store) / sizeof( source_local_store[0]) );
-            if( NULL == programs[ vectorSize ][2] )
+        }
+        else
+        {
+            programs[vectorSize][2] = MakeProgram(
+                device, source_local_store,
+                sizeof(source_local_store) / sizeof(source_local_store[0]));
+            if (NULL == programs[vectorSize][2])
             {
                 unsigned q;
-                for ( q= 0; q < sizeof( source_local_store) / sizeof( source_local_store[0]); q++)
+                for (q = 0; q < sizeof(source_local_store)
+                         / sizeof(source_local_store[0]);
+                     q++)
                     vlog_error("%s", source_local_store[q]);
 
                 gFailCount++;
                 return -1;
-
             }
         }
 
-        kernels[ vectorSize ][2] = clCreateKernel( programs[ vectorSize ][2], "test", &error );
-        if( NULL == kernels[vectorSize][2] )
+        kernels[vectorSize][2] =
+            clCreateKernel(programs[vectorSize][2], "test", &error);
+        if (NULL == kernels[vectorSize][2])
         {
             gFailCount++;
-            vlog_error( "\t\tFAILED -- Failed to create local kernel. (%d)\n", error );
+            vlog_error("\t\tFAILED -- Failed to create local kernel. (%d)\n",
+                       error);
             return error;
         }
 
-        if( gTestDouble )
+        if (gTestDouble)
         {
-            if(g_arrVecSizes[vectorSize] == 3) {
-                doublePrograms[vectorSize][0] = MakeProgram( device, double_source_v3, sizeof(double_source_v3) / sizeof( double_source_v3[0]) );
-            } else {
-                doublePrograms[vectorSize][0] = MakeProgram( device, double_source, sizeof(double_source) / sizeof( double_source[0]) );
-            }
-            if( NULL == doublePrograms[ vectorSize ][0] )
+            if (g_arrVecSizes[vectorSize] == 3)
             {
-                gFailCount++;
-                return -1;
+                doublePrograms[vectorSize][0] = MakeProgram(
+                    device, double_source_v3,
+                    sizeof(double_source_v3) / sizeof(double_source_v3[0]));
             }
-
-            doubleKernels[ vectorSize ][0] = clCreateKernel( doublePrograms[ vectorSize ][0], "test", &error );
-            if( NULL == kernels[vectorSize][0] )
-            {
-                gFailCount++;
-                vlog_error( "\t\tFAILED -- Failed to create double kernel. (%d)\n", error );
-                return error;
-            }
-
-            if(g_arrVecSizes[vectorSize] == 3)
-                doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private_store_v3, sizeof(double_source_private_store_v3) / sizeof( double_source_private_store_v3[0]) );
             else
-                doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private_store, sizeof(double_source_private_store) / sizeof( double_source_private_store[0]) );
-
-            if( NULL == doublePrograms[ vectorSize ][1] )
+            {
+                doublePrograms[vectorSize][0] = MakeProgram(
+                    device, double_source,
+                    sizeof(double_source) / sizeof(double_source[0]));
+            }
+            if (NULL == doublePrograms[vectorSize][0])
             {
                 gFailCount++;
                 return -1;
             }
 
-            doubleKernels[ vectorSize ][1] = clCreateKernel( doublePrograms[ vectorSize ][1], "test", &error );
-            if( NULL == kernels[vectorSize][1] )
+            doubleKernels[vectorSize][0] =
+                clCreateKernel(doublePrograms[vectorSize][0], "test", &error);
+            if (NULL == kernels[vectorSize][0])
             {
                 gFailCount++;
-                vlog_error( "\t\tFAILED -- Failed to create double private kernel. (%d)\n", error );
+                vlog_error(
+                    "\t\tFAILED -- Failed to create double kernel. (%d)\n",
+                    error);
                 return error;
             }
 
-            if(g_arrVecSizes[vectorSize] == 3) {
-                doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local_store_v3, sizeof(double_source_local_store_v3) / sizeof( double_source_local_store_v3[0]) );
-            } else {
-                doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local_store, sizeof(double_source_local_store) / sizeof( double_source_local_store[0]) );
-            }
-            if( NULL == doublePrograms[ vectorSize ][2] )
+            if (g_arrVecSizes[vectorSize] == 3)
+                doublePrograms[vectorSize][1] = MakeProgram(
+                    device, double_source_private_store_v3,
+                    sizeof(double_source_private_store_v3)
+                        / sizeof(double_source_private_store_v3[0]));
+            else
+                doublePrograms[vectorSize][1] =
+                    MakeProgram(device, double_source_private_store,
+                                sizeof(double_source_private_store)
+                                    / sizeof(double_source_private_store[0]));
+
+            if (NULL == doublePrograms[vectorSize][1])
             {
                 gFailCount++;
                 return -1;
             }
 
-            doubleKernels[ vectorSize ][2] = clCreateKernel( doublePrograms[ vectorSize ][2], "test", &error );
-            if( NULL == kernels[vectorSize][2] )
+            doubleKernels[vectorSize][1] =
+                clCreateKernel(doublePrograms[vectorSize][1], "test", &error);
+            if (NULL == kernels[vectorSize][1])
             {
                 gFailCount++;
-                vlog_error( "\t\tFAILED -- Failed to create double local kernel. (%d)\n", error );
+                vlog_error("\t\tFAILED -- Failed to create double private "
+                           "kernel. (%d)\n",
+                           error);
+                return error;
+            }
+
+            if (g_arrVecSizes[vectorSize] == 3)
+            {
+                doublePrograms[vectorSize][2] =
+                    MakeProgram(device, double_source_local_store_v3,
+                                sizeof(double_source_local_store_v3)
+                                    / sizeof(double_source_local_store_v3[0]));
+            }
+            else
+            {
+                doublePrograms[vectorSize][2] =
+                    MakeProgram(device, double_source_local_store,
+                                sizeof(double_source_local_store)
+                                    / sizeof(double_source_local_store[0]));
+            }
+            if (NULL == doublePrograms[vectorSize][2])
+            {
+                gFailCount++;
+                return -1;
+            }
+
+            doubleKernels[vectorSize][2] =
+                clCreateKernel(doublePrograms[vectorSize][2], "test", &error);
+            if (NULL == kernels[vectorSize][2])
+            {
+                gFailCount++;
+                vlog_error("\t\tFAILED -- Failed to create double local "
+                           "kernel. (%d)\n",
+                           error);
                 return error;
             }
         }
     } // end for vector size
 
+    const char *reset[] = {
+        "__kernel void reset( __global float *p, __global ushort *f,\n"
+        "                   uint extra_last_thread)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   *(f + i) = 0xdead;"
+        "}\n"
+    };
+
+    if (!gHostReset)
+    {
+        resetProgram =
+            MakeProgram(device, reset, sizeof(reset) / sizeof(reset[0]));
+        if (NULL == resetProgram)
+        {
+            gFailCount++;
+            return -1;
+        }
+        resetKernel = clCreateKernel(resetProgram, "reset", &error);
+        if (NULL == resetKernel)
+        {
+            gFailCount++;
+            return -1;
+        }
+    }
+
     // Figure out how many elements are in a work block
     size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float));
     size_t blockCount = BUFFER_SIZE / elementSize; // elementSize is power of 2
-    uint64_t lastCase = 1ULL << (8*sizeof(float)); // number of floats.
+    uint64_t lastCase = 1ULL << (8 * sizeof(float)); // number of floats.
     size_t stride = blockCount;
 
     if (gWimpyMode)
         stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;
 
     // we handle 64-bit types a bit differently.
-    if( lastCase == 0 )
-        lastCase = 0x100000000ULL;
+    if (lastCase == 0) lastCase = 0x100000000ULL;
 
     uint64_t i, j;
     error = 0;
@@ -746,7 +900,7 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR
     dchk.lim = blockCount;
     dchk.count = (blockCount + threadCount - 1) / threadCount;
 
-    for( i = 0; i < lastCase; i += stride )
+    for (i = 0; i < lastCase; i += stride)
     {
         count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i);
         fref.i = i;
@@ -755,50 +909,71 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR
         // Compute the input and reference
         ThreadPool_Do(ReferenceF, threadCount, &fref);
 
-        error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0, count * sizeof(float ), gIn_single, 0, NULL, NULL);
-        if (error) {
-            vlog_error( "Failure in clWriteBuffer\n" );
+        error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0,
+                                     count * sizeof(float), gIn_single, 0, NULL,
+                                     NULL);
+        if (error)
+        {
+            vlog_error("Failure in clWriteBuffer\n");
             gFailCount++;
             goto exit;
         }
 
-        if (gTestDouble) {
+        if (gTestDouble)
+        {
             ThreadPool_Do(ReferenceD, threadCount, &dref);
 
-            error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0, count * sizeof(double ), gIn_double, 0, NULL, NULL);
-            if (error) {
-                vlog_error( "Failure in clWriteBuffer\n" );
+            error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0,
+                                         count * sizeof(double), gIn_double, 0,
+                                         NULL, NULL);
+            if (error)
+            {
+                vlog_error("Failure in clWriteBuffer\n");
                 gFailCount++;
                 goto exit;
             }
         }
 
-        for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) {
+        for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
+             vectorSize++)
+        {
             // Loop through vector sizes
             fchk.vsz = g_arrVecSizes[vectorSize];
             dchk.vsz = g_arrVecSizes[vectorSize];
 
-            for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
+            for (addressSpace = 0; addressSpace < 3; addressSpace++)
+            {
                 // Loop over address spaces
                 fchk.aspace = addressSpaceNames[addressSpace];
                 dchk.aspace = addressSpaceNames[addressSpace];
 
-                cl_uint pattern = 0xdeaddead;
-                memset_pattern4( gOut_half, &pattern, BUFFER_SIZE/2);
+                if (!gHostReset)
+                {
+                    error = RunKernel(device, resetKernel, gInBuffer_single,
+                                      gOutBuffer_half, count, 0);
+                }
+                else
+                {
+                    cl_uint pattern = 0xdeaddead;
+                    memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2);
 
-                error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_FALSE,
-                                             0, count * sizeof(cl_half),
-                                             gOut_half, 0, NULL, NULL);
-                if (error) {
-                    vlog_error( "Failure in clWriteArray\n" );
+                    error = clEnqueueWriteBuffer(
+                        gQueue, gOutBuffer_half, CL_FALSE, 0,
+                        count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
+                }
+                if (error)
+                {
+                    vlog_error("Failure in clWriteArray\n");
                     gFailCount++;
                     goto exit;
                 }
 
-                error = RunKernel(device, kernels[vectorSize][addressSpace], gInBuffer_single, gOutBuffer_half,
-                                       numVecs(count, vectorSize, aligned) ,
+                error = RunKernel(device, kernels[vectorSize][addressSpace],
+                                  gInBuffer_single, gOutBuffer_half,
+                                  numVecs(count, vectorSize, aligned),
                                   runsOverBy(count, vectorSize, aligned));
-                if (error) {
+                if (error)
+                {
                     gFailCount++;
                     goto exit;
                 }
@@ -806,34 +981,51 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR
                 error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0,
                                             count * sizeof(cl_half), gOut_half,
                                             0, NULL, NULL);
-                if (error) {
-                    vlog_error( "Failure in clReadArray\n" );
+                if (error)
+                {
+                    vlog_error("Failure in clReadArray\n");
                     gFailCount++;
                     goto exit;
                 }
 
                 error = ThreadPool_Do(CheckF, threadCount, &fchk);
-                if (error) {
-                            gFailCount++;
-                            goto exit;
-                        }
+                if (error)
+                {
+                    gFailCount++;
+                    goto exit;
+                }
 
-                if (gTestDouble) {
-                    memset_pattern4( gOut_half, &pattern, BUFFER_SIZE/2);
+                if (gTestDouble)
+                {
 
-                    error = clEnqueueWriteBuffer(
-                        gQueue, gOutBuffer_half, CL_FALSE, 0,
-                        count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
-                    if (error) {
-                        vlog_error( "Failure in clWriteArray\n" );
+                    if (!gHostReset)
+                    {
+                        error = RunKernel(device, resetKernel, gInBuffer_double,
+                                          gOutBuffer_half, count, 0);
+                    }
+                    else
+                    {
+                        cl_uint pattern = 0xdeaddead;
+                        memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2);
+
+                        error = clEnqueueWriteBuffer(
+                            gQueue, gOutBuffer_half, CL_FALSE, 0,
+                            count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
+                    }
+                    if (error)
+                    {
+                        vlog_error("Failure in clWriteArray\n");
                         gFailCount++;
                         goto exit;
                     }
 
-                    error = RunKernel(device, doubleKernels[vectorSize][addressSpace], gInBuffer_double, gOutBuffer_half,
+                    error = RunKernel(device,
+                                      doubleKernels[vectorSize][addressSpace],
+                                      gInBuffer_double, gOutBuffer_half,
                                       numVecs(count, vectorSize, aligned),
                                       runsOverBy(count, vectorSize, aligned));
-                    if (error) {
+                    if (error)
+                    {
                         gFailCount++;
                         goto exit;
                     }
@@ -841,148 +1033,185 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR
                     error = clEnqueueReadBuffer(
                         gQueue, gOutBuffer_half, CL_TRUE, 0,
                         count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
-                    if (error) {
-                        vlog_error( "Failure in clReadArray\n" );
+                    if (error)
+                    {
+                        vlog_error("Failure in clReadArray\n");
                         gFailCount++;
                         goto exit;
                     }
 
+
                     error = ThreadPool_Do(CheckD, threadCount, &dchk);
-                    if (error) {
-                                gFailCount++;
-                                goto exit;
-                            }
-                        }
+                    if (error)
+                    {
+                        gFailCount++;
+                        goto exit;
                     }
                 }
-
-        if( ((i+blockCount) & ~printMask) == (i+blockCount) )
-        {
-            vlog( "." );
-            fflush( stdout );
+            }
         }
-    }  // end last case
+
+        if (((i + blockCount) & ~printMask) == (i + blockCount))
+        {
+            vlog(".");
+            fflush(stdout);
+        }
+    } // end last case
 
     loopCount = count == blockCount ? 1 : 100;
-    if( gReportTimes )
+    if (gReportTimes)
     {
-        //Init the input stream
+        // Init the input stream
         cl_float *p = (cl_float *)gIn_single;
-        for( j = 0; j < count; j++ )
-            p[j] = (float)((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
+        for (j = 0; j < count; j++)
+            p[j] = (float)((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2));
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0, count * sizeof( float ), gIn_single, 0, NULL, NULL)) )
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0,
+                                          count * sizeof(float), gIn_single, 0,
+                                          NULL, NULL)))
         {
-            vlog_error( "Failure in clWriteArray\n" );
+            vlog_error("Failure in clWriteArray\n");
             gFailCount++;
             goto exit;
         }
 
-        if( gTestDouble )
+        if (gTestDouble)
         {
-            //Init the input stream
+            // Init the input stream
             cl_double *q = (cl_double *)gIn_double;
-            for( j = 0; j < count; j++ )
-                q[j] = ((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
+            for (j = 0; j < count; j++)
+                q[j] = ((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2));
 
-            if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE, 0, count * sizeof( double ), gIn_double, 0, NULL, NULL)) )
+            if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE,
+                                              0, count * sizeof(double),
+                                              gIn_double, 0, NULL, NULL)))
             {
-                vlog_error( "Failure in clWriteArray\n" );
+                vlog_error("Failure in clWriteArray\n");
                 gFailCount++;
                 goto exit;
             }
         }
 
-        //Run again for timing
-        for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
+        // Run again for timing
+        for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
+             vectorSize++)
         {
             uint64_t bestTime = -1ULL;
-            for( j = 0; j < loopCount; j++ )
+            for (j = 0; j < loopCount; j++)
             {
                 uint64_t startTime = ReadTime();
 
 
-                if( (error = RunKernel(device, kernels[vectorSize][0], gInBuffer_single, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
-                                       runsOverBy(count, vectorSize, aligned)) ) )
+                if ((error = RunKernel(device, kernels[vectorSize][0],
+                                       gInBuffer_single, gOutBuffer_half,
+                                       numVecs(count, vectorSize, aligned),
+                                       runsOverBy(count, vectorSize, aligned))))
                 {
                     gFailCount++;
                     goto exit;
                 }
 
-                if( (error = clFinish(gQueue)) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Failure in clFinish\n" );
+                    vlog_error("Failure in clFinish\n");
                     gFailCount++;
                     goto exit;
                 }
                 uint64_t currentTime = ReadTime() - startTime;
-                if( currentTime < bestTime )
-                    bestTime = currentTime;
-                time[ vectorSize ] += currentTime;
+                if (currentTime < bestTime) bestTime = currentTime;
+                time[vectorSize] += currentTime;
             }
-            if( bestTime < min_time[ vectorSize ] )
-                min_time[ vectorSize ] = bestTime ;
+            if (bestTime < min_time[vectorSize])
+                min_time[vectorSize] = bestTime;
 
-            if( gTestDouble )
+            if (gTestDouble)
             {
                 bestTime = -1ULL;
-                for( j = 0; j < loopCount; j++ )
+                for (j = 0; j < loopCount; j++)
                 {
                     uint64_t startTime = ReadTime();
-                    if( (error = RunKernel(device, doubleKernels[vectorSize][0], gInBuffer_double, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
-                                           runsOverBy(count, vectorSize, aligned)) ) )
+                    if ((error =
+                             RunKernel(device, doubleKernels[vectorSize][0],
+                                       gInBuffer_double, gOutBuffer_half,
+                                       numVecs(count, vectorSize, aligned),
+                                       runsOverBy(count, vectorSize, aligned))))
                     {
                         gFailCount++;
                         goto exit;
                     }
 
-                    if( (error = clFinish(gQueue)) )
+                    if ((error = clFinish(gQueue)))
                     {
-                        vlog_error( "Failure in clFinish\n" );
+                        vlog_error("Failure in clFinish\n");
                         gFailCount++;
                         goto exit;
                     }
                     uint64_t currentTime = ReadTime() - startTime;
-                    if( currentTime < bestTime )
-                        bestTime = currentTime;
-                    doubleTime[ vectorSize ] += currentTime;
+                    if (currentTime < bestTime) bestTime = currentTime;
+                    doubleTime[vectorSize] += currentTime;
                 }
-                if( bestTime < min_double_time[ vectorSize ] )
-                    min_double_time[ vectorSize ] = bestTime;
+                if (bestTime < min_double_time[vectorSize])
+                    min_double_time[vectorSize] = bestTime;
             }
         }
     }
 
-    if( gReportTimes )
+    if (gReportTimes)
     {
-        for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
-            vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
-                      "average us/elem", "vStoreHalf%s avg. (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
-        for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
-            vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
-                      "best us/elem", "vStoreHalf%s best (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize])  );
-        if( gTestDouble )
+        for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
+             vectorSize++)
+            vlog_perf(SubtractTime(time[vectorSize], 0) * 1e6 * gDeviceFrequency
+                          * gComputeDevices / (double)(count * loopCount),
+                      0, "average us/elem",
+                      "vStoreHalf%s avg. (%s vector size: %d)", roundName,
+                      addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
+        for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
+             vectorSize++)
+            vlog_perf(SubtractTime(min_time[vectorSize], 0) * 1e6
+                          * gDeviceFrequency * gComputeDevices / (double)count,
+                      0, "best us/elem",
+                      "vStoreHalf%s best (%s vector size: %d)", roundName,
+                      addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
+        if (gTestDouble)
         {
-            for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
-                vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
-                          "average us/elem (double)", "vStoreHalf%s avg. d (%s vector size: %d)", roundName, addressSpaceNames[0],  (g_arrVecSizes[vectorSize])  );
-            for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
-                vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
-                          "best us/elem (double)", "vStoreHalf%s best d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
+            for (vectorSize = kMinVectorSize;
+                 vectorSize < kLastVectorSizeToTest; vectorSize++)
+                vlog_perf(SubtractTime(doubleTime[vectorSize], 0) * 1e6
+                              * gDeviceFrequency * gComputeDevices
+                              / (double)(count * loopCount),
+                          0, "average us/elem (double)",
+                          "vStoreHalf%s avg. d (%s vector size: %d)", roundName,
+                          addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
+            for (vectorSize = kMinVectorSize;
+                 vectorSize < kLastVectorSizeToTest; vectorSize++)
+                vlog_perf(SubtractTime(min_double_time[vectorSize], 0) * 1e6
+                              * gDeviceFrequency * gComputeDevices
+                              / (double)count,
+                          0, "best us/elem (double)",
+                          "vStoreHalf%s best d (%s vector size: %d)", roundName,
+                          addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
         }
     }
 
 exit:
-    //clean up
-    for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
+    // clean up
+    if (!gHostReset)
     {
-        for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
-            clReleaseKernel( kernels[ vectorSize ][ addressSpace ] );
-            clReleaseProgram( programs[ vectorSize ][ addressSpace ] );
-            if( gTestDouble )
+        clReleaseKernel(resetKernel);
+        clReleaseProgram(resetProgram);
+    }
+
+    for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
+         vectorSize++)
+    {
+        for (addressSpace = 0; addressSpace < 3; addressSpace++)
+        {
+            clReleaseKernel(kernels[vectorSize][addressSpace]);
+            clReleaseProgram(programs[vectorSize][addressSpace]);
+            if (gTestDouble)
             {
-                clReleaseKernel( doubleKernels[ vectorSize ][addressSpace] );
-                clReleaseProgram( doublePrograms[ vectorSize ][addressSpace] );
+                clReleaseKernel(doubleKernels[vectorSize][addressSpace]);
+                clReleaseProgram(doublePrograms[vectorSize][addressSpace]);
             }
         }
     }
@@ -990,321 +1219,495 @@ exit:
     return error;
 }
 
-int Test_vStoreaHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleReferenceFunc, const char *roundName )
+int Test_vStoreaHalf_private(cl_device_id device, f2h referenceFunc,
+                             d2h doubleReferenceFunc, const char *roundName)
 {
     int vectorSize, error;
-    cl_program  programs[kVectorSizeCount+kStrangeVectorSizeCount][3];
-    cl_kernel   kernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
+    cl_program programs[kVectorSizeCount + kStrangeVectorSizeCount][3];
+    cl_kernel kernels[kVectorSizeCount + kStrangeVectorSizeCount][3];
+    cl_program resetProgram;
+    cl_kernel resetKernel;
 
-    uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
-    uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
-    memset( min_time, -1, sizeof( min_time ) );
-    cl_program  doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount][3];
-    cl_kernel   doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
-    uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
-    uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
-    memset( min_double_time, -1, sizeof( min_double_time ) );
+    uint64_t time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
+    uint64_t min_time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
+    memset(min_time, -1, sizeof(min_time));
+    cl_program doublePrograms[kVectorSizeCount + kStrangeVectorSizeCount][3];
+    cl_kernel doubleKernels[kVectorSizeCount + kStrangeVectorSizeCount][3];
+    uint64_t doubleTime[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
+    uint64_t min_double_time[kVectorSizeCount + kStrangeVectorSizeCount] = {
+        0
+    };
+    memset(min_double_time, -1, sizeof(min_double_time));
 
     bool aligned = true;
 
     int minVectorSize = kMinVectorSize;
     // There is no aligned scalar vstorea_half
-    if( 0 == minVectorSize )
-        minVectorSize = 1;
+    if (0 == minVectorSize) minVectorSize = 1;
 
-    //Loop over vector sizes
-    for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
+    // Loop over vector sizes
+    for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
+         vectorSize++)
     {
-        const char *source[] = {
-            "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
-            "{\n"
-            "   size_t i = get_global_id(0);\n"
-            "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
-            "}\n"
-        };
+        const char *source[] = { "__kernel void test( __global float",
+                                 vector_size_name_extensions[vectorSize],
+                                 " *p, __global half *f )\n"
+                                 "{\n"
+                                 "   size_t i = get_global_id(0);\n"
+                                 "   vstorea_half",
+                                 vector_size_name_extensions[vectorSize],
+                                 roundName,
+                                 "( p[i], i, f );\n"
+                                 "}\n" };
 
         const char *source_v3[] = {
             "__kernel void test( __global float3 *p, __global half *f )\n"
             "{\n"
             "   size_t i = get_global_id(0);\n"
-            "   vstorea_half3",roundName,"( p[i], i, f );\n"
-            "   vstore_half",roundName,"( ((__global  float *)p)[4*i+3], 4*i+3, f);\n"
+            "   vstorea_half3",
+            roundName,
+            "( p[i], i, f );\n"
+            "   vstore_half",
+            roundName,
+            "( ((__global  float *)p)[4*i+3], 4*i+3, f);\n"
             "}\n"
         };
 
         const char *source_private[] = {
-            "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
+            "__kernel void test( __global float",
+            vector_size_name_extensions[vectorSize],
+            " *p, __global half *f )\n"
             "{\n"
-            "   __private float", vector_size_name_extensions[vectorSize], " data;\n"
+            "   __private float",
+            vector_size_name_extensions[vectorSize],
+            " data;\n"
             "   size_t i = get_global_id(0);\n"
             "   data = p[i];\n"
-            "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n"
+            "   vstorea_half",
+            vector_size_name_extensions[vectorSize],
+            roundName,
+            "( data, i, f );\n"
             "}\n"
         };
 
         const char *source_private_v3[] = {
             "__kernel void test( __global float3 *p, __global half *f )\n"
             "{\n"
-            "   __private float", vector_size_name_extensions[vectorSize], " data;\n"
+            "   __private float",
+            vector_size_name_extensions[vectorSize],
+            " data;\n"
             "   size_t i = get_global_id(0);\n"
             "   data = p[i];\n"
-            "   vstorea_half3",roundName,"( data, i, f );\n"
-            "   vstore_half",roundName,"( ((__global  float *)p)[4*i+3], 4*i+3, f);\n"
+            "   vstorea_half3",
+            roundName,
+            "( data, i, f );\n"
+            "   vstore_half",
+            roundName,
+            "( ((__global  float *)p)[4*i+3], 4*i+3, f);\n"
             "}\n"
         };
 
         char local_buf_size[10];
         sprintf(local_buf_size, "%lld", (uint64_t)gWorkGroupSize);
-        const char *source_local[] = {
-            "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
-            "{\n"
-            "   __local float", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
-            "   size_t i = get_global_id(0);\n"
-            "   size_t lid = get_local_id(0);\n"
-            "   data[lid] = p[i];\n"
-            "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
-            "}\n"
-        };
+        const char *source_local[] = { "__kernel void test( __global float",
+                                       vector_size_name_extensions[vectorSize],
+                                       " *p, __global half *f )\n"
+                                       "{\n"
+                                       "   __local float",
+                                       vector_size_name_extensions[vectorSize],
+                                       " data[",
+                                       local_buf_size,
+                                       "];\n"
+                                       "   size_t i = get_global_id(0);\n"
+                                       "   size_t lid = get_local_id(0);\n"
+                                       "   data[lid] = p[i];\n"
+                                       "   vstorea_half",
+                                       vector_size_name_extensions[vectorSize],
+                                       roundName,
+                                       "( data[lid], i, f );\n"
+                                       "}\n" };
 
         const char *source_local_v3[] = {
-            "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
+            "__kernel void test( __global float",
+            vector_size_name_extensions[vectorSize],
+            " *p, __global half *f )\n"
             "{\n"
-            "   __local float", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
+            "   __local float",
+            vector_size_name_extensions[vectorSize],
+            " data[",
+            local_buf_size,
+            "];\n"
             "   size_t i = get_global_id(0);\n"
             "   size_t lid = get_local_id(0);\n"
             "   data[lid] = p[i];\n"
-            "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
-            "   vstore_half",roundName,"( ((__global float *)p)[4*i+3], 4*i+3, f);\n"
+            "   vstorea_half",
+            vector_size_name_extensions[vectorSize],
+            roundName,
+            "( data[lid], i, f );\n"
+            "   vstore_half",
+            roundName,
+            "( ((__global float *)p)[4*i+3], 4*i+3, f);\n"
             "}\n"
         };
 
         const char *double_source[] = {
             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
-            "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
+            "__kernel void test( __global double",
+            vector_size_name_extensions[vectorSize],
+            " *p, __global half *f )\n"
             "{\n"
             "   size_t i = get_global_id(0);\n"
-            "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
+            "   vstorea_half",
+            vector_size_name_extensions[vectorSize],
+            roundName,
+            "( p[i], i, f );\n"
             "}\n"
         };
 
         const char *double_source_v3[] = {
             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
-            "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
+            "__kernel void test( __global double",
+            vector_size_name_extensions[vectorSize],
+            " *p, __global half *f )\n"
             "{\n"
             "   size_t i = get_global_id(0);\n"
-            "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
-            "   vstore_half",roundName,"( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
+            "   vstorea_half",
+            vector_size_name_extensions[vectorSize],
+            roundName,
+            "( p[i], i, f );\n"
+            "   vstore_half",
+            roundName,
+            "( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
             "}\n"
         };
 
         const char *double_source_private[] = {
             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
-            "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
+            "__kernel void test( __global double",
+            vector_size_name_extensions[vectorSize],
+            " *p, __global half *f )\n"
             "{\n"
-            "   __private double", vector_size_name_extensions[vectorSize], " data;\n"
+            "   __private double",
+            vector_size_name_extensions[vectorSize],
+            " data;\n"
             "   size_t i = get_global_id(0);\n"
             "   data = p[i];\n"
-            "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n"
+            "   vstorea_half",
+            vector_size_name_extensions[vectorSize],
+            roundName,
+            "( data, i, f );\n"
             "}\n"
         };
 
         const char *double_source_private_v3[] = {
             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
-            "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
+            "__kernel void test( __global double",
+            vector_size_name_extensions[vectorSize],
+            " *p, __global half *f )\n"
             "{\n"
-            "   __private double", vector_size_name_extensions[vectorSize], " data;\n"
+            "   __private double",
+            vector_size_name_extensions[vectorSize],
+            " data;\n"
             "   size_t i = get_global_id(0);\n"
             "   data = p[i];\n"
-            "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n"
-            "   vstore_half",roundName,"( ((__global  double *)p)[4*i+3], 4*i+3, f);\n"
+            "   vstorea_half",
+            vector_size_name_extensions[vectorSize],
+            roundName,
+            "( data, i, f );\n"
+            "   vstore_half",
+            roundName,
+            "( ((__global  double *)p)[4*i+3], 4*i+3, f);\n"
             "}\n"
         };
 
         const char *double_source_local[] = {
             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
-            "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
+            "__kernel void test( __global double",
+            vector_size_name_extensions[vectorSize],
+            " *p, __global half *f )\n"
             "{\n"
-            "   __local double", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
+            "   __local double",
+            vector_size_name_extensions[vectorSize],
+            " data[",
+            local_buf_size,
+            "];\n"
             "   size_t i = get_global_id(0);\n"
             "   size_t lid = get_local_id(0);\n"
             "   data[lid] = p[i];\n"
-            "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
+            "   vstorea_half",
+            vector_size_name_extensions[vectorSize],
+            roundName,
+            "( data[lid], i, f );\n"
             "}\n"
         };
 
         const char *double_source_local_v3[] = {
             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
-            "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
+            "__kernel void test( __global double",
+            vector_size_name_extensions[vectorSize],
+            " *p, __global half *f )\n"
             "{\n"
-            "   __local double", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
+            "   __local double",
+            vector_size_name_extensions[vectorSize],
+            " data[",
+            local_buf_size,
+            "];\n"
             "   size_t i = get_global_id(0);\n"
             "   size_t lid = get_local_id(0);\n"
             "   data[lid] = p[i];\n"
-            "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
-            "   vstore_half",roundName,"( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
+            "   vstorea_half",
+            vector_size_name_extensions[vectorSize],
+            roundName,
+            "( data[lid], i, f );\n"
+            "   vstore_half",
+            roundName,
+            "( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
             "}\n"
         };
 
-        if(g_arrVecSizes[vectorSize] == 3) {
-            programs[vectorSize][0] = MakeProgram( device, source_v3, sizeof(source_v3) / sizeof( source_v3[0]) );
-            if( NULL == programs[ vectorSize ][0] )
+        if (g_arrVecSizes[vectorSize] == 3)
+        {
+            programs[vectorSize][0] = MakeProgram(
+                device, source_v3, sizeof(source_v3) / sizeof(source_v3[0]));
+            if (NULL == programs[vectorSize][0])
             {
                 gFailCount++;
                 return -1;
             }
-        } else {
-            programs[vectorSize][0] = MakeProgram( device, source, sizeof(source) / sizeof( source[0]) );
-            if( NULL == programs[ vectorSize ][0] )
+        }
+        else
+        {
+            programs[vectorSize][0] =
+                MakeProgram(device, source, sizeof(source) / sizeof(source[0]));
+            if (NULL == programs[vectorSize][0])
             {
                 gFailCount++;
                 return -1;
             }
         }
 
-        kernels[ vectorSize ][0] = clCreateKernel( programs[ vectorSize ][0], "test", &error );
-        if( NULL == kernels[vectorSize][0] )
+        kernels[vectorSize][0] =
+            clCreateKernel(programs[vectorSize][0], "test", &error);
+        if (NULL == kernels[vectorSize][0])
         {
             gFailCount++;
-            vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
+            vlog_error("\t\tFAILED -- Failed to create kernel. (%d)\n", error);
             return error;
         }
 
-        if(g_arrVecSizes[vectorSize] == 3) {
-            programs[vectorSize][1] = MakeProgram( device, source_private_v3, sizeof(source_private_v3) / sizeof( source_private_v3[0]) );
-            if( NULL == programs[ vectorSize ][1] )
+        if (g_arrVecSizes[vectorSize] == 3)
+        {
+            programs[vectorSize][1] = MakeProgram(
+                device, source_private_v3,
+                sizeof(source_private_v3) / sizeof(source_private_v3[0]));
+            if (NULL == programs[vectorSize][1])
             {
                 gFailCount++;
                 return -1;
             }
-        } else {
-            programs[vectorSize][1] = MakeProgram( device, source_private, sizeof(source_private) / sizeof( source_private[0]) );
-            if( NULL == programs[ vectorSize ][1] )
+        }
+        else
+        {
+            programs[vectorSize][1] =
+                MakeProgram(device, source_private,
+                            sizeof(source_private) / sizeof(source_private[0]));
+            if (NULL == programs[vectorSize][1])
             {
                 gFailCount++;
                 return -1;
             }
         }
 
-        kernels[ vectorSize ][1] = clCreateKernel( programs[ vectorSize ][1], "test", &error );
-        if( NULL == kernels[vectorSize][1] )
+        kernels[vectorSize][1] =
+            clCreateKernel(programs[vectorSize][1], "test", &error);
+        if (NULL == kernels[vectorSize][1])
         {
             gFailCount++;
-            vlog_error( "\t\tFAILED -- Failed to create private kernel. (%d)\n", error );
+            vlog_error("\t\tFAILED -- Failed to create private kernel. (%d)\n",
+                       error);
             return error;
         }
 
-        if(g_arrVecSizes[vectorSize] == 3) {
-            programs[vectorSize][2] = MakeProgram( device, source_local_v3, sizeof(source_local_v3) / sizeof( source_local_v3[0]) );
-            if( NULL == programs[ vectorSize ][2] )
+        if (g_arrVecSizes[vectorSize] == 3)
+        {
+            programs[vectorSize][2] = MakeProgram(
+                device, source_local_v3,
+                sizeof(source_local_v3) / sizeof(source_local_v3[0]));
+            if (NULL == programs[vectorSize][2])
             {
                 gFailCount++;
                 return -1;
             }
-        } else {
-            programs[vectorSize][2] = MakeProgram( device, source_local, sizeof(source_local) / sizeof( source_local[0]) );
-            if( NULL == programs[ vectorSize ][2] )
+        }
+        else
+        {
+            programs[vectorSize][2] =
+                MakeProgram(device, source_local,
+                            sizeof(source_local) / sizeof(source_local[0]));
+            if (NULL == programs[vectorSize][2])
             {
                 gFailCount++;
                 return -1;
             }
         }
 
-        kernels[ vectorSize ][2] = clCreateKernel( programs[ vectorSize ][2], "test", &error );
-        if( NULL == kernels[vectorSize][2] )
+        kernels[vectorSize][2] =
+            clCreateKernel(programs[vectorSize][2], "test", &error);
+        if (NULL == kernels[vectorSize][2])
         {
             gFailCount++;
-            vlog_error( "\t\tFAILED -- Failed to create local kernel. (%d)\n", error );
+            vlog_error("\t\tFAILED -- Failed to create local kernel. (%d)\n",
+                       error);
             return error;
         }
 
-        if( gTestDouble )
+        if (gTestDouble)
         {
-            if(g_arrVecSizes[vectorSize] == 3) {
-                doublePrograms[vectorSize][0] = MakeProgram( device, double_source_v3, sizeof(double_source_v3) / sizeof( double_source_v3[0]) );
-                if( NULL == doublePrograms[ vectorSize ][0] )
+            if (g_arrVecSizes[vectorSize] == 3)
+            {
+                doublePrograms[vectorSize][0] = MakeProgram(
+                    device, double_source_v3,
+                    sizeof(double_source_v3) / sizeof(double_source_v3[0]));
+                if (NULL == doublePrograms[vectorSize][0])
                 {
                     gFailCount++;
                     return -1;
                 }
-            } else {
-                doublePrograms[vectorSize][0] = MakeProgram( device, double_source, sizeof(double_source) / sizeof( double_source[0]) );
-                if( NULL == doublePrograms[ vectorSize ][0] )
+            }
+            else
+            {
+                doublePrograms[vectorSize][0] = MakeProgram(
+                    device, double_source,
+                    sizeof(double_source) / sizeof(double_source[0]));
+                if (NULL == doublePrograms[vectorSize][0])
                 {
                     gFailCount++;
                     return -1;
                 }
             }
 
-            doubleKernels[ vectorSize ][0] = clCreateKernel( doublePrograms[ vectorSize ][0], "test", &error );
-            if( NULL == kernels[vectorSize][0] )
+            doubleKernels[vectorSize][0] =
+                clCreateKernel(doublePrograms[vectorSize][0], "test", &error);
+            if (NULL == kernels[vectorSize][0])
             {
                 gFailCount++;
-                vlog_error( "\t\tFAILED -- Failed to create double kernel. (%d)\n", error );
+                vlog_error(
+                    "\t\tFAILED -- Failed to create double kernel. (%d)\n",
+                    error);
                 return error;
             }
 
-            if(g_arrVecSizes[vectorSize] == 3) {
-                doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private_v3, sizeof(double_source_private_v3) / sizeof( double_source_private_v3[0]) );
-                if( NULL == doublePrograms[ vectorSize ][1] )
+            if (g_arrVecSizes[vectorSize] == 3)
+            {
+                doublePrograms[vectorSize][1] =
+                    MakeProgram(device, double_source_private_v3,
+                                sizeof(double_source_private_v3)
+                                    / sizeof(double_source_private_v3[0]));
+                if (NULL == doublePrograms[vectorSize][1])
                 {
                     gFailCount++;
                     return -1;
                 }
-            } else {
-                doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private, sizeof(double_source_private) / sizeof( double_source_private[0]) );
-                if( NULL == doublePrograms[ vectorSize ][1] )
+            }
+            else
+            {
+                doublePrograms[vectorSize][1] =
+                    MakeProgram(device, double_source_private,
+                                sizeof(double_source_private)
+                                    / sizeof(double_source_private[0]));
+                if (NULL == doublePrograms[vectorSize][1])
                 {
                     gFailCount++;
                     return -1;
                 }
             }
 
-            doubleKernels[ vectorSize ][1] = clCreateKernel( doublePrograms[ vectorSize ][1], "test", &error );
-            if( NULL == kernels[vectorSize][1] )
+            doubleKernels[vectorSize][1] =
+                clCreateKernel(doublePrograms[vectorSize][1], "test", &error);
+            if (NULL == kernels[vectorSize][1])
             {
                 gFailCount++;
-                vlog_error( "\t\tFAILED -- Failed to create double private kernel. (%d)\n", error );
+                vlog_error("\t\tFAILED -- Failed to create double private "
+                           "kernel. (%d)\n",
+                           error);
                 return error;
             }
 
-            if(g_arrVecSizes[vectorSize] == 3) {
-                doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local_v3, sizeof(double_source_local_v3) / sizeof( double_source_local_v3[0]) );
-                if( NULL == doublePrograms[ vectorSize ][2] )
+            if (g_arrVecSizes[vectorSize] == 3)
+            {
+                doublePrograms[vectorSize][2] =
+                    MakeProgram(device, double_source_local_v3,
+                                sizeof(double_source_local_v3)
+                                    / sizeof(double_source_local_v3[0]));
+                if (NULL == doublePrograms[vectorSize][2])
                 {
                     gFailCount++;
                     return -1;
                 }
-            } else {
-                doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local, sizeof(double_source_local) / sizeof( double_source_local[0]) );
-                if( NULL == doublePrograms[ vectorSize ][2] )
+            }
+            else
+            {
+                doublePrograms[vectorSize][2] =
+                    MakeProgram(device, double_source_local,
+                                sizeof(double_source_local)
+                                    / sizeof(double_source_local[0]));
+                if (NULL == doublePrograms[vectorSize][2])
                 {
                     gFailCount++;
                     return -1;
                 }
             }
 
-            doubleKernels[ vectorSize ][2] = clCreateKernel( doublePrograms[ vectorSize ][2], "test", &error );
-            if( NULL == kernels[vectorSize][2] )
+            doubleKernels[vectorSize][2] =
+                clCreateKernel(doublePrograms[vectorSize][2], "test", &error);
+            if (NULL == kernels[vectorSize][2])
             {
                 gFailCount++;
-                vlog_error( "\t\tFAILED -- Failed to create double local kernel. (%d)\n", error );
+                vlog_error("\t\tFAILED -- Failed to create double local "
+                           "kernel. (%d)\n",
+                           error);
                 return error;
             }
         }
     }
 
+    const char *reset[] = {
+        "__kernel void reset( __global float *p, __global ushort *f,\n"
+        "                   uint extra_last_thread)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   *(f + i) = 0xdead;"
+        "}\n"
+    };
+
+    if (!gHostReset)
+    {
+        resetProgram =
+            MakeProgram(device, reset, sizeof(reset) / sizeof(reset[0]));
+        if (NULL == resetProgram)
+        {
+            gFailCount++;
+            return -1;
+        }
+        resetKernel = clCreateKernel(resetProgram, "reset", &error);
+        if (NULL == resetKernel)
+        {
+            gFailCount++;
+            return -1;
+        }
+    }
+
     // Figure out how many elements are in a work block
     size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float));
     size_t blockCount = BUFFER_SIZE / elementSize;
-    uint64_t lastCase = 1ULL << (8*sizeof(float));
+    uint64_t lastCase = 1ULL << (8 * sizeof(float));
     size_t stride = blockCount;
 
     if (gWimpyMode)
         stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;
 
     // we handle 64-bit types a bit differently.
-    if( lastCase == 0 )
-        lastCase = 0x100000000ULL;
+    if (lastCase == 0) lastCase = 0x100000000ULL;
     uint64_t i, j;
     error = 0;
     uint64_t printMask = (lastCase >> 4) - 1;
@@ -1343,7 +1746,7 @@ int Test_vStoreaHalf_private( cl_device_id device, f2h referenceFunc, d2h double
     dchk.lim = blockCount;
     dchk.count = (blockCount + threadCount - 1) / threadCount;
 
-    for( i = 0; i < (uint64_t)lastCase; i += stride )
+    for (i = 0; i < (uint64_t)lastCase; i += stride)
     {
         count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i);
         fref.i = i;
@@ -1352,50 +1755,71 @@ int Test_vStoreaHalf_private( cl_device_id device, f2h referenceFunc, d2h double
         // Create the input and reference
         ThreadPool_Do(ReferenceF, threadCount, &fref);
 
-        error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0, count * sizeof(float ), gIn_single, 0, NULL, NULL);
-        if (error) {
-            vlog_error( "Failure in clWriteArray\n" );
+        error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0,
+                                     count * sizeof(float), gIn_single, 0, NULL,
+                                     NULL);
+        if (error)
+        {
+            vlog_error("Failure in clWriteArray\n");
             gFailCount++;
             goto exit;
         }
 
-        if (gTestDouble) {
+        if (gTestDouble)
+        {
             ThreadPool_Do(ReferenceD, threadCount, &dref);
 
-            error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0, count * sizeof(double ), gIn_double, 0, NULL, NULL);
-            if (error) {
-                vlog_error( "Failure in clWriteArray\n" );
+            error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0,
+                                         count * sizeof(double), gIn_double, 0,
+                                         NULL, NULL);
+            if (error)
+            {
+                vlog_error("Failure in clWriteArray\n");
                 gFailCount++;
                 goto exit;
             }
         }
 
-        for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) {
+        for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
+             vectorSize++)
+        {
             // Loop over vector legths
             fchk.vsz = g_arrVecSizes[vectorSize];
             dchk.vsz = g_arrVecSizes[vectorSize];
 
-            for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
+            for (addressSpace = 0; addressSpace < 3; addressSpace++)
+            {
                 // Loop over address spaces
                 fchk.aspace = addressSpaceNames[addressSpace];
                 dchk.aspace = addressSpaceNames[addressSpace];
 
-                cl_uint pattern = 0xdeaddead;
-                memset_pattern4(gOut_half, &pattern, BUFFER_SIZE/2);
+                if (!gHostReset)
+                {
+                    error = RunKernel(device, resetKernel, gInBuffer_single,
+                                      gOutBuffer_half, count, 0);
+                }
+                else
+                {
+                    cl_uint pattern = 0xdeaddead;
+                    memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2);
 
-                error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_FALSE,
-                                             0, count * sizeof(cl_half),
-                                             gOut_half, 0, NULL, NULL);
-                if (error) {
-                    vlog_error( "Failure in clWriteArray\n" );
+                    error = clEnqueueWriteBuffer(
+                        gQueue, gOutBuffer_half, CL_FALSE, 0,
+                        count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
+                }
+                if (error)
+                {
+                    vlog_error("Failure in clWriteArray\n");
                     gFailCount++;
                     goto exit;
                 }
 
-                error = RunKernel(device, kernels[vectorSize][addressSpace], gInBuffer_single, gOutBuffer_half,
+                error = RunKernel(device, kernels[vectorSize][addressSpace],
+                                  gInBuffer_single, gOutBuffer_half,
                                   numVecs(count, vectorSize, aligned),
                                   runsOverBy(count, vectorSize, aligned));
-                if (error) {
+                if (error)
+                {
                     gFailCount++;
                     goto exit;
                 }
@@ -1403,34 +1827,51 @@ int Test_vStoreaHalf_private( cl_device_id device, f2h referenceFunc, d2h double
                 error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0,
                                             count * sizeof(cl_half), gOut_half,
                                             0, NULL, NULL);
-                if (error) {
-                    vlog_error( "Failure in clReadArray\n" );
+                if (error)
+                {
+                    vlog_error("Failure in clReadArray\n");
                     gFailCount++;
                     goto exit;
                 }
 
                 error = ThreadPool_Do(CheckF, threadCount, &fchk);
-                if (error) {
-                            gFailCount++;
-                            goto exit;
-                        }
+                if (error)
+                {
+                    gFailCount++;
+                    goto exit;
+                }
 
-                if (gTestDouble) {
-                    memset_pattern4(gOut_half, &pattern, BUFFER_SIZE/2);
+                if (gTestDouble)
+                {
 
-                    error = clEnqueueWriteBuffer(
-                        gQueue, gOutBuffer_half, CL_FALSE, 0,
-                        count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
-                    if (error) {
-                        vlog_error( "Failure in clWriteArray\n" );
+                    if (!gHostReset)
+                    {
+                        error = RunKernel(device, resetKernel, gInBuffer_single,
+                                          gOutBuffer_half, count, 0);
+                    }
+                    else
+                    {
+                        cl_uint pattern = 0xdeaddead;
+                        memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2);
+
+                        error = clEnqueueWriteBuffer(
+                            gQueue, gOutBuffer_half, CL_FALSE, 0,
+                            count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
+                    }
+                    if (error)
+                    {
+                        vlog_error("Failure in clWriteArray\n");
                         gFailCount++;
                         goto exit;
                     }
 
-                    error = RunKernel(device, doubleKernels[vectorSize][addressSpace], gInBuffer_double, gOutBuffer_half,
+                    error = RunKernel(device,
+                                      doubleKernels[vectorSize][addressSpace],
+                                      gInBuffer_double, gOutBuffer_half,
                                       numVecs(count, vectorSize, aligned),
                                       runsOverBy(count, vectorSize, aligned));
-                    if (error) {
+                    if (error)
+                    {
                         gFailCount++;
                         goto exit;
                     }
@@ -1438,149 +1879,186 @@ int Test_vStoreaHalf_private( cl_device_id device, f2h referenceFunc, d2h double
                     error = clEnqueueReadBuffer(
                         gQueue, gOutBuffer_half, CL_TRUE, 0,
                         count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
-                    if (error) {
-                        vlog_error( "Failure in clReadArray\n" );
+                    if (error)
+                    {
+                        vlog_error("Failure in clReadArray\n");
                         gFailCount++;
                         goto exit;
                     }
 
                     error = ThreadPool_Do(CheckD, threadCount, &dchk);
-                    if (error) {
-                                gFailCount++;
-                                goto exit;
-                            }
-                        }
+                    if (error)
+                    {
+                        gFailCount++;
+                        goto exit;
                     }
-        }  // end for vector size
+                }
+            }
+        } // end for vector size
 
-        if( ((i+blockCount) & ~printMask) == (i+blockCount) ) {
-            vlog( "." );
-            fflush( stdout );
+        if (((i + blockCount) & ~printMask) == (i + blockCount))
+        {
+            vlog(".");
+            fflush(stdout);
         }
-    }  // for end lastcase
+    } // for end lastcase
 
     loopCount = count == blockCount ? 1 : 100;
-    if( gReportTimes )
+    if (gReportTimes)
     {
-        //Init the input stream
+        // Init the input stream
         cl_float *p = (cl_float *)gIn_single;
-        for( j = 0; j < count; j++ )
-            p[j] = (float)((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
+        for (j = 0; j < count; j++)
+            p[j] = (float)((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2));
 
-        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0, count * sizeof( float ), gIn_single, 0, NULL, NULL)) )
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0,
+                                          count * sizeof(float), gIn_single, 0,
+                                          NULL, NULL)))
         {
-            vlog_error( "Failure in clWriteArray\n" );
+            vlog_error("Failure in clWriteArray\n");
             gFailCount++;
             goto exit;
         }
 
-        if( gTestDouble )
+        if (gTestDouble)
         {
-            //Init the input stream
+            // Init the input stream
             cl_double *q = (cl_double *)gIn_double;
-            for( j = 0; j < count; j++ )
-                q[j] = ((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
+            for (j = 0; j < count; j++)
+                q[j] = ((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2));
 
-            if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE, 0, count * sizeof( double ), gIn_double, 0, NULL, NULL)) )
+            if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE,
+                                              0, count * sizeof(double),
+                                              gIn_double, 0, NULL, NULL)))
             {
-                vlog_error( "Failure in clWriteArray\n" );
+                vlog_error("Failure in clWriteArray\n");
                 gFailCount++;
                 goto exit;
             }
         }
 
-        //Run again for timing
-        for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
+        // Run again for timing
+        for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
+             vectorSize++)
         {
             uint64_t bestTime = -1ULL;
-            for( j = 0; j < loopCount; j++ )
+            for (j = 0; j < loopCount; j++)
             {
                 uint64_t startTime = ReadTime();
-                if( (error = RunKernel(device, kernels[vectorSize][0], gInBuffer_single, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
-                                       runsOverBy(count, vectorSize, aligned)) ) )
+                if ((error = RunKernel(device, kernels[vectorSize][0],
+                                       gInBuffer_single, gOutBuffer_half,
+                                       numVecs(count, vectorSize, aligned),
+                                       runsOverBy(count, vectorSize, aligned))))
                 {
                     gFailCount++;
                     goto exit;
                 }
 
-                if( (error = clFinish(gQueue)) )
+                if ((error = clFinish(gQueue)))
                 {
-                    vlog_error( "Failure in clFinish\n" );
+                    vlog_error("Failure in clFinish\n");
                     gFailCount++;
                     goto exit;
                 }
                 uint64_t currentTime = ReadTime() - startTime;
-                if( currentTime < bestTime )
-                    bestTime = currentTime;
-                time[ vectorSize ] += currentTime;
+                if (currentTime < bestTime) bestTime = currentTime;
+                time[vectorSize] += currentTime;
             }
-            if( bestTime < min_time[ vectorSize ] )
-                min_time[ vectorSize ] = bestTime ;
+            if (bestTime < min_time[vectorSize])
+                min_time[vectorSize] = bestTime;
 
-            if( gTestDouble )
+            if (gTestDouble)
             {
                 bestTime = -1ULL;
-                for( j = 0; j < loopCount; j++ )
+                for (j = 0; j < loopCount; j++)
                 {
                     uint64_t startTime = ReadTime();
-                    if( (error = RunKernel(device, doubleKernels[vectorSize][0], gInBuffer_double, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
-                                           runsOverBy(count, vectorSize, aligned)) ) )
+                    if ((error =
+                             RunKernel(device, doubleKernels[vectorSize][0],
+                                       gInBuffer_double, gOutBuffer_half,
+                                       numVecs(count, vectorSize, aligned),
+                                       runsOverBy(count, vectorSize, aligned))))
                     {
                         gFailCount++;
                         goto exit;
                     }
 
-                    if( (error = clFinish(gQueue)) )
+                    if ((error = clFinish(gQueue)))
                     {
-                        vlog_error( "Failure in clFinish\n" );
+                        vlog_error("Failure in clFinish\n");
                         gFailCount++;
                         goto exit;
                     }
                     uint64_t currentTime = ReadTime() - startTime;
-                    if( currentTime < bestTime )
-                        bestTime = currentTime;
-                    doubleTime[ vectorSize ] += currentTime;
+                    if (currentTime < bestTime) bestTime = currentTime;
+                    doubleTime[vectorSize] += currentTime;
                 }
-                if( bestTime < min_double_time[ vectorSize ] )
-                    min_double_time[ vectorSize ] = bestTime;
+                if (bestTime < min_double_time[vectorSize])
+                    min_double_time[vectorSize] = bestTime;
             }
         }
     }
 
-    if( gReportTimes )
+    if (gReportTimes)
     {
-        for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
-            vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
-                      "average us/elem", "vStoreaHalf%s avg. (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
-        for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
-            vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
-                      "best us/elem", "vStoreaHalf%s best (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize])  );
-        if( gTestDouble )
+        for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
+             vectorSize++)
+            vlog_perf(SubtractTime(time[vectorSize], 0) * 1e6 * gDeviceFrequency
+                          * gComputeDevices / (double)(count * loopCount),
+                      0, "average us/elem",
+                      "vStoreaHalf%s avg. (%s vector size: %d)", roundName,
+                      addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
+        for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
+             vectorSize++)
+            vlog_perf(SubtractTime(min_time[vectorSize], 0) * 1e6
+                          * gDeviceFrequency * gComputeDevices / (double)count,
+                      0, "best us/elem",
+                      "vStoreaHalf%s best (%s vector size: %d)", roundName,
+                      addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
+        if (gTestDouble)
         {
-            for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
-                vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
-                          "average us/elem (double)", "vStoreaHalf%s avg. d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize])  );
-            for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
-                vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
-                          "best us/elem (double)", "vStoreaHalf%s best d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
+            for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
+                 vectorSize++)
+                vlog_perf(SubtractTime(doubleTime[vectorSize], 0) * 1e6
+                              * gDeviceFrequency * gComputeDevices
+                              / (double)(count * loopCount),
+                          0, "average us/elem (double)",
+                          "vStoreaHalf%s avg. d (%s vector size: %d)",
+                          roundName, addressSpaceNames[0],
+                          (g_arrVecSizes[vectorSize]));
+            for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
+                 vectorSize++)
+                vlog_perf(
+                    SubtractTime(min_double_time[vectorSize], 0) * 1e6
+                        * gDeviceFrequency * gComputeDevices / (double)count,
+                    0, "best us/elem (double)",
+                    "vStoreaHalf%s best d (%s vector size: %d)", roundName,
+                    addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
         }
     }
 
 exit:
-    //clean up
-    for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
+    // clean up
+    if (!gHostReset)
     {
-        for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
-            clReleaseKernel( kernels[ vectorSize ][addressSpace] );
-            clReleaseProgram( programs[ vectorSize ][addressSpace] );
-            if( gTestDouble )
+        clReleaseKernel(resetKernel);
+        clReleaseProgram(resetProgram);
+    }
+
+    for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
+         vectorSize++)
+    {
+        for (addressSpace = 0; addressSpace < 3; addressSpace++)
+        {
+            clReleaseKernel(kernels[vectorSize][addressSpace]);
+            clReleaseProgram(programs[vectorSize][addressSpace]);
+            if (gTestDouble)
             {
-                clReleaseKernel( doubleKernels[ vectorSize ][addressSpace] );
-                clReleaseProgram( doublePrograms[ vectorSize ][addressSpace] );
+                clReleaseKernel(doubleKernels[vectorSize][addressSpace]);
+                clReleaseProgram(doublePrograms[vectorSize][addressSpace]);
             }
         }
     }
 
     return error;
 }
-
diff --git a/test_conformance/half/cl_utils.cpp b/test_conformance/half/cl_utils.cpp
index 68f7b9cd..04770210 100644
--- a/test_conformance/half/cl_utils.cpp
+++ b/test_conformance/half/cl_utils.cpp
@@ -35,37 +35,38 @@ const char *align_divisors[kVectorSizeCount+kStrangeVectorSizeCount] = { "1", "2
 const char *align_types[kVectorSizeCount+kStrangeVectorSizeCount] = { "half", "int", "int2", "int4", "int8", "int2" };
 
 
-void            *gIn_half = NULL;
-void            *gOut_half = NULL;
-void            *gOut_half_reference = NULL;
-void            *gOut_half_reference_double = NULL;
-void            *gIn_single = NULL;
-void            *gOut_single = NULL;
-void            *gOut_single_reference = NULL;
-void            *gIn_double = NULL;
-// void            *gOut_double = NULL;
-// void            *gOut_double_reference = NULL;
-cl_mem          gInBuffer_half = NULL;
-cl_mem          gOutBuffer_half = NULL;
-cl_mem          gInBuffer_single = NULL;
-cl_mem          gOutBuffer_single = NULL;
-cl_mem          gInBuffer_double = NULL;
-// cl_mem          gOutBuffer_double = NULL;
+void *gIn_half = NULL;
+void *gOut_half = NULL;
+void *gOut_half_reference = NULL;
+void *gOut_half_reference_double = NULL;
+void *gIn_single = NULL;
+void *gOut_single = NULL;
+void *gOut_single_reference = NULL;
+void *gIn_double = NULL;
+// void *gOut_double = NULL;
+// void *gOut_double_reference = NULL;
+cl_mem gInBuffer_half = NULL;
+cl_mem gOutBuffer_half = NULL;
+cl_mem gInBuffer_single = NULL;
+cl_mem gOutBuffer_single = NULL;
+cl_mem gInBuffer_double = NULL;
+// cl_mem gOutBuffer_double = NULL;
 
-cl_context       gContext = NULL;
+cl_context gContext = NULL;
 cl_command_queue gQueue = NULL;
-uint32_t        gDeviceFrequency = 0;
-uint32_t        gComputeDevices = 0;
-size_t          gMaxThreadGroupSize = 0;
-size_t          gWorkGroupSize = 0;
-bool            gWimpyMode = false;
-int             gWimpyReductionFactor = 512;
-int             gTestDouble = 0;
+uint32_t gDeviceFrequency = 0;
+uint32_t gComputeDevices = 0;
+size_t gMaxThreadGroupSize = 0;
+size_t gWorkGroupSize = 0;
+bool gWimpyMode = false;
+int gWimpyReductionFactor = 512;
+int gTestDouble = 0;
+bool gHostReset = false;
 
 #if defined( __APPLE__ )
-int             gReportTimes = 1;
+int gReportTimes = 1;
 #else
-int             gReportTimes = 0;
+int gReportTimes = 0;
 #endif
 
 #pragma mark -
diff --git a/test_conformance/half/cl_utils.h b/test_conformance/half/cl_utils.h
index 50d8af3d..da6073cf 100644
--- a/test_conformance/half/cl_utils.h
+++ b/test_conformance/half/cl_utils.h
@@ -44,37 +44,38 @@
 #include <CL/opencl.h>
 #endif
 
-extern void            *gIn_half;
-extern void            *gOut_half;
-extern void            *gOut_half_reference;
-extern void            *gOut_half_reference_double;
-extern void            *gIn_single;
-extern void            *gOut_single;
-extern void            *gOut_single_reference;
-extern void            *gIn_double;
-// extern void            *gOut_double;
-// extern void            *gOut_double_reference;
-extern cl_mem          gInBuffer_half;
-extern cl_mem          gOutBuffer_half;
-extern cl_mem          gInBuffer_single;
-extern cl_mem          gOutBuffer_single;
-extern cl_mem          gInBuffer_double;
-// extern cl_mem          gOutBuffer_double;
+extern void *gIn_half;
+extern void *gOut_half;
+extern void *gOut_half_reference;
+extern void *gOut_half_reference_double;
+extern void *gIn_single;
+extern void *gOut_single;
+extern void *gOut_single_reference;
+extern void *gIn_double;
+// extern void *gOut_double;
+// extern void *gOut_double_reference;
+extern cl_mem gInBuffer_half;
+extern cl_mem gOutBuffer_half;
+extern cl_mem gInBuffer_single;
+extern cl_mem gOutBuffer_single;
+extern cl_mem gInBuffer_double;
+// extern cl_mem gOutBuffer_double;
 
-extern cl_context      gContext;
+extern cl_context gContext;
 extern cl_command_queue gQueue;
-extern uint32_t        gDeviceFrequency;
-extern uint32_t        gComputeDevices;
-extern size_t          gMaxThreadGroupSize;
-extern size_t          gWorkGroupSize;
-extern int             gTestDouble;
-extern int             gReportTimes;
+extern uint32_t gDeviceFrequency;
+extern uint32_t gComputeDevices;
+extern size_t gMaxThreadGroupSize;
+extern size_t gWorkGroupSize;
+extern int gTestDouble;
+extern int gReportTimes;
+extern bool gHostReset;
 
 // gWimpyMode indicates if we run the test in wimpy mode where we limit the
 // size of 32 bit ranges to a much smaller set.  This is meant to be used
 // as a smoke test
-extern bool            gWimpyMode;
-extern int             gWimpyReductionFactor;
+extern bool gWimpyMode;
+extern int gWimpyReductionFactor;
 
 uint64_t ReadTime( void );
 double SubtractTime( uint64_t endTime, uint64_t startTime );
diff --git a/test_conformance/half/main.cpp b/test_conformance/half/main.cpp
index 6bc7db95..ee44fb2d 100644
--- a/test_conformance/half/main.cpp
+++ b/test_conformance/half/main.cpp
@@ -194,6 +194,8 @@ static int ParseArgs( int argc, const char **argv )
                         gReportTimes ^= 1;
                         break;
 
+                    case 'r': gHostReset = true; break;
+
                     case 'w':  // Wimpy mode
                         gWimpyMode = true;
                         break;
@@ -235,13 +237,17 @@ static int ParseArgs( int argc, const char **argv )
 
 static void PrintUsage( void )
 {
-    vlog( "%s [-dthw]: <optional: test names>\n", appName );
-    vlog( "\t\t-d\tToggle double precision testing (default: on if double supported)\n" );
-    vlog( "\t\t-t\tToggle reporting performance data.\n" );
-    vlog( "\t\t-w\tRun in wimpy mode\n" );
-    vlog( "\t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is 1-12, default factor(%u)\n", gWimpyReductionFactor);
-    vlog( "\t\t-h\tHelp\n" );
-    for( int i = 0; i < test_num; i++ )
+    vlog("%s [-dthw]: <optional: test names>\n", appName);
+    vlog("\t\t-d\tToggle double precision testing (default: on if double "
+         "supported)\n");
+    vlog("\t\t-t\tToggle reporting performance data.\n");
+    vlog("\t\t-r\tReset buffers on host instead of on device.\n");
+    vlog("\t\t-w\tRun in wimpy mode\n");
+    vlog("\t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is "
+         "1-12, default factor(%u)\n",
+         gWimpyReductionFactor);
+    vlog("\t\t-h\tHelp\n");
+    for (int i = 0; i < test_num; i++)
     {
         vlog("\t\t%s\n", test_list[i].name );
     }