OpenCL-CTS/test_conformance/half/Test_vStoreHalf.c

//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "../../test_common/harness/compat.h"
#include "../../test_common/harness/kernelHelpers.h"

#include <string.h>
#include "cl_utils.h"
#include "tests.h"

extern const char *addressSpaceNames[];

typedef struct ComputeReferenceInfoF_
{
    float *x;
    cl_ushort *r;
    f2h f;
    cl_ulong i;
    cl_uint lim;
    cl_uint count;
} ComputeReferenceInfoF;

typedef struct ComputeReferenceInfoD_
{
    double *x;
    cl_ushort *r;
    d2h f;
    cl_ulong i;
    cl_uint lim;
    cl_uint count;
} ComputeReferenceInfoD;

typedef struct CheckResultInfoF_
{
    const float *x;
    const cl_ushort *r;
    const cl_ushort *s;
    f2h f;
    const char *aspace;
    cl_uint lim;
    cl_uint count;
    int vsz;
} CheckResultInfoF;

typedef struct CheckResultInfoD_
{
    const double *x;
    const cl_ushort *r;
    const cl_ushort *s;
    d2h f;
    const char *aspace;
    cl_uint lim;
    cl_uint count;
    int vsz;
} CheckResultInfoD;

static cl_int
ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
{
    ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
    cl_uint lim = cri->lim;
    cl_uint count = cri->count;
    cl_uint off = jid * count;
    float *x = cri->x + off;
    cl_ushort *r = cri->r + off;
    f2h f = cri->f;
    cl_ulong i = cri->i + off;
    cl_uint j, rr;

    if (off + count > lim)
        count = lim - off;

    for (j = 0; j < count; ++j) {
        x[j] = as_float((cl_uint)(i + j));
        r[j] = f(x[j]);
    }

    return 0;
}

static cl_int
CheckF(cl_uint jid, cl_uint tid, void *userInfo)
{
    CheckResultInfoF *cri = (CheckResultInfoF *)userInfo;
    cl_uint lim = cri->lim;
    cl_uint count = cri->count;
    cl_uint off = jid * count;
    const float *x = cri->x + off;
    const cl_ushort *r = cri->r + off;
    const cl_ushort *s = cri->s + off;
    f2h f = cri->f;
    cl_uint j;
    cl_ushort correct2 = f( 0.0f);
    cl_ushort correct3 = f(-0.0f);
    cl_int ret = 0;

    if (off + count > lim)
        count = lim - off;

    if (!memcmp(r, s, count*sizeof(cl_ushort)))
        return 0;

    for (j = 0; j < count; j++) {
    if (s[j] == r[j])
        continue;

        // Pass any NaNs
        if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00 )
            continue;

        // retry per section 6.5.3.3
        if (IsFloatSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3))
            continue;

        // if reference result is subnormal, pass any zero
        if (gIsEmbedded && IsHalfSubnormal(r[j]) && (s[j] == 0x0000 || s[j] == 0x8000))
            continue;

        vlog_error("\nFailure at [%u] with %.6a: *0x%04x vs 0x%04x,  vector_size = %d, address_space = %s\n",
                   j+off, x[j], r[j], s[j], cri->vsz, cri->aspace);

        ret = 1;
        break;
    }

    return ret;
}

static cl_int
ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
{
    ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo;
    cl_uint lim = cri->lim;
    cl_uint count = cri->count;
    cl_uint off = jid * count;
    double *x = cri->x + off;
    cl_ushort *r = cri->r + off;
    d2h f = cri->f;
    cl_uint j;
    cl_ulong i = cri->i + off;

    if (off + count > lim)
        count = lim - off;

    for (j = 0; j < count; ++j) {
        x[j] = as_double(DoubleFromUInt((cl_uint)(i + j)));
        r[j] = f(x[j]);
    }

    return 0;
}

static cl_int
CheckD(cl_uint jid, cl_uint tid, void *userInfo)
{
    CheckResultInfoD *cri = (CheckResultInfoD *)userInfo;
    cl_uint lim = cri->lim;
    cl_uint count = cri->count;
    cl_uint off = jid * count;
    const double *x = cri->x + off;
    const cl_ushort *r = cri->r + off;
    const cl_ushort *s = cri->s + off;
    d2h f = cri->f;
    cl_uint j;
    cl_ushort correct2 = f( 0.0);
    cl_ushort correct3 = f(-0.0);
    cl_int ret = 0;

    if (off + count > lim)
        count = lim - off;

    if (!memcmp(r, s, count*sizeof(cl_ushort)))
        return 0;

    for (j = 0; j < count; j++) {
    if (s[j] == r[j])
        continue;

        // Pass any NaNs
        if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00)
            continue;

        if (IsDoubleSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3))
            continue;

        // if reference result is subnormal, pass any zero result
        if (gIsEmbedded && IsHalfSubnormal(r[j]) && (s[j] == 0x0000 || s[j] == 0x8000))
            continue;

        vlog_error("\nFailure at [%u] with %.13la: *0x%04x vs 0x%04x, vector_size = %d, address space = %s (double precision)\n",
                   j+off, x[j], r[j], s[j], cri->vsz, cri->aspace);

        ret = 1;
    break;
    }

    return ret;
}

static cl_ushort float2half_rte( float f );
static cl_ushort float2half_rtz( float f );
static cl_ushort float2half_rtp( float f );
static cl_ushort float2half_rtn( float f );
static cl_ushort double2half_rte( double f );
static cl_ushort double2half_rtz( double f );
static cl_ushort double2half_rtp( double f );
static cl_ushort double2half_rtn( double f );

static cl_ushort
float2half_rte( float f )
{
    union{ float f; cl_uint u; } u = {f};
    cl_uint sign = (u.u >> 16) & 0x8000;
    float x = fabsf(f);

    //Nan
    if( x != x )
    {
        u.u >>= (24-11);
        u.u &= 0x7fff;
        u.u |= 0x0200;      //silence the NaN
        return u.u | sign;
    }

    // overflow
    if( x >= MAKE_HEX_FLOAT(0x1.ffep15f, 0x1ffeL, 3) )
        return 0x7c00 | sign;

    // underflow
    if( x <= MAKE_HEX_FLOAT(0x1.0p-25f, 0x1L, -25) )
        return sign;    // The halfway case can return 0x0001 or 0. 0 is even.

    // very small
    if( x < MAKE_HEX_FLOAT(0x1.8p-24f, 0x18L, -28) )
        return sign | 1;

    // half denormal
    if( x < MAKE_HEX_FLOAT(0x1.0p-14f, 0x1L, -14) )
    {
        u.f = x * MAKE_HEX_FLOAT(0x1.0p-125f, 0x1L, -125);
        return sign | u.u;
    }

    u.f *= MAKE_HEX_FLOAT(0x1.0p13f, 0x1L, 13);
    u.u &= 0x7f800000;
    x += u.f;
    u.f = x - u.f;
    u.f *= MAKE_HEX_FLOAT(0x1.0p-112f, 0x1L, -112);

    return (u.u >> (24-11)) | sign;
}

static cl_ushort
float2half_rtz( float f )
{
    union{ float f; cl_uint u; } u = {f};
    cl_uint sign = (u.u >> 16) & 0x8000;
    float x = fabsf(f);

    //Nan
    if( x != x )
    {
        u.u >>= (24-11);
        u.u &= 0x7fff;
        u.u |= 0x0200;      //silence the NaN
        return u.u | sign;
    }

    // overflow
    if( x >= MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) )
    {
        if( x == INFINITY )
            return 0x7c00 | sign;

        return 0x7bff | sign;
    }

    // underflow
    if( x < MAKE_HEX_FLOAT(0x1.0p-24f, 0x1L, -24) )
        return sign;    // The halfway case can return 0x0001 or 0. 0 is even.

    // half denormal
    if( x < MAKE_HEX_FLOAT(0x1.0p-14f, 0x1L, -14) )
    {
        x *= MAKE_HEX_FLOAT(0x1.0p24f, 0x1L, 24);
        return (cl_ushort)((int) x | sign);
    }

    u.u &= 0xFFFFE000U;
    u.u -= 0x38000000U;

    return (u.u >> (24-11)) | sign;
}

static cl_ushort
float2half_rtp( float f )
{
    union{ float f; cl_uint u; } u = {f};
    cl_uint sign = (u.u >> 16) & 0x8000;
    float x = fabsf(f);

    //Nan
    if( x != x )
    {
        u.u >>= (24-11);
        u.u &= 0x7fff;
        u.u |= 0x0200;      //silence the NaN
        return u.u | sign;
    }

    // overflow
    if( f > MAKE_HEX_FLOAT(0x1.ffcp15f, 0x1ffcL, 3) )
        return 0x7c00;

    if( f <= MAKE_HEX_FLOAT(-0x1.0p16f, -0x1L, 16) )
    {
        if( f == -INFINITY )
            return 0xfc00;

        return 0xfbff;
    }

    // underflow
    if( x < MAKE_HEX_FLOAT(0x1.0p-24f, 0x1L, -24) )
    {
        if( f > 0 )
            return 1;
        return sign;
    }

    // half denormal
    if( x < MAKE_HEX_FLOAT(0x1.0p-14f, 0x1L, -14) )
    {
        x *= MAKE_HEX_FLOAT(0x1.0p24f, 0x1L, 24);
        int r = (int) x;
        r += (float) r != x && f > 0.0f;

        return (cl_ushort)( r | sign);
    }

    float g = u.f;
    u.u &= 0xFFFFE000U;
    if( g > u.f )
        u.u += 0x00002000U;
    u.u -= 0x38000000U;

    return (u.u >> (24-11)) | sign;
}


static cl_ushort
float2half_rtn( float f )
{
    union{ float f; cl_uint u; } u = {f};
    cl_uint sign = (u.u >> 16) & 0x8000;
    float x = fabsf(f);

    //Nan
    if( x != x )
    {
        u.u >>= (24-11);
        u.u &= 0x7fff;
        u.u |= 0x0200;      //silence the NaN
        return u.u | sign;
    }

    // overflow
    if( f >= MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) )
    {
        if( f == INFINITY )
            return 0x7c00;

        return 0x7bff;
    }

    if( f < MAKE_HEX_FLOAT(-0x1.ffcp15f, -0x1ffcL, 3) )
        return 0xfc00;

    // underflow
    if( x < MAKE_HEX_FLOAT(0x1.0p-24f, 0x1L, -24) )
    {
        if( f < 0 )
            return 0x8001;
        return sign;
    }

    // half denormal
    if( x < MAKE_HEX_FLOAT(0x1.0p-14f, 0x1L, -14) )
    {
        x *= MAKE_HEX_FLOAT(0x1.0p24f, 0x1L, 24);
        int r = (int) x;
        r += (float) r != x && f < 0.0f;

        return (cl_ushort)( r | sign);
    }

    u.u &= 0xFFFFE000U;
    if( u.f > f )
        u.u += 0x00002000U;
    u.u -= 0x38000000U;

    return (u.u >> (24-11)) | sign;
}

static cl_ushort
double2half_rte( double f )
{
    union{ double f; cl_ulong u; } u = {f};
    cl_ulong sign = (u.u >> 48) & 0x8000;
    double x = fabs(f);

    //Nan
    if( x != x )
    {
        u.u >>= (53-11);
        u.u &= 0x7fff;
        u.u |= 0x0200;      //silence the NaN
        return u.u | sign;
    }

    // overflow
    if( x >= MAKE_HEX_DOUBLE(0x1.ffep15, 0x1ffeLL, 3) )
        return 0x7c00 | sign;

    // underflow
    if( x <= MAKE_HEX_DOUBLE(0x1.0p-25, 0x1LL, -25) )
        return sign;    // The halfway case can return 0x0001 or 0. 0 is even.

    // very small
    if( x < MAKE_HEX_DOUBLE(0x1.8p-24, 0x18LL, -28) )
        return sign | 1;

    // half denormal
    if( x < MAKE_HEX_DOUBLE(0x1.0p-14, 0x1LL, -14) )
    {
        u.f = x * MAKE_HEX_DOUBLE(0x1.0p-1050, 0x1LL, -1050);
        return sign | u.u;
    }

    u.f *= MAKE_HEX_DOUBLE(0x1.0p42, 0x1LL, 42);
    u.u &= 0x7ff0000000000000ULL;
    x += u.f;
    u.f = x - u.f;
    u.f *= MAKE_HEX_DOUBLE(0x1.0p-1008, 0x1LL, -1008);

    return (u.u >> (53-11)) | sign;
}

static cl_ushort
double2half_rtz( double f )
{
    union{ double f; cl_ulong u; } u = {f};
    cl_ulong sign = (u.u >> 48) & 0x8000;
    double x = fabs(f);

    //Nan
    if( x != x )
    {
        u.u >>= (53-11);
        u.u &= 0x7fff;
        u.u |= 0x0200;      //silence the NaN
        return u.u | sign;
    }

    if( x == INFINITY )
        return 0x7c00 | sign;

    // overflow
    if( x >= MAKE_HEX_DOUBLE(0x1.0p16, 0x1LL, 16) )
        return 0x7bff | sign;

    // underflow
    if( x < MAKE_HEX_DOUBLE(0x1.0p-24, 0x1LL, -24) )
        return sign;    // The halfway case can return 0x0001 or 0. 0 is even.

    // half denormal
    if( x < MAKE_HEX_DOUBLE(0x1.0p-14, 0x1LL, -14) )
    {
        x *= MAKE_HEX_FLOAT(0x1.0p24f, 0x1L, 24);
        return (cl_ushort)((int) x | sign);
    }

    u.u &= 0xFFFFFC0000000000ULL;
    u.u -= 0x3F00000000000000ULL;

    return (u.u >> (53-11)) | sign;
}

static cl_ushort
double2half_rtp( double f )
{
    union{ double f; cl_ulong u; } u = {f};
    cl_ulong sign = (u.u >> 48) & 0x8000;
    double x = fabs(f);

    //Nan
    if( x != x )
    {
        u.u >>= (53-11);
        u.u &= 0x7fff;
        u.u |= 0x0200;      //silence the NaN
        return u.u | sign;
    }

    // overflow
    if( f > MAKE_HEX_DOUBLE(0x1.ffcp15, 0x1ffcLL, 3) )
        return 0x7c00;

    if( f <= MAKE_HEX_DOUBLE(-0x1.0p16, -0x1LL, 16) )
    {
        if( f == -INFINITY )
            return 0xfc00;

        return 0xfbff;
    }

    // underflow
    if( x < MAKE_HEX_DOUBLE(0x1.0p-24, 0x1LL, -24) )
    {
        if( f > 0 )
            return 1;
        return sign;
    }

    // half denormal
    if( x < MAKE_HEX_DOUBLE(0x1.0p-14, 0x1LL, -14) )
    {
        x *= MAKE_HEX_FLOAT(0x1.0p24f, 0x1L, 24);
        int r = (int) x;
        if( 0 == sign )
            r += (double) r != x;

        return (cl_ushort)( r | sign);
    }

    double g = u.f;
    u.u &= 0xFFFFFC0000000000ULL;
    if( g != u.f && 0 == sign)
        u.u += 0x0000040000000000ULL;
    u.u -= 0x3F00000000000000ULL;

    return (u.u >> (53-11)) | sign;
}


static cl_ushort
double2half_rtn( double f )
{
    union{ double f; cl_ulong u; } u = {f};
    cl_ulong sign = (u.u >> 48) & 0x8000;
    double x = fabs(f);

    //Nan
    if( x != x )
    {
        u.u >>= (53-11);
        u.u &= 0x7fff;
        u.u |= 0x0200;      //silence the NaN
        return u.u | sign;
    }

    // overflow
    if( f >= MAKE_HEX_DOUBLE(0x1.0p16, 0x1LL, 16) )
    {
        if( f == INFINITY )
            return 0x7c00;

        return 0x7bff;
    }

    if( f < MAKE_HEX_DOUBLE(-0x1.ffcp15, -0x1ffcLL, 3) )
        return 0xfc00;

    // underflow
    if( x < MAKE_HEX_DOUBLE(0x1.0p-24, 0x1LL, -24) )
    {
        if( f < 0 )
            return 0x8001;
        return sign;
    }

    // half denormal
    if( x < MAKE_HEX_DOUBLE(0x1.0p-14, 0x1LL, -14) )
    {
        x *= MAKE_HEX_DOUBLE(0x1.0p24, 0x1LL, 24);
        int r = (int) x;
        if( sign )
            r += (double) r != x;

        return (cl_ushort)( r | sign);
    }

    double g = u.f;
    u.u &= 0xFFFFFC0000000000ULL;
    if( g < u.f && sign)
        u.u += 0x0000040000000000ULL;
    u.u -= 0x3F00000000000000ULL;

    return (u.u >> (53-11)) | sign;
}

int Test_vstore_half( void )
{
    switch (get_default_rounding_mode(gDevice))
    {
        case CL_FP_ROUND_TO_ZERO:
            return Test_vStoreHalf_private(float2half_rtz, double2half_rte, "");
        case 0:
            return -1;
        default:
            return Test_vStoreHalf_private(float2half_rte, double2half_rte, "");
    }
}

int Test_vstore_half_rte( void )
{
    return Test_vStoreHalf_private(float2half_rte, double2half_rte, "_rte");
}

int Test_vstore_half_rtz( void )
{
    return Test_vStoreHalf_private(float2half_rtz, double2half_rtz, "_rtz");
}

int Test_vstore_half_rtp( void )
{
    return Test_vStoreHalf_private(float2half_rtp, double2half_rtp, "_rtp");
}

int Test_vstore_half_rtn( void )
{
    return Test_vStoreHalf_private(float2half_rtn, double2half_rtn, "_rtn");
}

int Test_vstorea_half( void )
{
    switch (get_default_rounding_mode(gDevice))
    {
        case CL_FP_ROUND_TO_ZERO:
            return Test_vStoreaHalf_private(float2half_rtz, double2half_rte, "");
        case 0:
            return -1;
        default:
            return Test_vStoreaHalf_private(float2half_rte, double2half_rte, "");
    }
}

int Test_vstorea_half_rte( void )
{
    return Test_vStoreaHalf_private(float2half_rte, double2half_rte, "_rte");
}

int Test_vstorea_half_rtz( void )
{
    return Test_vStoreaHalf_private(float2half_rtz, double2half_rtz, "_rtz");
}

int Test_vstorea_half_rtp( void )
{
    return Test_vStoreaHalf_private(float2half_rtp, double2half_rtp, "_rtp");
}

int Test_vstorea_half_rtn( void )
{
    return Test_vStoreaHalf_private(float2half_rtn, double2half_rtn, "_rtn");
}

#pragma mark -

int Test_vStoreHalf_private( f2h referenceFunc, d2h doubleReferenceFunc, const char *roundName )
{
    int vectorSize, error;
    cl_program  programs[kVectorSizeCount+kStrangeVectorSizeCount][3];
    cl_kernel   kernels[kVectorSizeCount+kStrangeVectorSizeCount][3];

    uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
    uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
    memset( min_time, -1, sizeof( min_time ) );
    cl_program  doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount][3];
    cl_kernel   doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
    uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
    uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
    memset( min_double_time, -1, sizeof( min_double_time ) );

    vlog( "Testing vstore_half%s\n", roundName );
    fflush( stdout );

    bool aligned= false;

    for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
    {
        const char *source[] = {
            "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
            "{\n"
            "   size_t i = get_global_id(0);\n"
            "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
            "}\n"
        };

        const char *source_v3[] = {
            "__kernel void test( __global float *p, __global half *f,\n"
            "                   uint extra_last_thread)\n"
            "{\n"
            "   size_t i = get_global_id(0);\n"
            "   size_t last_i = get_global_size(0)-1;\n"
            "   size_t adjust = 0;\n"
            "   if(last_i == i && extra_last_thread != 0) {\n"
            "     adjust = 3-extra_last_thread;\n"
            "   } "
            "   vstore_half3",roundName,"( vload3(i, p-adjust), i, f-adjust );\n"
            "}\n"
        };

        const char *source_private_store[] = {
            "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
            "{\n"
            "   __private ushort data[16];\n"
            "   size_t i = get_global_id(0);\n"
            "   size_t offset = 0;\n"
            "   size_t vecsize = vec_step(p[i]);\n"
            "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], 0, (__private half *)(&data[0]) );\n"
            "   for(offset = 0; offset < vecsize; offset++)\n"
            "   {\n"
            "       vstore_half(vload_half(offset, (__private half *)data), 0, &f[vecsize*i+offset]);\n"
            "   }\n"
            "}\n"
        };


        const char *source_private_store_v3[] = {
            "__kernel void test( __global float *p, __global half *f,\n"
            "                   uint extra_last_thread )\n"
            "{\n"
            "   __private ushort data[4];\n"
            "   size_t i = get_global_id(0);\n"
            "   size_t last_i = get_global_size(0)-1;\n"
            "   size_t adjust = 0;\n"
            "   size_t offset = 0;\n"
            "   if(last_i == i && extra_last_thread != 0) {\n"
            "     adjust = 3-extra_last_thread;\n"
            "   } "
            "   vstore_half3",roundName,"( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
            "   for(offset = 0; offset < 3; offset++)\n"
            "   {\n"
            "       vstore_half(vload_half(offset, (__private half *) data), 0, &f[3*i+offset-adjust]);\n"
            "   }\n"
            "}\n"
        };

        char local_buf_size[10];
        sprintf(local_buf_size, "%lld", (uint64_t)gWorkGroupSize);


        const char *source_local_store[] = {
            "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
            "{\n"
            "   __local ushort data[16*", local_buf_size, "];\n"
            "   size_t i = get_global_id(0);\n"
            "   size_t lid = get_local_id(0);\n"
            "   size_t lsize = get_local_size(0);\n"
            "   size_t vecsize = vec_step(p[0]);\n"
            "   event_t async_event;\n"
            "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], lid, (__local half *)(&data[0]) );\n"
            "   barrier( CLK_LOCAL_MEM_FENCE ); \n"
            "   async_event = async_work_group_copy((__global ushort *)f+vecsize*(i-lid), (__local ushort *)(&data[0]), vecsize*lsize, 0);\n" // investigate later
            "   wait_group_events(1, &async_event);\n"
            "}\n"
        };

        const char *source_local_store_v3[] = {
            "__kernel void test( __global float *p, __global half *f,\n"
            "                   uint extra_last_thread )\n"
            "{\n"
            "   __local ushort data[3*(", local_buf_size, "+1)];\n"
            "   size_t i = get_global_id(0);\n"
            "   size_t lid = get_local_id(0);\n"
            "   size_t last_i = get_global_size(0)-1;\n"
            "   size_t adjust = 0;\n"
            "   size_t lsize = get_local_size(0);\n"
            "   event_t async_event;\n"
            "   if(last_i == i && extra_last_thread != 0) {\n"
            "     adjust = 3-extra_last_thread;\n"
            "   } "
            "   vstore_half3",roundName,"( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
            "   barrier( CLK_LOCAL_MEM_FENCE ); \n"
            "   async_event = async_work_group_copy((__global ushort *)(f+3*(i-lid)), (__local ushort *)(&data[adjust]), lsize*3-adjust, 0);\n" // investigate later
            "   wait_group_events(1, &async_event);\n"
            "}\n"
        };

        const char *double_source[] = {
            "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
            "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
            "{\n"
            "   size_t i = get_global_id(0);\n"
            "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
            "}\n"
        };

        const char *double_source_private_store[] = {
            "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
            "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
            "{\n"
            "   __private ushort data[16];\n"
            "   size_t i = get_global_id(0);\n"
            "   size_t offset = 0;\n"
            "   size_t vecsize = vec_step(p[i]);\n"
            "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], 0, (__private half *)(&data[0]) );\n"
            "   for(offset = 0; offset < vecsize; offset++)\n"
            "   {\n"
            "       vstore_half(vload_half(offset, (__private half *)data), 0, &f[vecsize*i+offset]);\n"
            "   }\n"
            "}\n"
        };


        const char *double_source_local_store[] = {
            "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
            "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
            "{\n"
            "   __local ushort data[16*", local_buf_size, "];\n"
            "   size_t i = get_global_id(0);\n"
            "   size_t lid = get_local_id(0);\n"
            "   size_t vecsize = vec_step(p[0]);\n"
            "   size_t lsize = get_local_size(0);\n"
            "   event_t async_event;\n"
            "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], lid, (__local half *)(&data[0]) );\n"
            "   barrier( CLK_LOCAL_MEM_FENCE ); \n"
            "   async_event = async_work_group_copy((__global ushort *)(f+vecsize*(i-lid)), (__local ushort *)(&data[0]), vecsize*lsize, 0);\n" // investigate later
            "   wait_group_events(1, &async_event);\n"
            "}\n"
        };


        const char *double_source_v3[] = {
            "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
            "__kernel void test( __global double *p, __global half *f ,\n"
            "                   uint extra_last_thread)\n"
            "{\n"
            "   size_t i = get_global_id(0);\n"
            "   size_t last_i = get_global_size(0)-1;\n"
            "   size_t adjust = 0;\n"
            "   if(last_i == i && extra_last_thread != 0) {\n"
            "     adjust = 3-extra_last_thread;\n"
            "   } "
            "   vstore_half3",roundName,"( vload3(i,p-adjust), i, f -adjust);\n"
            "}\n"
        };

        const char *double_source_private_store_v3[] = {
            "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
            "__kernel void test( __global double *p, __global half *f,\n"
            "                   uint extra_last_thread )\n"
            "{\n"
            "   __private ushort data[4];\n"
            "   size_t i = get_global_id(0);\n"
            "   size_t last_i = get_global_size(0)-1;\n"
            "   size_t adjust = 0;\n"
            "   size_t offset = 0;\n"
            "   if(last_i == i && extra_last_thread != 0) {\n"
            "     adjust = 3-extra_last_thread;\n"
            "   } "
            "   vstore_half3",roundName,"( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
            "   for(offset = 0; offset < 3; offset++)\n"
            "   {\n"
            "       vstore_half(vload_half(offset, (__private half *)data), 0, &f[3*i+offset-adjust]);\n"
            "   }\n"
            "}\n"
        };

        const char *double_source_local_store_v3[] = {
            "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
            "__kernel void test( __global double *p, __global half *f,\n"
            "                   uint extra_last_thread )\n"
            "{\n"
            "   __local ushort data[3*(", local_buf_size, "+1)];\n"
            "   size_t i = get_global_id(0);\n"
            "   size_t lid = get_local_id(0);\n"
            "   size_t last_i = get_global_size(0)-1;\n"
            "   size_t adjust = 0;\n"
            "   size_t lsize = get_local_size(0);\n"
            "   event_t async_event;\n"
            "   if(last_i == i && extra_last_thread != 0) {\n"
            "     adjust = 3-extra_last_thread;\n"
            "   }\n "
            "   vstore_half3",roundName,"( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
            "   barrier( CLK_LOCAL_MEM_FENCE ); \n"
            "   async_event = async_work_group_copy((__global ushort *)(f+3*(i-lid)), (__local ushort *)(&data[adjust]), lsize*3-adjust, 0);\n" // investigate later
            "   wait_group_events(1, &async_event);\n"
            "}\n"
        };


        if(g_arrVecSizes[vectorSize] == 3) {
            programs[vectorSize][0] = MakeProgram( source_v3, sizeof(source_v3) / sizeof( source_v3[0]) );
        } else {
            programs[vectorSize][0] = MakeProgram( source, sizeof(source) / sizeof( source[0]) );
        }
        if( NULL == programs[ vectorSize ][0] )
        {
            gFailCount++;
            return -1;
        }

        kernels[ vectorSize ][0] = clCreateKernel( programs[ vectorSize ][0], "test", &error );
        if( NULL == kernels[vectorSize][0] )
        {
            gFailCount++;
            vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
            return error;
        }

        if(g_arrVecSizes[vectorSize] == 3) {
            programs[vectorSize][1] = MakeProgram( source_private_store_v3, sizeof(source_private_store_v3) / sizeof( source_private_store_v3[0]) );
        } else {
            programs[vectorSize][1] = MakeProgram( source_private_store, sizeof(source_private_store) / sizeof( source_private_store[0]) );
        }
        if( NULL == programs[ vectorSize ][1] )
        {
            gFailCount++;
            return -1;
        }

        kernels[ vectorSize ][1] = clCreateKernel( programs[ vectorSize ][1], "test", &error );
        if( NULL == kernels[vectorSize][1] )
        {
            gFailCount++;
            vlog_error( "\t\tFAILED -- Failed to create private kernel. (%d)\n", error );
            return error;
        }

        if(g_arrVecSizes[vectorSize] == 3) {
            programs[vectorSize][2] = MakeProgram( source_local_store_v3, sizeof(source_local_store_v3) / sizeof( source_local_store_v3[0]) );
            if(  NULL == programs[ vectorSize ][2] )
            {
                unsigned q;
                for ( q= 0; q < sizeof( source_local_store_v3) / sizeof( source_local_store_v3[0]); q++)
                    vlog_error("%s", source_local_store_v3[q]);

                gFailCount++;
                return -1;

            }
        } else {
            programs[vectorSize][2] = MakeProgram( source_local_store, sizeof(source_local_store) / sizeof( source_local_store[0]) );
            if( NULL == programs[ vectorSize ][2] )
            {
                unsigned q;
                for ( q= 0; q < sizeof( source_local_store) / sizeof( source_local_store[0]); q++)
                    vlog_error("%s", source_local_store[q]);

                gFailCount++;
                return -1;

            }
        }

        kernels[ vectorSize ][2] = clCreateKernel( programs[ vectorSize ][2], "test", &error );
        if( NULL == kernels[vectorSize][2] )
        {
            gFailCount++;
            vlog_error( "\t\tFAILED -- Failed to create local kernel. (%d)\n", error );
            return error;
        }

        if( gTestDouble )
        {
            if(g_arrVecSizes[vectorSize] == 3) {
                doublePrograms[vectorSize][0] = MakeProgram( double_source_v3, sizeof(double_source_v3) / sizeof( double_source_v3[0]) );
            } else {
                doublePrograms[vectorSize][0] = MakeProgram( double_source, sizeof(double_source) / sizeof( double_source[0]) );
            }
            if( NULL == doublePrograms[ vectorSize ][0] )
            {
                gFailCount++;
                return -1;
            }

            doubleKernels[ vectorSize ][0] = clCreateKernel( doublePrograms[ vectorSize ][0], "test", &error );
            if( NULL == kernels[vectorSize][0] )
            {
                gFailCount++;
                vlog_error( "\t\tFAILED -- Failed to create double kernel. (%d)\n", error );
                return error;
            }

            if(g_arrVecSizes[vectorSize] == 3)
                doublePrograms[vectorSize][1] = MakeProgram( double_source_private_store_v3, sizeof(double_source_private_store_v3) / sizeof( double_source_private_store_v3[0]) );
            else
                doublePrograms[vectorSize][1] = MakeProgram( double_source_private_store, sizeof(double_source_private_store) / sizeof( double_source_private_store[0]) );

            if( NULL == doublePrograms[ vectorSize ][1] )
            {
                gFailCount++;
                return -1;
            }

            doubleKernels[ vectorSize ][1] = clCreateKernel( doublePrograms[ vectorSize ][1], "test", &error );
            if( NULL == kernels[vectorSize][1] )
            {
                gFailCount++;
                vlog_error( "\t\tFAILED -- Failed to create double private kernel. (%d)\n", error );
                return error;
            }

            if(g_arrVecSizes[vectorSize] == 3) {
                doublePrograms[vectorSize][2] = MakeProgram( double_source_local_store_v3, sizeof(double_source_local_store_v3) / sizeof( double_source_local_store_v3[0]) );
            } else {
                doublePrograms[vectorSize][2] = MakeProgram( double_source_local_store, sizeof(double_source_local_store) / sizeof( double_source_local_store[0]) );
            }
            if( NULL == doublePrograms[ vectorSize ][2] )
            {
                gFailCount++;
                return -1;
            }

            doubleKernels[ vectorSize ][2] = clCreateKernel( doublePrograms[ vectorSize ][2], "test", &error );
            if( NULL == kernels[vectorSize][2] )
            {
                gFailCount++;
                vlog_error( "\t\tFAILED -- Failed to create double local kernel. (%d)\n", error );
                return error;
            }
        }
    } // end for vector size

    // Figure out how many elements are in a work block
    size_t elementSize = MAX( sizeof(cl_ushort), sizeof(float));
    size_t blockCount = BUFFER_SIZE / elementSize; // elementSize is power of 2
    uint64_t lastCase = 1ULL << (8*sizeof(float)); // number of floats.
    size_t stride = blockCount;

    if (gWimpyMode)
        stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;

    // we handle 64-bit types a bit differently.
    if( lastCase == 0 )
        lastCase = 0x100000000ULL;

    uint64_t i, j;
    error = 0;
    uint64_t printMask = (lastCase >> 4) - 1;
    cl_uint count = 0;
    int addressSpace;
    size_t loopCount;
    cl_uint threadCount = GetThreadCount();

    ComputeReferenceInfoF fref;
    fref.x = (float *)gIn_single;
    fref.r = (cl_ushort *)gOut_half_reference;
    fref.f = referenceFunc;
    fref.lim = blockCount;
    fref.count = (blockCount + threadCount - 1) / threadCount;

    CheckResultInfoF fchk;
    fchk.x = (const float *)gIn_single;
    fchk.r = (const cl_ushort *)gOut_half_reference;
    fchk.s = (const cl_ushort *)gOut_half;
    fchk.f = referenceFunc;
    fchk.lim = blockCount;
    fchk.count = (blockCount + threadCount - 1) / threadCount;

    ComputeReferenceInfoD dref;
    dref.x = (double *)gIn_double;
    dref.r = (cl_ushort *)gOut_half_reference_double;
    dref.f = doubleReferenceFunc;
    dref.lim = blockCount;
    dref.count = (blockCount + threadCount - 1) / threadCount;

    CheckResultInfoD dchk;
    dchk.x = (const double *)gIn_double;
    dchk.r = (const cl_ushort *)gOut_half_reference_double;
    dchk.s = (const cl_ushort *)gOut_half;
    dchk.f = doubleReferenceFunc;
    dchk.lim = blockCount;
    dchk.count = (blockCount + threadCount - 1) / threadCount;

    for( i = 0; i < lastCase; i += stride )
    {
        count = (cl_uint) MIN( blockCount, lastCase - i );
        fref.i = i;
        dref.i = i;

        // Compute the input and reference
        ThreadPool_Do(ReferenceF, threadCount, &fref);

        error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0, count * sizeof(float ), gIn_single, 0, NULL, NULL);
        if (error) {
            vlog_error( "Failure in clWriteBuffer\n" );
            gFailCount++;
            goto exit;
        }

        if (gTestDouble) {
            ThreadPool_Do(ReferenceD, threadCount, &dref);

            error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0, count * sizeof(double ), gIn_double, 0, NULL, NULL);
            if (error) {
                vlog_error( "Failure in clWriteBuffer\n" );
                gFailCount++;
                goto exit;
            }
        }

        for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) {
            // Loop through vector sizes
            fchk.vsz = g_arrVecSizes[vectorSize];
            dchk.vsz = g_arrVecSizes[vectorSize];

            for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
                // Loop over address spaces
                fchk.aspace = addressSpaceNames[addressSpace];
                dchk.aspace = addressSpaceNames[addressSpace];

                cl_uint pattern = 0xdeaddead;
                memset_pattern4( gOut_half, &pattern, BUFFER_SIZE/2);

                error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_FALSE, 0, count * sizeof(cl_ushort), gOut_half, 0, NULL, NULL);
                if (error) {
                    vlog_error( "Failure in clWriteArray\n" );
                    gFailCount++;
                    goto exit;
                }

                error = RunKernel(kernels[vectorSize][addressSpace], gInBuffer_single, gOutBuffer_half,
                                       numVecs(count, vectorSize, aligned) ,
                                  runsOverBy(count, vectorSize, aligned));
                if (error) {
                    gFailCount++;
                    goto exit;
                }

                error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_ushort), gOut_half, 0, NULL, NULL);
                if (error) {
                    vlog_error( "Failure in clReadArray\n" );
                    gFailCount++;
                    goto exit;
                }

                error = ThreadPool_Do(CheckF, threadCount, &fchk);
                if (error) {
                            gFailCount++;
                            goto exit;
                        }

                if (gTestDouble) {
                    memset_pattern4( gOut_half, &pattern, BUFFER_SIZE/2);

                    error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_FALSE, 0, count * sizeof(cl_ushort), gOut_half, 0, NULL, NULL);
                    if (error) {
                        vlog_error( "Failure in clWriteArray\n" );
                        gFailCount++;
                        goto exit;
                    }

                    error = RunKernel(doubleKernels[vectorSize][addressSpace], gInBuffer_double, gOutBuffer_half,
                                      numVecs(count, vectorSize, aligned),
                                      runsOverBy(count, vectorSize, aligned));
                    if (error) {
                        gFailCount++;
                        goto exit;
                    }

                    error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_ushort), gOut_half, 0, NULL, NULL);
                    if (error) {
                        vlog_error( "Failure in clReadArray\n" );
                        gFailCount++;
                        goto exit;
                    }

                    error = ThreadPool_Do(CheckD, threadCount, &dchk);
                    if (error) {
                                gFailCount++;
                                goto exit;
                            }
                        }
                    }
                }

        if( ((i+blockCount) & ~printMask) == (i+blockCount) )
        {
            vlog( "." );
            fflush( stdout );
        }
    }  // end last case

    loopCount = count == blockCount ? 1 : 100;
    if( gReportTimes )
    {
        //Init the input stream
        cl_float *p = (cl_float *)gIn_single;
        for( j = 0; j < count; j++ )
            p[j] = (float)((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));

        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0, count * sizeof( float ), gIn_single, 0, NULL, NULL)) )
        {
            vlog_error( "Failure in clWriteArray\n" );
            gFailCount++;
            goto exit;
        }

        if( gTestDouble )
        {
            //Init the input stream
            cl_double *q = (cl_double *)gIn_double;
            for( j = 0; j < count; j++ )
                q[j] = ((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));

            if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE, 0, count * sizeof( double ), gIn_double, 0, NULL, NULL)) )
            {
                vlog_error( "Failure in clWriteArray\n" );
                gFailCount++;
                goto exit;
            }
        }

        //Run again for timing
        for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
        {
            uint64_t bestTime = -1ULL;
            for( j = 0; j < loopCount; j++ )
            {
                uint64_t startTime = ReadTime();


                if( (error = RunKernel( kernels[vectorSize][0], gInBuffer_single, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
                                       runsOverBy(count, vectorSize, aligned)) ) )
                {
                    gFailCount++;
                    goto exit;
                }

                if( (error = clFinish(gQueue)) )
                {
                    vlog_error( "Failure in clFinish\n" );
                    gFailCount++;
                    goto exit;
                }
                uint64_t currentTime = ReadTime() - startTime;
                if( currentTime < bestTime )
                    bestTime = currentTime;
                time[ vectorSize ] += currentTime;
            }
            if( bestTime < min_time[ vectorSize ] )
                min_time[ vectorSize ] = bestTime ;

            if( gTestDouble )
            {
                bestTime = -1ULL;
                for( j = 0; j < loopCount; j++ )
                {
                    uint64_t startTime = ReadTime();
                    if( (error = RunKernel( doubleKernels[vectorSize][0], gInBuffer_double, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
                                           runsOverBy(count, vectorSize, aligned)) ) )
                    {
                        gFailCount++;
                        goto exit;
                    }

                    if( (error = clFinish(gQueue)) )
                    {
                        vlog_error( "Failure in clFinish\n" );
                        gFailCount++;
                        goto exit;
                    }
                    uint64_t currentTime = ReadTime() - startTime;
                    if( currentTime < bestTime )
                        bestTime = currentTime;
                    doubleTime[ vectorSize ] += currentTime;
                }
                if( bestTime < min_double_time[ vectorSize ] )
                    min_double_time[ vectorSize ] = bestTime;
            }
        }
    }

    if( 0 == gFailCount )
    {
        if( gWimpyMode )
        {
            vlog( "\tfloat: Wimp Passed\n" );
            if( gTestDouble )
                vlog( "\tdouble: Wimp Passed\n" );
        }
        else
        {
            vlog( "\tfloat Passed\n" );
            if( gTestDouble )
                vlog( "\tdouble Passed\n" );
        }
    }

    if( gReportTimes )
    {
        for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
            vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
                      "average us/elem", "vStoreHalf%s avg. (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
        for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
            vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
                      "best us/elem", "vStoreHalf%s best (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize])  );
        if( gTestDouble )
        {
            for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
                vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
                          "average us/elem (double)", "vStoreHalf%s avg. d (%s vector size: %d)", roundName, addressSpaceNames[0],  (g_arrVecSizes[vectorSize])  );
            for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
                vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
                          "best us/elem (double)", "vStoreHalf%s best d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
        }
    }

exit:
    //clean up
    for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
    {
        for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
            clReleaseKernel( kernels[ vectorSize ][ addressSpace ] );
            clReleaseProgram( programs[ vectorSize ][ addressSpace ] );
            if( gTestDouble )
            {
                clReleaseKernel( doubleKernels[ vectorSize ][addressSpace] );
                clReleaseProgram( doublePrograms[ vectorSize ][addressSpace] );
            }
        }
    }

    gTestCount++;
    return error;
}

int Test_vStoreaHalf_private( f2h referenceFunc, d2h doubleReferenceFunc, const char *roundName )
{
    int vectorSize, error;
    cl_program  programs[kVectorSizeCount+kStrangeVectorSizeCount][3];
    cl_kernel   kernels[kVectorSizeCount+kStrangeVectorSizeCount][3];

    uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
    uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
    memset( min_time, -1, sizeof( min_time ) );
    cl_program  doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount][3];
    cl_kernel   doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
    uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
    uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
    memset( min_double_time, -1, sizeof( min_double_time ) );

    bool aligned = true;

    vlog( "Testing vstorea_half%s\n", roundName );
    fflush( stdout );

    int minVectorSize = kMinVectorSize;
    // There is no aligned scalar vstorea_half
    if( 0 == minVectorSize )
        minVectorSize = 1;

    //Loop over vector sizes
    for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
    {
        const char *source[] = {
            "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
            "{\n"
            "   size_t i = get_global_id(0);\n"
            "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
            "}\n"
        };

        const char *source_v3[] = {
            "__kernel void test( __global float3 *p, __global half *f )\n"
            "{\n"
            "   size_t i = get_global_id(0);\n"
            "   vstorea_half3",roundName,"( p[i], i, f );\n"
            "   vstore_half",roundName,"( ((__global  float *)p)[4*i+3], 4*i+3, f);\n"
            "}\n"
        };

        const char *source_private[] = {
            "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
            "{\n"
            "   __private float", vector_size_name_extensions[vectorSize], " data;\n"
            "   size_t i = get_global_id(0);\n"
            "   data = p[i];\n"
            "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n"
            "}\n"
        };

        const char *source_private_v3[] = {
            "__kernel void test( __global float3 *p, __global half *f )\n"
            "{\n"
            "   __private float", vector_size_name_extensions[vectorSize], " data;\n"
            "   size_t i = get_global_id(0);\n"
            "   data = p[i];\n"
            "   vstorea_half3",roundName,"( data, i, f );\n"
            "   vstore_half",roundName,"( ((__global  float *)p)[4*i+3], 4*i+3, f);\n"
            "}\n"
        };

        char local_buf_size[10];
        sprintf(local_buf_size, "%lld", (uint64_t)gWorkGroupSize);
        const char *source_local[] = {
            "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
            "{\n"
            "   __local float", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
            "   size_t i = get_global_id(0);\n"
            "   size_t lid = get_local_id(0);\n"
            "   data[lid] = p[i];\n"
            "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
            "}\n"
        };

        const char *source_local_v3[] = {
            "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
            "{\n"
            "   __local float", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
            "   size_t i = get_global_id(0);\n"
            "   size_t lid = get_local_id(0);\n"
            "   data[lid] = p[i];\n"
            "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
            "   vstore_half",roundName,"( ((__global float *)p)[4*i+3], 4*i+3, f);\n"
            "}\n"
        };

        const char *double_source[] = {
            "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
            "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
            "{\n"
            "   size_t i = get_global_id(0);\n"
            "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
            "}\n"
        };

        const char *double_source_v3[] = {
            "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
            "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
            "{\n"
            "   size_t i = get_global_id(0);\n"
            "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
            "   vstore_half",roundName,"( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
            "}\n"
        };

        const char *double_source_private[] = {
            "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
            "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
            "{\n"
            "   __private double", vector_size_name_extensions[vectorSize], " data;\n"
            "   size_t i = get_global_id(0);\n"
            "   data = p[i];\n"
            "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n"
            "}\n"
        };

        const char *double_source_private_v3[] = {
            "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
            "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
            "{\n"
            "   __private double", vector_size_name_extensions[vectorSize], " data;\n"
            "   size_t i = get_global_id(0);\n"
            "   data = p[i];\n"
            "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n"
            "   vstore_half",roundName,"( ((__global  double *)p)[4*i+3], 4*i+3, f);\n"
            "}\n"
        };

        const char *double_source_local[] = {
            "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
            "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
            "{\n"
            "   __local double", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
            "   size_t i = get_global_id(0);\n"
            "   size_t lid = get_local_id(0);\n"
            "   data[lid] = p[i];\n"
            "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
            "}\n"
        };

        const char *double_source_local_v3[] = {
            "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
            "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
            "{\n"
            "   __local double", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
            "   size_t i = get_global_id(0);\n"
            "   size_t lid = get_local_id(0);\n"
            "   data[lid] = p[i];\n"
            "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
            "   vstore_half",roundName,"( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
            "}\n"
        };

        if(g_arrVecSizes[vectorSize] == 3) {
            programs[vectorSize][0] = MakeProgram( source_v3, sizeof(source_v3) / sizeof( source_v3[0]) );
            if( NULL == programs[ vectorSize ][0] )
            {
                gFailCount++;
                return -1;
            }
        } else {
            programs[vectorSize][0] = MakeProgram( source, sizeof(source) / sizeof( source[0]) );
            if( NULL == programs[ vectorSize ][0] )
            {
                gFailCount++;
                return -1;
            }
        }

        kernels[ vectorSize ][0] = clCreateKernel( programs[ vectorSize ][0], "test", &error );
        if( NULL == kernels[vectorSize][0] )
        {
            gFailCount++;
            vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
            return error;
        }

        if(g_arrVecSizes[vectorSize] == 3) {
            programs[vectorSize][1] = MakeProgram( source_private_v3, sizeof(source_private_v3) / sizeof( source_private_v3[0]) );
            if( NULL == programs[ vectorSize ][1] )
            {
                gFailCount++;
                return -1;
            }
        } else {
            programs[vectorSize][1] = MakeProgram( source_private, sizeof(source_private) / sizeof( source_private[0]) );
            if( NULL == programs[ vectorSize ][1] )
            {
                gFailCount++;
                return -1;
            }
        }

        kernels[ vectorSize ][1] = clCreateKernel( programs[ vectorSize ][1], "test", &error );
        if( NULL == kernels[vectorSize][1] )
        {
            gFailCount++;
            vlog_error( "\t\tFAILED -- Failed to create private kernel. (%d)\n", error );
            return error;
        }

        if(g_arrVecSizes[vectorSize] == 3) {
            programs[vectorSize][2] = MakeProgram( source_local_v3, sizeof(source_local_v3) / sizeof( source_local_v3[0]) );
            if( NULL == programs[ vectorSize ][2] )
            {
                gFailCount++;
                return -1;
            }
        } else {
            programs[vectorSize][2] = MakeProgram( source_local, sizeof(source_local) / sizeof( source_local[0]) );
            if( NULL == programs[ vectorSize ][2] )
            {
                gFailCount++;
                return -1;
            }
        }

        kernels[ vectorSize ][2] = clCreateKernel( programs[ vectorSize ][2], "test", &error );
        if( NULL == kernels[vectorSize][2] )
        {
            gFailCount++;
            vlog_error( "\t\tFAILED -- Failed to create local kernel. (%d)\n", error );
            return error;
        }

        if( gTestDouble )
        {
            if(g_arrVecSizes[vectorSize] == 3) {
                doublePrograms[vectorSize][0] = MakeProgram( double_source_v3, sizeof(double_source_v3) / sizeof( double_source_v3[0]) );
                if( NULL == doublePrograms[ vectorSize ][0] )
                {
                    gFailCount++;
                    return -1;
                }
            } else {
                doublePrograms[vectorSize][0] = MakeProgram( double_source, sizeof(double_source) / sizeof( double_source[0]) );
                if( NULL == doublePrograms[ vectorSize ][0] )
                {
                    gFailCount++;
                    return -1;
                }
            }

            doubleKernels[ vectorSize ][0] = clCreateKernel( doublePrograms[ vectorSize ][0], "test", &error );
            if( NULL == kernels[vectorSize][0] )
            {
                gFailCount++;
                vlog_error( "\t\tFAILED -- Failed to create double kernel. (%d)\n", error );
                return error;
            }

            if(g_arrVecSizes[vectorSize] == 3) {
                doublePrograms[vectorSize][1] = MakeProgram( double_source_private_v3, sizeof(double_source_private_v3) / sizeof( double_source_private_v3[0]) );
                if( NULL == doublePrograms[ vectorSize ][1] )
                {
                    gFailCount++;
                    return -1;
                }
            } else {
                doublePrograms[vectorSize][1] = MakeProgram( double_source_private, sizeof(double_source_private) / sizeof( double_source_private[0]) );
                if( NULL == doublePrograms[ vectorSize ][1] )
                {
                    gFailCount++;
                    return -1;
                }
            }

            doubleKernels[ vectorSize ][1] = clCreateKernel( doublePrograms[ vectorSize ][1], "test", &error );
            if( NULL == kernels[vectorSize][1] )
            {
                gFailCount++;
                vlog_error( "\t\tFAILED -- Failed to create double private kernel. (%d)\n", error );
                return error;
            }

            if(g_arrVecSizes[vectorSize] == 3) {
                doublePrograms[vectorSize][2] = MakeProgram( double_source_local_v3, sizeof(double_source_local_v3) / sizeof( double_source_local_v3[0]) );
                if( NULL == doublePrograms[ vectorSize ][2] )
                {
                    gFailCount++;
                    return -1;
                }
            } else {
                doublePrograms[vectorSize][2] = MakeProgram( double_source_local, sizeof(double_source_local) / sizeof( double_source_local[0]) );
                if( NULL == doublePrograms[ vectorSize ][2] )
                {
                    gFailCount++;
                    return -1;
                }
            }

            doubleKernels[ vectorSize ][2] = clCreateKernel( doublePrograms[ vectorSize ][2], "test", &error );
            if( NULL == kernels[vectorSize][2] )
            {
                gFailCount++;
                vlog_error( "\t\tFAILED -- Failed to create double local kernel. (%d)\n", error );
                return error;
            }
        }
    }

    // Figure out how many elements are in a work block
    size_t elementSize = MAX( sizeof(cl_ushort), sizeof(float));
    size_t blockCount = BUFFER_SIZE / elementSize;
    uint64_t lastCase = 1ULL << (8*sizeof(float));
    size_t stride = blockCount;

    if (gWimpyMode)
        stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;

    // we handle 64-bit types a bit differently.
    if( lastCase == 0 )
        lastCase = 0x100000000ULL;
    uint64_t i, j;
    error = 0;
    uint64_t printMask = (lastCase >> 4) - 1;
    cl_uint count = 0;
    int addressSpace;
    size_t loopCount;
    cl_uint threadCount = GetThreadCount();

    ComputeReferenceInfoF fref;
    fref.x = (float *)gIn_single;
    fref.r = (cl_ushort *)gOut_half_reference;
    fref.f = referenceFunc;
    fref.lim = blockCount;
    fref.count = (blockCount + threadCount - 1) / threadCount;

    CheckResultInfoF fchk;
    fchk.x = (const float *)gIn_single;
    fchk.r = (const cl_ushort *)gOut_half_reference;
    fchk.s = (const cl_ushort *)gOut_half;
    fchk.f = referenceFunc;
    fchk.lim = blockCount;
    fchk.count = (blockCount + threadCount - 1) / threadCount;

    ComputeReferenceInfoD dref;
    dref.x = (double *)gIn_double;
    dref.r = (cl_ushort *)gOut_half_reference_double;
    dref.f = doubleReferenceFunc;
    dref.lim = blockCount;
    dref.count = (blockCount + threadCount - 1) / threadCount;

    CheckResultInfoD dchk;
    dchk.x = (const double *)gIn_double;
    dchk.r = (const cl_ushort *)gOut_half_reference_double;
    dchk.s = (const cl_ushort *)gOut_half;
    dchk.f = doubleReferenceFunc;
    dchk.lim = blockCount;
    dchk.count = (blockCount + threadCount - 1) / threadCount;

    for( i = 0; i < (uint64_t)lastCase; i += stride )
    {
        count = (cl_uint) MIN( blockCount, lastCase - i );
        fref.i = i;
        dref.i = i;

        // Create the input and reference
        ThreadPool_Do(ReferenceF, threadCount, &fref);

        error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0, count * sizeof(float ), gIn_single, 0, NULL, NULL);
        if (error) {
            vlog_error( "Failure in clWriteArray\n" );
            gFailCount++;
            goto exit;
        }

        if (gTestDouble) {
            ThreadPool_Do(ReferenceD, threadCount, &dref);

            error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0, count * sizeof(double ), gIn_double, 0, NULL, NULL);
            if (error) {
                vlog_error( "Failure in clWriteArray\n" );
                gFailCount++;
                goto exit;
            }
        }

        for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) {
            // Loop over vector legths
            fchk.vsz = g_arrVecSizes[vectorSize];
            dchk.vsz = g_arrVecSizes[vectorSize];

            for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
                // Loop over address spaces
                fchk.aspace = addressSpaceNames[addressSpace];
                dchk.aspace = addressSpaceNames[addressSpace];

                cl_uint pattern = 0xdeaddead;
                memset_pattern4(gOut_half, &pattern, BUFFER_SIZE/2);

                error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_FALSE, 0, count * sizeof(cl_ushort), gOut_half, 0, NULL, NULL);
                if (error) {
                    vlog_error( "Failure in clWriteArray\n" );
                    gFailCount++;
                    goto exit;
                }

                error = RunKernel(kernels[vectorSize][addressSpace], gInBuffer_single, gOutBuffer_half,
                                  numVecs(count, vectorSize, aligned),
                                  runsOverBy(count, vectorSize, aligned));
                if (error) {
                    gFailCount++;
                    goto exit;
                }

                error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_ushort), gOut_half, 0, NULL, NULL);
                if (error) {
                    vlog_error( "Failure in clReadArray\n" );
                    gFailCount++;
                    goto exit;
                }

                error = ThreadPool_Do(CheckF, threadCount, &fchk);
                if (error) {
                            gFailCount++;
                            goto exit;
                        }

                if (gTestDouble) {
                    memset_pattern4(gOut_half, &pattern, BUFFER_SIZE/2);

                    error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_FALSE, 0, count * sizeof(cl_ushort), gOut_half, 0, NULL, NULL);
                    if (error) {
                        vlog_error( "Failure in clWriteArray\n" );
                        gFailCount++;
                        goto exit;
                    }

                    error = RunKernel(doubleKernels[vectorSize][addressSpace], gInBuffer_double, gOutBuffer_half,
                                      numVecs(count, vectorSize, aligned),
                                      runsOverBy(count, vectorSize, aligned));
                    if (error) {
                        gFailCount++;
                        goto exit;
                    }

                    error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_ushort), gOut_half, 0, NULL, NULL);
                    if (error) {
                        vlog_error( "Failure in clReadArray\n" );
                        gFailCount++;
                        goto exit;
                    }

                    error = ThreadPool_Do(CheckD, threadCount, &dchk);
                    if (error) {
                                gFailCount++;
                                goto exit;
                            }
                        }
                    }
        }  // end for vector size

        if( ((i+blockCount) & ~printMask) == (i+blockCount) ) {
            vlog( "." );
            fflush( stdout );
        }
    }  // for end lastcase

    loopCount = count == blockCount ? 1 : 100;
    if( gReportTimes )
    {
        //Init the input stream
        cl_float *p = (cl_float *)gIn_single;
        for( j = 0; j < count; j++ )
            p[j] = (float)((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));

        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0, count * sizeof( float ), gIn_single, 0, NULL, NULL)) )
        {
            vlog_error( "Failure in clWriteArray\n" );
            gFailCount++;
            goto exit;
        }

        if( gTestDouble )
        {
            //Init the input stream
            cl_double *q = (cl_double *)gIn_double;
            for( j = 0; j < count; j++ )
                q[j] = ((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));

            if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE, 0, count * sizeof( double ), gIn_double, 0, NULL, NULL)) )
            {
                vlog_error( "Failure in clWriteArray\n" );
                gFailCount++;
                goto exit;
            }
        }

        //Run again for timing
        for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
        {
            uint64_t bestTime = -1ULL;
            for( j = 0; j < loopCount; j++ )
            {
                uint64_t startTime = ReadTime();
                if( (error = RunKernel( kernels[vectorSize][0], gInBuffer_single, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
                                       runsOverBy(count, vectorSize, aligned)) ) )
                {
                    gFailCount++;
                    goto exit;
                }

                if( (error = clFinish(gQueue)) )
                {
                    vlog_error( "Failure in clFinish\n" );
                    gFailCount++;
                    goto exit;
                }
                uint64_t currentTime = ReadTime() - startTime;
                if( currentTime < bestTime )
                    bestTime = currentTime;
                time[ vectorSize ] += currentTime;
            }
            if( bestTime < min_time[ vectorSize ] )
                min_time[ vectorSize ] = bestTime ;

            if( gTestDouble )
            {
                bestTime = -1ULL;
                for( j = 0; j < loopCount; j++ )
                {
                    uint64_t startTime = ReadTime();
                    if( (error = RunKernel( doubleKernels[vectorSize][0], gInBuffer_double, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
                                           runsOverBy(count, vectorSize, aligned)) ) )
                    {
                        gFailCount++;
                        goto exit;
                    }

                    if( (error = clFinish(gQueue)) )
                    {
                        vlog_error( "Failure in clFinish\n" );
                        gFailCount++;
                        goto exit;
                    }
                    uint64_t currentTime = ReadTime() - startTime;
                    if( currentTime < bestTime )
                        bestTime = currentTime;
                    doubleTime[ vectorSize ] += currentTime;
                }
                if( bestTime < min_double_time[ vectorSize ] )
                    min_double_time[ vectorSize ] = bestTime;
            }
        }
    }

    if( gWimpyMode )
    {
        vlog( "\tfloat: Wimp Passed\n" );

        if( gTestDouble )
            vlog( "\tdouble: Wimp Passed\n" );
    }
    else
    {
        vlog( "\tfloat Passed\n" );
        if( gTestDouble )
            vlog( "\tdouble Passed\n" );
    }

    if( gReportTimes )
    {
        for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
            vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
                      "average us/elem", "vStoreaHalf%s avg. (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
        for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
            vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
                      "best us/elem", "vStoreaHalf%s best (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize])  );
        if( gTestDouble )
        {
            for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
                vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
                          "average us/elem (double)", "vStoreaHalf%s avg. d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize])  );
            for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
                vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
                          "best us/elem (double)", "vStoreaHalf%s best d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
        }
    }

exit:
    //clean up
    for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
    {
        for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
            clReleaseKernel( kernels[ vectorSize ][addressSpace] );
            clReleaseProgram( programs[ vectorSize ][addressSpace] );
            if( gTestDouble )
            {
                clReleaseKernel( doubleKernels[ vectorSize ][addressSpace] );
                clReleaseProgram( doublePrograms[ vectorSize ][addressSpace] );
            }
        }
    }

    gTestCount++;
    return error;
}