Files
OpenCL-CTS/test_conformance/half/Test_vStoreHalf.c
Kevin Petit 05a11d8e49 Synchronise with Khronos-private Gitlab branch
The maintenance of the conformance tests is moving to Github.

This commit contains all the changes that have been done in
Gitlab since the first public release of the conformance tests.

Signed-off-by: Kevin Petit kevin.petit@arm.com
2019-03-05 16:24:34 +00:00

1948 lines
68 KiB
C

//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "../../test_common/harness/compat.h"
#include "../../test_common/harness/kernelHelpers.h"
#include <string.h>
#include "cl_utils.h"
#include "tests.h"
extern const char *addressSpaceNames[];
typedef struct ComputeReferenceInfoF_
{
float *x;
cl_ushort *r;
f2h f;
cl_ulong i;
cl_uint lim;
cl_uint count;
} ComputeReferenceInfoF;
typedef struct ComputeReferenceInfoD_
{
double *x;
cl_ushort *r;
d2h f;
cl_ulong i;
cl_uint lim;
cl_uint count;
} ComputeReferenceInfoD;
typedef struct CheckResultInfoF_
{
const float *x;
const cl_ushort *r;
const cl_ushort *s;
f2h f;
const char *aspace;
cl_uint lim;
cl_uint count;
int vsz;
} CheckResultInfoF;
typedef struct CheckResultInfoD_
{
const double *x;
const cl_ushort *r;
const cl_ushort *s;
d2h f;
const char *aspace;
cl_uint lim;
cl_uint count;
int vsz;
} CheckResultInfoD;
static cl_int
ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
{
ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
cl_uint lim = cri->lim;
cl_uint count = cri->count;
cl_uint off = jid * count;
float *x = cri->x + off;
cl_ushort *r = cri->r + off;
f2h f = cri->f;
cl_ulong i = cri->i + off;
cl_uint j, rr;
if (off + count > lim)
count = lim - off;
for (j = 0; j < count; ++j) {
x[j] = as_float((cl_uint)(i + j));
r[j] = f(x[j]);
}
return 0;
}
static cl_int
CheckF(cl_uint jid, cl_uint tid, void *userInfo)
{
CheckResultInfoF *cri = (CheckResultInfoF *)userInfo;
cl_uint lim = cri->lim;
cl_uint count = cri->count;
cl_uint off = jid * count;
const float *x = cri->x + off;
const cl_ushort *r = cri->r + off;
const cl_ushort *s = cri->s + off;
f2h f = cri->f;
cl_uint j;
cl_ushort correct2 = f( 0.0f);
cl_ushort correct3 = f(-0.0f);
cl_int ret = 0;
if (off + count > lim)
count = lim - off;
if (!memcmp(r, s, count*sizeof(cl_ushort)))
return 0;
for (j = 0; j < count; j++) {
if (s[j] == r[j])
continue;
// Pass any NaNs
if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00 )
continue;
// retry per section 6.5.3.3
if (IsFloatSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3))
continue;
// if reference result is subnormal, pass any zero
if (gIsEmbedded && IsHalfSubnormal(r[j]) && (s[j] == 0x0000 || s[j] == 0x8000))
continue;
vlog_error("\nFailure at [%u] with %.6a: *0x%04x vs 0x%04x, vector_size = %d, address_space = %s\n",
j+off, x[j], r[j], s[j], cri->vsz, cri->aspace);
ret = 1;
break;
}
return ret;
}
static cl_int
ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
{
ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo;
cl_uint lim = cri->lim;
cl_uint count = cri->count;
cl_uint off = jid * count;
double *x = cri->x + off;
cl_ushort *r = cri->r + off;
d2h f = cri->f;
cl_uint j;
cl_ulong i = cri->i + off;
if (off + count > lim)
count = lim - off;
for (j = 0; j < count; ++j) {
x[j] = as_double(DoubleFromUInt((cl_uint)(i + j)));
r[j] = f(x[j]);
}
return 0;
}
static cl_int
CheckD(cl_uint jid, cl_uint tid, void *userInfo)
{
CheckResultInfoD *cri = (CheckResultInfoD *)userInfo;
cl_uint lim = cri->lim;
cl_uint count = cri->count;
cl_uint off = jid * count;
const double *x = cri->x + off;
const cl_ushort *r = cri->r + off;
const cl_ushort *s = cri->s + off;
d2h f = cri->f;
cl_uint j;
cl_ushort correct2 = f( 0.0);
cl_ushort correct3 = f(-0.0);
cl_int ret = 0;
if (off + count > lim)
count = lim - off;
if (!memcmp(r, s, count*sizeof(cl_ushort)))
return 0;
for (j = 0; j < count; j++) {
if (s[j] == r[j])
continue;
// Pass any NaNs
if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00)
continue;
if (IsDoubleSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3))
continue;
// if reference result is subnormal, pass any zero result
if (gIsEmbedded && IsHalfSubnormal(r[j]) && (s[j] == 0x0000 || s[j] == 0x8000))
continue;
vlog_error("\nFailure at [%u] with %.13la: *0x%04x vs 0x%04x, vector_size = %d, address space = %s (double precision)\n",
j+off, x[j], r[j], s[j], cri->vsz, cri->aspace);
ret = 1;
break;
}
return ret;
}
static cl_ushort float2half_rte( float f );
static cl_ushort float2half_rtz( float f );
static cl_ushort float2half_rtp( float f );
static cl_ushort float2half_rtn( float f );
static cl_ushort double2half_rte( double f );
static cl_ushort double2half_rtz( double f );
static cl_ushort double2half_rtp( double f );
static cl_ushort double2half_rtn( double f );
static cl_ushort
float2half_rte( float f )
{
union{ float f; cl_uint u; } u = {f};
cl_uint sign = (u.u >> 16) & 0x8000;
float x = fabsf(f);
//Nan
if( x != x )
{
u.u >>= (24-11);
u.u &= 0x7fff;
u.u |= 0x0200; //silence the NaN
return u.u | sign;
}
// overflow
if( x >= MAKE_HEX_FLOAT(0x1.ffep15f, 0x1ffeL, 3) )
return 0x7c00 | sign;
// underflow
if( x <= MAKE_HEX_FLOAT(0x1.0p-25f, 0x1L, -25) )
return sign; // The halfway case can return 0x0001 or 0. 0 is even.
// very small
if( x < MAKE_HEX_FLOAT(0x1.8p-24f, 0x18L, -28) )
return sign | 1;
// half denormal
if( x < MAKE_HEX_FLOAT(0x1.0p-14f, 0x1L, -14) )
{
u.f = x * MAKE_HEX_FLOAT(0x1.0p-125f, 0x1L, -125);
return sign | u.u;
}
u.f *= MAKE_HEX_FLOAT(0x1.0p13f, 0x1L, 13);
u.u &= 0x7f800000;
x += u.f;
u.f = x - u.f;
u.f *= MAKE_HEX_FLOAT(0x1.0p-112f, 0x1L, -112);
return (u.u >> (24-11)) | sign;
}
static cl_ushort
float2half_rtz( float f )
{
union{ float f; cl_uint u; } u = {f};
cl_uint sign = (u.u >> 16) & 0x8000;
float x = fabsf(f);
//Nan
if( x != x )
{
u.u >>= (24-11);
u.u &= 0x7fff;
u.u |= 0x0200; //silence the NaN
return u.u | sign;
}
// overflow
if( x >= MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) )
{
if( x == INFINITY )
return 0x7c00 | sign;
return 0x7bff | sign;
}
// underflow
if( x < MAKE_HEX_FLOAT(0x1.0p-24f, 0x1L, -24) )
return sign; // The halfway case can return 0x0001 or 0. 0 is even.
// half denormal
if( x < MAKE_HEX_FLOAT(0x1.0p-14f, 0x1L, -14) )
{
x *= MAKE_HEX_FLOAT(0x1.0p24f, 0x1L, 24);
return (cl_ushort)((int) x | sign);
}
u.u &= 0xFFFFE000U;
u.u -= 0x38000000U;
return (u.u >> (24-11)) | sign;
}
static cl_ushort
float2half_rtp( float f )
{
union{ float f; cl_uint u; } u = {f};
cl_uint sign = (u.u >> 16) & 0x8000;
float x = fabsf(f);
//Nan
if( x != x )
{
u.u >>= (24-11);
u.u &= 0x7fff;
u.u |= 0x0200; //silence the NaN
return u.u | sign;
}
// overflow
if( f > MAKE_HEX_FLOAT(0x1.ffcp15f, 0x1ffcL, 3) )
return 0x7c00;
if( f <= MAKE_HEX_FLOAT(-0x1.0p16f, -0x1L, 16) )
{
if( f == -INFINITY )
return 0xfc00;
return 0xfbff;
}
// underflow
if( x < MAKE_HEX_FLOAT(0x1.0p-24f, 0x1L, -24) )
{
if( f > 0 )
return 1;
return sign;
}
// half denormal
if( x < MAKE_HEX_FLOAT(0x1.0p-14f, 0x1L, -14) )
{
x *= MAKE_HEX_FLOAT(0x1.0p24f, 0x1L, 24);
int r = (int) x;
r += (float) r != x && f > 0.0f;
return (cl_ushort)( r | sign);
}
float g = u.f;
u.u &= 0xFFFFE000U;
if( g > u.f )
u.u += 0x00002000U;
u.u -= 0x38000000U;
return (u.u >> (24-11)) | sign;
}
static cl_ushort
float2half_rtn( float f )
{
union{ float f; cl_uint u; } u = {f};
cl_uint sign = (u.u >> 16) & 0x8000;
float x = fabsf(f);
//Nan
if( x != x )
{
u.u >>= (24-11);
u.u &= 0x7fff;
u.u |= 0x0200; //silence the NaN
return u.u | sign;
}
// overflow
if( f >= MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) )
{
if( f == INFINITY )
return 0x7c00;
return 0x7bff;
}
if( f < MAKE_HEX_FLOAT(-0x1.ffcp15f, -0x1ffcL, 3) )
return 0xfc00;
// underflow
if( x < MAKE_HEX_FLOAT(0x1.0p-24f, 0x1L, -24) )
{
if( f < 0 )
return 0x8001;
return sign;
}
// half denormal
if( x < MAKE_HEX_FLOAT(0x1.0p-14f, 0x1L, -14) )
{
x *= MAKE_HEX_FLOAT(0x1.0p24f, 0x1L, 24);
int r = (int) x;
r += (float) r != x && f < 0.0f;
return (cl_ushort)( r | sign);
}
u.u &= 0xFFFFE000U;
if( u.f > f )
u.u += 0x00002000U;
u.u -= 0x38000000U;
return (u.u >> (24-11)) | sign;
}
static cl_ushort
double2half_rte( double f )
{
union{ double f; cl_ulong u; } u = {f};
cl_ulong sign = (u.u >> 48) & 0x8000;
double x = fabs(f);
//Nan
if( x != x )
{
u.u >>= (53-11);
u.u &= 0x7fff;
u.u |= 0x0200; //silence the NaN
return u.u | sign;
}
// overflow
if( x >= MAKE_HEX_DOUBLE(0x1.ffep15, 0x1ffeLL, 3) )
return 0x7c00 | sign;
// underflow
if( x <= MAKE_HEX_DOUBLE(0x1.0p-25, 0x1LL, -25) )
return sign; // The halfway case can return 0x0001 or 0. 0 is even.
// very small
if( x < MAKE_HEX_DOUBLE(0x1.8p-24, 0x18LL, -28) )
return sign | 1;
// half denormal
if( x < MAKE_HEX_DOUBLE(0x1.0p-14, 0x1LL, -14) )
{
u.f = x * MAKE_HEX_DOUBLE(0x1.0p-1050, 0x1LL, -1050);
return sign | u.u;
}
u.f *= MAKE_HEX_DOUBLE(0x1.0p42, 0x1LL, 42);
u.u &= 0x7ff0000000000000ULL;
x += u.f;
u.f = x - u.f;
u.f *= MAKE_HEX_DOUBLE(0x1.0p-1008, 0x1LL, -1008);
return (u.u >> (53-11)) | sign;
}
static cl_ushort
double2half_rtz( double f )
{
union{ double f; cl_ulong u; } u = {f};
cl_ulong sign = (u.u >> 48) & 0x8000;
double x = fabs(f);
//Nan
if( x != x )
{
u.u >>= (53-11);
u.u &= 0x7fff;
u.u |= 0x0200; //silence the NaN
return u.u | sign;
}
if( x == INFINITY )
return 0x7c00 | sign;
// overflow
if( x >= MAKE_HEX_DOUBLE(0x1.0p16, 0x1LL, 16) )
return 0x7bff | sign;
// underflow
if( x < MAKE_HEX_DOUBLE(0x1.0p-24, 0x1LL, -24) )
return sign; // The halfway case can return 0x0001 or 0. 0 is even.
// half denormal
if( x < MAKE_HEX_DOUBLE(0x1.0p-14, 0x1LL, -14) )
{
x *= MAKE_HEX_FLOAT(0x1.0p24f, 0x1L, 24);
return (cl_ushort)((int) x | sign);
}
u.u &= 0xFFFFFC0000000000ULL;
u.u -= 0x3F00000000000000ULL;
return (u.u >> (53-11)) | sign;
}
static cl_ushort
double2half_rtp( double f )
{
union{ double f; cl_ulong u; } u = {f};
cl_ulong sign = (u.u >> 48) & 0x8000;
double x = fabs(f);
//Nan
if( x != x )
{
u.u >>= (53-11);
u.u &= 0x7fff;
u.u |= 0x0200; //silence the NaN
return u.u | sign;
}
// overflow
if( f > MAKE_HEX_DOUBLE(0x1.ffcp15, 0x1ffcLL, 3) )
return 0x7c00;
if( f <= MAKE_HEX_DOUBLE(-0x1.0p16, -0x1LL, 16) )
{
if( f == -INFINITY )
return 0xfc00;
return 0xfbff;
}
// underflow
if( x < MAKE_HEX_DOUBLE(0x1.0p-24, 0x1LL, -24) )
{
if( f > 0 )
return 1;
return sign;
}
// half denormal
if( x < MAKE_HEX_DOUBLE(0x1.0p-14, 0x1LL, -14) )
{
x *= MAKE_HEX_FLOAT(0x1.0p24f, 0x1L, 24);
int r = (int) x;
if( 0 == sign )
r += (double) r != x;
return (cl_ushort)( r | sign);
}
double g = u.f;
u.u &= 0xFFFFFC0000000000ULL;
if( g != u.f && 0 == sign)
u.u += 0x0000040000000000ULL;
u.u -= 0x3F00000000000000ULL;
return (u.u >> (53-11)) | sign;
}
static cl_ushort
double2half_rtn( double f )
{
union{ double f; cl_ulong u; } u = {f};
cl_ulong sign = (u.u >> 48) & 0x8000;
double x = fabs(f);
//Nan
if( x != x )
{
u.u >>= (53-11);
u.u &= 0x7fff;
u.u |= 0x0200; //silence the NaN
return u.u | sign;
}
// overflow
if( f >= MAKE_HEX_DOUBLE(0x1.0p16, 0x1LL, 16) )
{
if( f == INFINITY )
return 0x7c00;
return 0x7bff;
}
if( f < MAKE_HEX_DOUBLE(-0x1.ffcp15, -0x1ffcLL, 3) )
return 0xfc00;
// underflow
if( x < MAKE_HEX_DOUBLE(0x1.0p-24, 0x1LL, -24) )
{
if( f < 0 )
return 0x8001;
return sign;
}
// half denormal
if( x < MAKE_HEX_DOUBLE(0x1.0p-14, 0x1LL, -14) )
{
x *= MAKE_HEX_DOUBLE(0x1.0p24, 0x1LL, 24);
int r = (int) x;
if( sign )
r += (double) r != x;
return (cl_ushort)( r | sign);
}
double g = u.f;
u.u &= 0xFFFFFC0000000000ULL;
if( g < u.f && sign)
u.u += 0x0000040000000000ULL;
u.u -= 0x3F00000000000000ULL;
return (u.u >> (53-11)) | sign;
}
int Test_vstore_half( void )
{
switch (get_default_rounding_mode(gDevice))
{
case CL_FP_ROUND_TO_ZERO:
return Test_vStoreHalf_private(float2half_rtz, double2half_rte, "");
case 0:
return -1;
default:
return Test_vStoreHalf_private(float2half_rte, double2half_rte, "");
}
}
int Test_vstore_half_rte( void )
{
return Test_vStoreHalf_private(float2half_rte, double2half_rte, "_rte");
}
int Test_vstore_half_rtz( void )
{
return Test_vStoreHalf_private(float2half_rtz, double2half_rtz, "_rtz");
}
int Test_vstore_half_rtp( void )
{
return Test_vStoreHalf_private(float2half_rtp, double2half_rtp, "_rtp");
}
int Test_vstore_half_rtn( void )
{
return Test_vStoreHalf_private(float2half_rtn, double2half_rtn, "_rtn");
}
int Test_vstorea_half( void )
{
switch (get_default_rounding_mode(gDevice))
{
case CL_FP_ROUND_TO_ZERO:
return Test_vStoreaHalf_private(float2half_rtz, double2half_rte, "");
case 0:
return -1;
default:
return Test_vStoreaHalf_private(float2half_rte, double2half_rte, "");
}
}
int Test_vstorea_half_rte( void )
{
return Test_vStoreaHalf_private(float2half_rte, double2half_rte, "_rte");
}
int Test_vstorea_half_rtz( void )
{
return Test_vStoreaHalf_private(float2half_rtz, double2half_rtz, "_rtz");
}
int Test_vstorea_half_rtp( void )
{
return Test_vStoreaHalf_private(float2half_rtp, double2half_rtp, "_rtp");
}
int Test_vstorea_half_rtn( void )
{
return Test_vStoreaHalf_private(float2half_rtn, double2half_rtn, "_rtn");
}
#pragma mark -
int Test_vStoreHalf_private( f2h referenceFunc, d2h doubleReferenceFunc, const char *roundName )
{
int vectorSize, error;
cl_program programs[kVectorSizeCount+kStrangeVectorSizeCount][3];
cl_kernel kernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
memset( min_time, -1, sizeof( min_time ) );
cl_program doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount][3];
cl_kernel doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
memset( min_double_time, -1, sizeof( min_double_time ) );
vlog( "Testing vstore_half%s\n", roundName );
fflush( stdout );
bool aligned= false;
for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
{
const char *source[] = {
"__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
"{\n"
" size_t i = get_global_id(0);\n"
" vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
"}\n"
};
const char *source_v3[] = {
"__kernel void test( __global float *p, __global half *f,\n"
" uint extra_last_thread)\n"
"{\n"
" size_t i = get_global_id(0);\n"
" size_t last_i = get_global_size(0)-1;\n"
" size_t adjust = 0;\n"
" if(last_i == i && extra_last_thread != 0) {\n"
" adjust = 3-extra_last_thread;\n"
" } "
" vstore_half3",roundName,"( vload3(i, p-adjust), i, f-adjust );\n"
"}\n"
};
const char *source_private_store[] = {
"__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
"{\n"
" __private ushort data[16];\n"
" size_t i = get_global_id(0);\n"
" size_t offset = 0;\n"
" size_t vecsize = vec_step(p[i]);\n"
" vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], 0, (__private half *)(&data[0]) );\n"
" for(offset = 0; offset < vecsize; offset++)\n"
" {\n"
" vstore_half(vload_half(offset, (__private half *)data), 0, &f[vecsize*i+offset]);\n"
" }\n"
"}\n"
};
const char *source_private_store_v3[] = {
"__kernel void test( __global float *p, __global half *f,\n"
" uint extra_last_thread )\n"
"{\n"
" __private ushort data[4];\n"
" size_t i = get_global_id(0);\n"
" size_t last_i = get_global_size(0)-1;\n"
" size_t adjust = 0;\n"
" size_t offset = 0;\n"
" if(last_i == i && extra_last_thread != 0) {\n"
" adjust = 3-extra_last_thread;\n"
" } "
" vstore_half3",roundName,"( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
" for(offset = 0; offset < 3; offset++)\n"
" {\n"
" vstore_half(vload_half(offset, (__private half *) data), 0, &f[3*i+offset-adjust]);\n"
" }\n"
"}\n"
};
char local_buf_size[10];
sprintf(local_buf_size, "%lld", (uint64_t)gWorkGroupSize);
const char *source_local_store[] = {
"__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
"{\n"
" __local ushort data[16*", local_buf_size, "];\n"
" size_t i = get_global_id(0);\n"
" size_t lid = get_local_id(0);\n"
" size_t lsize = get_local_size(0);\n"
" size_t vecsize = vec_step(p[0]);\n"
" event_t async_event;\n"
" vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], lid, (__local half *)(&data[0]) );\n"
" barrier( CLK_LOCAL_MEM_FENCE ); \n"
" async_event = async_work_group_copy((__global ushort *)f+vecsize*(i-lid), (__local ushort *)(&data[0]), vecsize*lsize, 0);\n" // investigate later
" wait_group_events(1, &async_event);\n"
"}\n"
};
const char *source_local_store_v3[] = {
"__kernel void test( __global float *p, __global half *f,\n"
" uint extra_last_thread )\n"
"{\n"
" __local ushort data[3*(", local_buf_size, "+1)];\n"
" size_t i = get_global_id(0);\n"
" size_t lid = get_local_id(0);\n"
" size_t last_i = get_global_size(0)-1;\n"
" size_t adjust = 0;\n"
" size_t lsize = get_local_size(0);\n"
" event_t async_event;\n"
" if(last_i == i && extra_last_thread != 0) {\n"
" adjust = 3-extra_last_thread;\n"
" } "
" vstore_half3",roundName,"( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
" barrier( CLK_LOCAL_MEM_FENCE ); \n"
" async_event = async_work_group_copy((__global ushort *)(f+3*(i-lid)), (__local ushort *)(&data[adjust]), lsize*3-adjust, 0);\n" // investigate later
" wait_group_events(1, &async_event);\n"
"}\n"
};
const char *double_source[] = {
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
"{\n"
" size_t i = get_global_id(0);\n"
" vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
"}\n"
};
const char *double_source_private_store[] = {
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
"{\n"
" __private ushort data[16];\n"
" size_t i = get_global_id(0);\n"
" size_t offset = 0;\n"
" size_t vecsize = vec_step(p[i]);\n"
" vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], 0, (__private half *)(&data[0]) );\n"
" for(offset = 0; offset < vecsize; offset++)\n"
" {\n"
" vstore_half(vload_half(offset, (__private half *)data), 0, &f[vecsize*i+offset]);\n"
" }\n"
"}\n"
};
const char *double_source_local_store[] = {
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
"{\n"
" __local ushort data[16*", local_buf_size, "];\n"
" size_t i = get_global_id(0);\n"
" size_t lid = get_local_id(0);\n"
" size_t vecsize = vec_step(p[0]);\n"
" size_t lsize = get_local_size(0);\n"
" event_t async_event;\n"
" vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], lid, (__local half *)(&data[0]) );\n"
" barrier( CLK_LOCAL_MEM_FENCE ); \n"
" async_event = async_work_group_copy((__global ushort *)(f+vecsize*(i-lid)), (__local ushort *)(&data[0]), vecsize*lsize, 0);\n" // investigate later
" wait_group_events(1, &async_event);\n"
"}\n"
};
const char *double_source_v3[] = {
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"__kernel void test( __global double *p, __global half *f ,\n"
" uint extra_last_thread)\n"
"{\n"
" size_t i = get_global_id(0);\n"
" size_t last_i = get_global_size(0)-1;\n"
" size_t adjust = 0;\n"
" if(last_i == i && extra_last_thread != 0) {\n"
" adjust = 3-extra_last_thread;\n"
" } "
" vstore_half3",roundName,"( vload3(i,p-adjust), i, f -adjust);\n"
"}\n"
};
const char *double_source_private_store_v3[] = {
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"__kernel void test( __global double *p, __global half *f,\n"
" uint extra_last_thread )\n"
"{\n"
" __private ushort data[4];\n"
" size_t i = get_global_id(0);\n"
" size_t last_i = get_global_size(0)-1;\n"
" size_t adjust = 0;\n"
" size_t offset = 0;\n"
" if(last_i == i && extra_last_thread != 0) {\n"
" adjust = 3-extra_last_thread;\n"
" } "
" vstore_half3",roundName,"( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
" for(offset = 0; offset < 3; offset++)\n"
" {\n"
" vstore_half(vload_half(offset, (__private half *)data), 0, &f[3*i+offset-adjust]);\n"
" }\n"
"}\n"
};
const char *double_source_local_store_v3[] = {
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"__kernel void test( __global double *p, __global half *f,\n"
" uint extra_last_thread )\n"
"{\n"
" __local ushort data[3*(", local_buf_size, "+1)];\n"
" size_t i = get_global_id(0);\n"
" size_t lid = get_local_id(0);\n"
" size_t last_i = get_global_size(0)-1;\n"
" size_t adjust = 0;\n"
" size_t lsize = get_local_size(0);\n"
" event_t async_event;\n"
" if(last_i == i && extra_last_thread != 0) {\n"
" adjust = 3-extra_last_thread;\n"
" }\n "
" vstore_half3",roundName,"( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
" barrier( CLK_LOCAL_MEM_FENCE ); \n"
" async_event = async_work_group_copy((__global ushort *)(f+3*(i-lid)), (__local ushort *)(&data[adjust]), lsize*3-adjust, 0);\n" // investigate later
" wait_group_events(1, &async_event);\n"
"}\n"
};
if(g_arrVecSizes[vectorSize] == 3) {
programs[vectorSize][0] = MakeProgram( source_v3, sizeof(source_v3) / sizeof( source_v3[0]) );
} else {
programs[vectorSize][0] = MakeProgram( source, sizeof(source) / sizeof( source[0]) );
}
if( NULL == programs[ vectorSize ][0] )
{
gFailCount++;
return -1;
}
kernels[ vectorSize ][0] = clCreateKernel( programs[ vectorSize ][0], "test", &error );
if( NULL == kernels[vectorSize][0] )
{
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
return error;
}
if(g_arrVecSizes[vectorSize] == 3) {
programs[vectorSize][1] = MakeProgram( source_private_store_v3, sizeof(source_private_store_v3) / sizeof( source_private_store_v3[0]) );
} else {
programs[vectorSize][1] = MakeProgram( source_private_store, sizeof(source_private_store) / sizeof( source_private_store[0]) );
}
if( NULL == programs[ vectorSize ][1] )
{
gFailCount++;
return -1;
}
kernels[ vectorSize ][1] = clCreateKernel( programs[ vectorSize ][1], "test", &error );
if( NULL == kernels[vectorSize][1] )
{
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create private kernel. (%d)\n", error );
return error;
}
if(g_arrVecSizes[vectorSize] == 3) {
programs[vectorSize][2] = MakeProgram( source_local_store_v3, sizeof(source_local_store_v3) / sizeof( source_local_store_v3[0]) );
if( NULL == programs[ vectorSize ][2] )
{
unsigned q;
for ( q= 0; q < sizeof( source_local_store_v3) / sizeof( source_local_store_v3[0]); q++)
vlog_error("%s", source_local_store_v3[q]);
gFailCount++;
return -1;
}
} else {
programs[vectorSize][2] = MakeProgram( source_local_store, sizeof(source_local_store) / sizeof( source_local_store[0]) );
if( NULL == programs[ vectorSize ][2] )
{
unsigned q;
for ( q= 0; q < sizeof( source_local_store) / sizeof( source_local_store[0]); q++)
vlog_error("%s", source_local_store[q]);
gFailCount++;
return -1;
}
}
kernels[ vectorSize ][2] = clCreateKernel( programs[ vectorSize ][2], "test", &error );
if( NULL == kernels[vectorSize][2] )
{
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create local kernel. (%d)\n", error );
return error;
}
if( gTestDouble )
{
if(g_arrVecSizes[vectorSize] == 3) {
doublePrograms[vectorSize][0] = MakeProgram( double_source_v3, sizeof(double_source_v3) / sizeof( double_source_v3[0]) );
} else {
doublePrograms[vectorSize][0] = MakeProgram( double_source, sizeof(double_source) / sizeof( double_source[0]) );
}
if( NULL == doublePrograms[ vectorSize ][0] )
{
gFailCount++;
return -1;
}
doubleKernels[ vectorSize ][0] = clCreateKernel( doublePrograms[ vectorSize ][0], "test", &error );
if( NULL == kernels[vectorSize][0] )
{
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create double kernel. (%d)\n", error );
return error;
}
if(g_arrVecSizes[vectorSize] == 3)
doublePrograms[vectorSize][1] = MakeProgram( double_source_private_store_v3, sizeof(double_source_private_store_v3) / sizeof( double_source_private_store_v3[0]) );
else
doublePrograms[vectorSize][1] = MakeProgram( double_source_private_store, sizeof(double_source_private_store) / sizeof( double_source_private_store[0]) );
if( NULL == doublePrograms[ vectorSize ][1] )
{
gFailCount++;
return -1;
}
doubleKernels[ vectorSize ][1] = clCreateKernel( doublePrograms[ vectorSize ][1], "test", &error );
if( NULL == kernels[vectorSize][1] )
{
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create double private kernel. (%d)\n", error );
return error;
}
if(g_arrVecSizes[vectorSize] == 3) {
doublePrograms[vectorSize][2] = MakeProgram( double_source_local_store_v3, sizeof(double_source_local_store_v3) / sizeof( double_source_local_store_v3[0]) );
} else {
doublePrograms[vectorSize][2] = MakeProgram( double_source_local_store, sizeof(double_source_local_store) / sizeof( double_source_local_store[0]) );
}
if( NULL == doublePrograms[ vectorSize ][2] )
{
gFailCount++;
return -1;
}
doubleKernels[ vectorSize ][2] = clCreateKernel( doublePrograms[ vectorSize ][2], "test", &error );
if( NULL == kernels[vectorSize][2] )
{
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create double local kernel. (%d)\n", error );
return error;
}
}
} // end for vector size
// Figure out how many elements are in a work block
size_t elementSize = MAX( sizeof(cl_ushort), sizeof(float));
size_t blockCount = BUFFER_SIZE / elementSize; // elementSize is power of 2
uint64_t lastCase = 1ULL << (8*sizeof(float)); // number of floats.
size_t stride = blockCount;
if (gWimpyMode)
stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;
// we handle 64-bit types a bit differently.
if( lastCase == 0 )
lastCase = 0x100000000ULL;
uint64_t i, j;
error = 0;
uint64_t printMask = (lastCase >> 4) - 1;
cl_uint count = 0;
int addressSpace;
size_t loopCount;
cl_uint threadCount = GetThreadCount();
ComputeReferenceInfoF fref;
fref.x = (float *)gIn_single;
fref.r = (cl_ushort *)gOut_half_reference;
fref.f = referenceFunc;
fref.lim = blockCount;
fref.count = (blockCount + threadCount - 1) / threadCount;
CheckResultInfoF fchk;
fchk.x = (const float *)gIn_single;
fchk.r = (const cl_ushort *)gOut_half_reference;
fchk.s = (const cl_ushort *)gOut_half;
fchk.f = referenceFunc;
fchk.lim = blockCount;
fchk.count = (blockCount + threadCount - 1) / threadCount;
ComputeReferenceInfoD dref;
dref.x = (double *)gIn_double;
dref.r = (cl_ushort *)gOut_half_reference_double;
dref.f = doubleReferenceFunc;
dref.lim = blockCount;
dref.count = (blockCount + threadCount - 1) / threadCount;
CheckResultInfoD dchk;
dchk.x = (const double *)gIn_double;
dchk.r = (const cl_ushort *)gOut_half_reference_double;
dchk.s = (const cl_ushort *)gOut_half;
dchk.f = doubleReferenceFunc;
dchk.lim = blockCount;
dchk.count = (blockCount + threadCount - 1) / threadCount;
for( i = 0; i < lastCase; i += stride )
{
count = (cl_uint) MIN( blockCount, lastCase - i );
fref.i = i;
dref.i = i;
// Compute the input and reference
ThreadPool_Do(ReferenceF, threadCount, &fref);
error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0, count * sizeof(float ), gIn_single, 0, NULL, NULL);
if (error) {
vlog_error( "Failure in clWriteBuffer\n" );
gFailCount++;
goto exit;
}
if (gTestDouble) {
ThreadPool_Do(ReferenceD, threadCount, &dref);
error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0, count * sizeof(double ), gIn_double, 0, NULL, NULL);
if (error) {
vlog_error( "Failure in clWriteBuffer\n" );
gFailCount++;
goto exit;
}
}
for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) {
// Loop through vector sizes
fchk.vsz = g_arrVecSizes[vectorSize];
dchk.vsz = g_arrVecSizes[vectorSize];
for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
// Loop over address spaces
fchk.aspace = addressSpaceNames[addressSpace];
dchk.aspace = addressSpaceNames[addressSpace];
cl_uint pattern = 0xdeaddead;
memset_pattern4( gOut_half, &pattern, BUFFER_SIZE/2);
error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_FALSE, 0, count * sizeof(cl_ushort), gOut_half, 0, NULL, NULL);
if (error) {
vlog_error( "Failure in clWriteArray\n" );
gFailCount++;
goto exit;
}
error = RunKernel(kernels[vectorSize][addressSpace], gInBuffer_single, gOutBuffer_half,
numVecs(count, vectorSize, aligned) ,
runsOverBy(count, vectorSize, aligned));
if (error) {
gFailCount++;
goto exit;
}
error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_ushort), gOut_half, 0, NULL, NULL);
if (error) {
vlog_error( "Failure in clReadArray\n" );
gFailCount++;
goto exit;
}
error = ThreadPool_Do(CheckF, threadCount, &fchk);
if (error) {
gFailCount++;
goto exit;
}
if (gTestDouble) {
memset_pattern4( gOut_half, &pattern, BUFFER_SIZE/2);
error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_FALSE, 0, count * sizeof(cl_ushort), gOut_half, 0, NULL, NULL);
if (error) {
vlog_error( "Failure in clWriteArray\n" );
gFailCount++;
goto exit;
}
error = RunKernel(doubleKernels[vectorSize][addressSpace], gInBuffer_double, gOutBuffer_half,
numVecs(count, vectorSize, aligned),
runsOverBy(count, vectorSize, aligned));
if (error) {
gFailCount++;
goto exit;
}
error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_ushort), gOut_half, 0, NULL, NULL);
if (error) {
vlog_error( "Failure in clReadArray\n" );
gFailCount++;
goto exit;
}
error = ThreadPool_Do(CheckD, threadCount, &dchk);
if (error) {
gFailCount++;
goto exit;
}
}
}
}
if( ((i+blockCount) & ~printMask) == (i+blockCount) )
{
vlog( "." );
fflush( stdout );
}
} // end last case
loopCount = count == blockCount ? 1 : 100;
if( gReportTimes )
{
//Init the input stream
cl_float *p = (cl_float *)gIn_single;
for( j = 0; j < count; j++ )
p[j] = (float)((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0, count * sizeof( float ), gIn_single, 0, NULL, NULL)) )
{
vlog_error( "Failure in clWriteArray\n" );
gFailCount++;
goto exit;
}
if( gTestDouble )
{
//Init the input stream
cl_double *q = (cl_double *)gIn_double;
for( j = 0; j < count; j++ )
q[j] = ((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE, 0, count * sizeof( double ), gIn_double, 0, NULL, NULL)) )
{
vlog_error( "Failure in clWriteArray\n" );
gFailCount++;
goto exit;
}
}
//Run again for timing
for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
{
uint64_t bestTime = -1ULL;
for( j = 0; j < loopCount; j++ )
{
uint64_t startTime = ReadTime();
if( (error = RunKernel( kernels[vectorSize][0], gInBuffer_single, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
runsOverBy(count, vectorSize, aligned)) ) )
{
gFailCount++;
goto exit;
}
if( (error = clFinish(gQueue)) )
{
vlog_error( "Failure in clFinish\n" );
gFailCount++;
goto exit;
}
uint64_t currentTime = ReadTime() - startTime;
if( currentTime < bestTime )
bestTime = currentTime;
time[ vectorSize ] += currentTime;
}
if( bestTime < min_time[ vectorSize ] )
min_time[ vectorSize ] = bestTime ;
if( gTestDouble )
{
bestTime = -1ULL;
for( j = 0; j < loopCount; j++ )
{
uint64_t startTime = ReadTime();
if( (error = RunKernel( doubleKernels[vectorSize][0], gInBuffer_double, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
runsOverBy(count, vectorSize, aligned)) ) )
{
gFailCount++;
goto exit;
}
if( (error = clFinish(gQueue)) )
{
vlog_error( "Failure in clFinish\n" );
gFailCount++;
goto exit;
}
uint64_t currentTime = ReadTime() - startTime;
if( currentTime < bestTime )
bestTime = currentTime;
doubleTime[ vectorSize ] += currentTime;
}
if( bestTime < min_double_time[ vectorSize ] )
min_double_time[ vectorSize ] = bestTime;
}
}
}
if( 0 == gFailCount )
{
if( gWimpyMode )
{
vlog( "\tfloat: Wimp Passed\n" );
if( gTestDouble )
vlog( "\tdouble: Wimp Passed\n" );
}
else
{
vlog( "\tfloat Passed\n" );
if( gTestDouble )
vlog( "\tdouble Passed\n" );
}
}
if( gReportTimes )
{
for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
"average us/elem", "vStoreHalf%s avg. (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
"best us/elem", "vStoreHalf%s best (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
if( gTestDouble )
{
for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
"average us/elem (double)", "vStoreHalf%s avg. d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
"best us/elem (double)", "vStoreHalf%s best d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
}
}
exit:
//clean up
for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
{
for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
clReleaseKernel( kernels[ vectorSize ][ addressSpace ] );
clReleaseProgram( programs[ vectorSize ][ addressSpace ] );
if( gTestDouble )
{
clReleaseKernel( doubleKernels[ vectorSize ][addressSpace] );
clReleaseProgram( doublePrograms[ vectorSize ][addressSpace] );
}
}
}
gTestCount++;
return error;
}
int Test_vStoreaHalf_private( f2h referenceFunc, d2h doubleReferenceFunc, const char *roundName )
{
int vectorSize, error;
cl_program programs[kVectorSizeCount+kStrangeVectorSizeCount][3];
cl_kernel kernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
memset( min_time, -1, sizeof( min_time ) );
cl_program doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount][3];
cl_kernel doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
memset( min_double_time, -1, sizeof( min_double_time ) );
bool aligned = true;
vlog( "Testing vstorea_half%s\n", roundName );
fflush( stdout );
int minVectorSize = kMinVectorSize;
// There is no aligned scalar vstorea_half
if( 0 == minVectorSize )
minVectorSize = 1;
//Loop over vector sizes
for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
{
const char *source[] = {
"__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
"{\n"
" size_t i = get_global_id(0);\n"
" vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
"}\n"
};
const char *source_v3[] = {
"__kernel void test( __global float3 *p, __global half *f )\n"
"{\n"
" size_t i = get_global_id(0);\n"
" vstorea_half3",roundName,"( p[i], i, f );\n"
" vstore_half",roundName,"( ((__global float *)p)[4*i+3], 4*i+3, f);\n"
"}\n"
};
const char *source_private[] = {
"__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
"{\n"
" __private float", vector_size_name_extensions[vectorSize], " data;\n"
" size_t i = get_global_id(0);\n"
" data = p[i];\n"
" vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n"
"}\n"
};
const char *source_private_v3[] = {
"__kernel void test( __global float3 *p, __global half *f )\n"
"{\n"
" __private float", vector_size_name_extensions[vectorSize], " data;\n"
" size_t i = get_global_id(0);\n"
" data = p[i];\n"
" vstorea_half3",roundName,"( data, i, f );\n"
" vstore_half",roundName,"( ((__global float *)p)[4*i+3], 4*i+3, f);\n"
"}\n"
};
char local_buf_size[10];
sprintf(local_buf_size, "%lld", (uint64_t)gWorkGroupSize);
const char *source_local[] = {
"__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
"{\n"
" __local float", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
" size_t i = get_global_id(0);\n"
" size_t lid = get_local_id(0);\n"
" data[lid] = p[i];\n"
" vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
"}\n"
};
const char *source_local_v3[] = {
"__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
"{\n"
" __local float", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
" size_t i = get_global_id(0);\n"
" size_t lid = get_local_id(0);\n"
" data[lid] = p[i];\n"
" vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
" vstore_half",roundName,"( ((__global float *)p)[4*i+3], 4*i+3, f);\n"
"}\n"
};
const char *double_source[] = {
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
"{\n"
" size_t i = get_global_id(0);\n"
" vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
"}\n"
};
const char *double_source_v3[] = {
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
"{\n"
" size_t i = get_global_id(0);\n"
" vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
" vstore_half",roundName,"( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
"}\n"
};
const char *double_source_private[] = {
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
"{\n"
" __private double", vector_size_name_extensions[vectorSize], " data;\n"
" size_t i = get_global_id(0);\n"
" data = p[i];\n"
" vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n"
"}\n"
};
const char *double_source_private_v3[] = {
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
"{\n"
" __private double", vector_size_name_extensions[vectorSize], " data;\n"
" size_t i = get_global_id(0);\n"
" data = p[i];\n"
" vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n"
" vstore_half",roundName,"( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
"}\n"
};
const char *double_source_local[] = {
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
"{\n"
" __local double", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
" size_t i = get_global_id(0);\n"
" size_t lid = get_local_id(0);\n"
" data[lid] = p[i];\n"
" vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
"}\n"
};
const char *double_source_local_v3[] = {
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
"{\n"
" __local double", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
" size_t i = get_global_id(0);\n"
" size_t lid = get_local_id(0);\n"
" data[lid] = p[i];\n"
" vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
" vstore_half",roundName,"( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
"}\n"
};
if(g_arrVecSizes[vectorSize] == 3) {
programs[vectorSize][0] = MakeProgram( source_v3, sizeof(source_v3) / sizeof( source_v3[0]) );
if( NULL == programs[ vectorSize ][0] )
{
gFailCount++;
return -1;
}
} else {
programs[vectorSize][0] = MakeProgram( source, sizeof(source) / sizeof( source[0]) );
if( NULL == programs[ vectorSize ][0] )
{
gFailCount++;
return -1;
}
}
kernels[ vectorSize ][0] = clCreateKernel( programs[ vectorSize ][0], "test", &error );
if( NULL == kernels[vectorSize][0] )
{
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
return error;
}
if(g_arrVecSizes[vectorSize] == 3) {
programs[vectorSize][1] = MakeProgram( source_private_v3, sizeof(source_private_v3) / sizeof( source_private_v3[0]) );
if( NULL == programs[ vectorSize ][1] )
{
gFailCount++;
return -1;
}
} else {
programs[vectorSize][1] = MakeProgram( source_private, sizeof(source_private) / sizeof( source_private[0]) );
if( NULL == programs[ vectorSize ][1] )
{
gFailCount++;
return -1;
}
}
kernels[ vectorSize ][1] = clCreateKernel( programs[ vectorSize ][1], "test", &error );
if( NULL == kernels[vectorSize][1] )
{
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create private kernel. (%d)\n", error );
return error;
}
if(g_arrVecSizes[vectorSize] == 3) {
programs[vectorSize][2] = MakeProgram( source_local_v3, sizeof(source_local_v3) / sizeof( source_local_v3[0]) );
if( NULL == programs[ vectorSize ][2] )
{
gFailCount++;
return -1;
}
} else {
programs[vectorSize][2] = MakeProgram( source_local, sizeof(source_local) / sizeof( source_local[0]) );
if( NULL == programs[ vectorSize ][2] )
{
gFailCount++;
return -1;
}
}
kernels[ vectorSize ][2] = clCreateKernel( programs[ vectorSize ][2], "test", &error );
if( NULL == kernels[vectorSize][2] )
{
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create local kernel. (%d)\n", error );
return error;
}
if( gTestDouble )
{
if(g_arrVecSizes[vectorSize] == 3) {
doublePrograms[vectorSize][0] = MakeProgram( double_source_v3, sizeof(double_source_v3) / sizeof( double_source_v3[0]) );
if( NULL == doublePrograms[ vectorSize ][0] )
{
gFailCount++;
return -1;
}
} else {
doublePrograms[vectorSize][0] = MakeProgram( double_source, sizeof(double_source) / sizeof( double_source[0]) );
if( NULL == doublePrograms[ vectorSize ][0] )
{
gFailCount++;
return -1;
}
}
doubleKernels[ vectorSize ][0] = clCreateKernel( doublePrograms[ vectorSize ][0], "test", &error );
if( NULL == kernels[vectorSize][0] )
{
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create double kernel. (%d)\n", error );
return error;
}
if(g_arrVecSizes[vectorSize] == 3) {
doublePrograms[vectorSize][1] = MakeProgram( double_source_private_v3, sizeof(double_source_private_v3) / sizeof( double_source_private_v3[0]) );
if( NULL == doublePrograms[ vectorSize ][1] )
{
gFailCount++;
return -1;
}
} else {
doublePrograms[vectorSize][1] = MakeProgram( double_source_private, sizeof(double_source_private) / sizeof( double_source_private[0]) );
if( NULL == doublePrograms[ vectorSize ][1] )
{
gFailCount++;
return -1;
}
}
doubleKernels[ vectorSize ][1] = clCreateKernel( doublePrograms[ vectorSize ][1], "test", &error );
if( NULL == kernels[vectorSize][1] )
{
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create double private kernel. (%d)\n", error );
return error;
}
if(g_arrVecSizes[vectorSize] == 3) {
doublePrograms[vectorSize][2] = MakeProgram( double_source_local_v3, sizeof(double_source_local_v3) / sizeof( double_source_local_v3[0]) );
if( NULL == doublePrograms[ vectorSize ][2] )
{
gFailCount++;
return -1;
}
} else {
doublePrograms[vectorSize][2] = MakeProgram( double_source_local, sizeof(double_source_local) / sizeof( double_source_local[0]) );
if( NULL == doublePrograms[ vectorSize ][2] )
{
gFailCount++;
return -1;
}
}
doubleKernels[ vectorSize ][2] = clCreateKernel( doublePrograms[ vectorSize ][2], "test", &error );
if( NULL == kernels[vectorSize][2] )
{
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create double local kernel. (%d)\n", error );
return error;
}
}
}
// Figure out how many elements are in a work block
size_t elementSize = MAX( sizeof(cl_ushort), sizeof(float));
size_t blockCount = BUFFER_SIZE / elementSize;
uint64_t lastCase = 1ULL << (8*sizeof(float));
size_t stride = blockCount;
if (gWimpyMode)
stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;
// we handle 64-bit types a bit differently.
if( lastCase == 0 )
lastCase = 0x100000000ULL;
uint64_t i, j;
error = 0;
uint64_t printMask = (lastCase >> 4) - 1;
cl_uint count = 0;
int addressSpace;
size_t loopCount;
cl_uint threadCount = GetThreadCount();
ComputeReferenceInfoF fref;
fref.x = (float *)gIn_single;
fref.r = (cl_ushort *)gOut_half_reference;
fref.f = referenceFunc;
fref.lim = blockCount;
fref.count = (blockCount + threadCount - 1) / threadCount;
CheckResultInfoF fchk;
fchk.x = (const float *)gIn_single;
fchk.r = (const cl_ushort *)gOut_half_reference;
fchk.s = (const cl_ushort *)gOut_half;
fchk.f = referenceFunc;
fchk.lim = blockCount;
fchk.count = (blockCount + threadCount - 1) / threadCount;
ComputeReferenceInfoD dref;
dref.x = (double *)gIn_double;
dref.r = (cl_ushort *)gOut_half_reference_double;
dref.f = doubleReferenceFunc;
dref.lim = blockCount;
dref.count = (blockCount + threadCount - 1) / threadCount;
CheckResultInfoD dchk;
dchk.x = (const double *)gIn_double;
dchk.r = (const cl_ushort *)gOut_half_reference_double;
dchk.s = (const cl_ushort *)gOut_half;
dchk.f = doubleReferenceFunc;
dchk.lim = blockCount;
dchk.count = (blockCount + threadCount - 1) / threadCount;
for( i = 0; i < (uint64_t)lastCase; i += stride )
{
count = (cl_uint) MIN( blockCount, lastCase - i );
fref.i = i;
dref.i = i;
// Create the input and reference
ThreadPool_Do(ReferenceF, threadCount, &fref);
error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0, count * sizeof(float ), gIn_single, 0, NULL, NULL);
if (error) {
vlog_error( "Failure in clWriteArray\n" );
gFailCount++;
goto exit;
}
if (gTestDouble) {
ThreadPool_Do(ReferenceD, threadCount, &dref);
error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0, count * sizeof(double ), gIn_double, 0, NULL, NULL);
if (error) {
vlog_error( "Failure in clWriteArray\n" );
gFailCount++;
goto exit;
}
}
for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) {
// Loop over vector legths
fchk.vsz = g_arrVecSizes[vectorSize];
dchk.vsz = g_arrVecSizes[vectorSize];
for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
// Loop over address spaces
fchk.aspace = addressSpaceNames[addressSpace];
dchk.aspace = addressSpaceNames[addressSpace];
cl_uint pattern = 0xdeaddead;
memset_pattern4(gOut_half, &pattern, BUFFER_SIZE/2);
error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_FALSE, 0, count * sizeof(cl_ushort), gOut_half, 0, NULL, NULL);
if (error) {
vlog_error( "Failure in clWriteArray\n" );
gFailCount++;
goto exit;
}
error = RunKernel(kernels[vectorSize][addressSpace], gInBuffer_single, gOutBuffer_half,
numVecs(count, vectorSize, aligned),
runsOverBy(count, vectorSize, aligned));
if (error) {
gFailCount++;
goto exit;
}
error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_ushort), gOut_half, 0, NULL, NULL);
if (error) {
vlog_error( "Failure in clReadArray\n" );
gFailCount++;
goto exit;
}
error = ThreadPool_Do(CheckF, threadCount, &fchk);
if (error) {
gFailCount++;
goto exit;
}
if (gTestDouble) {
memset_pattern4(gOut_half, &pattern, BUFFER_SIZE/2);
error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_FALSE, 0, count * sizeof(cl_ushort), gOut_half, 0, NULL, NULL);
if (error) {
vlog_error( "Failure in clWriteArray\n" );
gFailCount++;
goto exit;
}
error = RunKernel(doubleKernels[vectorSize][addressSpace], gInBuffer_double, gOutBuffer_half,
numVecs(count, vectorSize, aligned),
runsOverBy(count, vectorSize, aligned));
if (error) {
gFailCount++;
goto exit;
}
error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_ushort), gOut_half, 0, NULL, NULL);
if (error) {
vlog_error( "Failure in clReadArray\n" );
gFailCount++;
goto exit;
}
error = ThreadPool_Do(CheckD, threadCount, &dchk);
if (error) {
gFailCount++;
goto exit;
}
}
}
} // end for vector size
if( ((i+blockCount) & ~printMask) == (i+blockCount) ) {
vlog( "." );
fflush( stdout );
}
} // for end lastcase
loopCount = count == blockCount ? 1 : 100;
if( gReportTimes )
{
//Init the input stream
cl_float *p = (cl_float *)gIn_single;
for( j = 0; j < count; j++ )
p[j] = (float)((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0, count * sizeof( float ), gIn_single, 0, NULL, NULL)) )
{
vlog_error( "Failure in clWriteArray\n" );
gFailCount++;
goto exit;
}
if( gTestDouble )
{
//Init the input stream
cl_double *q = (cl_double *)gIn_double;
for( j = 0; j < count; j++ )
q[j] = ((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE, 0, count * sizeof( double ), gIn_double, 0, NULL, NULL)) )
{
vlog_error( "Failure in clWriteArray\n" );
gFailCount++;
goto exit;
}
}
//Run again for timing
for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
{
uint64_t bestTime = -1ULL;
for( j = 0; j < loopCount; j++ )
{
uint64_t startTime = ReadTime();
if( (error = RunKernel( kernels[vectorSize][0], gInBuffer_single, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
runsOverBy(count, vectorSize, aligned)) ) )
{
gFailCount++;
goto exit;
}
if( (error = clFinish(gQueue)) )
{
vlog_error( "Failure in clFinish\n" );
gFailCount++;
goto exit;
}
uint64_t currentTime = ReadTime() - startTime;
if( currentTime < bestTime )
bestTime = currentTime;
time[ vectorSize ] += currentTime;
}
if( bestTime < min_time[ vectorSize ] )
min_time[ vectorSize ] = bestTime ;
if( gTestDouble )
{
bestTime = -1ULL;
for( j = 0; j < loopCount; j++ )
{
uint64_t startTime = ReadTime();
if( (error = RunKernel( doubleKernels[vectorSize][0], gInBuffer_double, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
runsOverBy(count, vectorSize, aligned)) ) )
{
gFailCount++;
goto exit;
}
if( (error = clFinish(gQueue)) )
{
vlog_error( "Failure in clFinish\n" );
gFailCount++;
goto exit;
}
uint64_t currentTime = ReadTime() - startTime;
if( currentTime < bestTime )
bestTime = currentTime;
doubleTime[ vectorSize ] += currentTime;
}
if( bestTime < min_double_time[ vectorSize ] )
min_double_time[ vectorSize ] = bestTime;
}
}
}
if( gWimpyMode )
{
vlog( "\tfloat: Wimp Passed\n" );
if( gTestDouble )
vlog( "\tdouble: Wimp Passed\n" );
}
else
{
vlog( "\tfloat Passed\n" );
if( gTestDouble )
vlog( "\tdouble Passed\n" );
}
if( gReportTimes )
{
for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
"average us/elem", "vStoreaHalf%s avg. (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
"best us/elem", "vStoreaHalf%s best (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
if( gTestDouble )
{
for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
"average us/elem (double)", "vStoreaHalf%s avg. d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
"best us/elem (double)", "vStoreaHalf%s best d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
}
}
exit:
//clean up
for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
{
for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
clReleaseKernel( kernels[ vectorSize ][addressSpace] );
clReleaseProgram( programs[ vectorSize ][addressSpace] );
if( gTestDouble )
{
clReleaseKernel( doubleKernels[ vectorSize ][addressSpace] );
clReleaseProgram( doublePrograms[ vectorSize ][addressSpace] );
}
}
}
gTestCount++;
return error;
}