mirror of
https://github.com/KhronosGroup/OpenCL-CTS.git
synced 2026-03-19 06:09:01 +00:00
- Make it a common parameter in harness using either '-w', '--wimpy' or 'CL_WIMPY_MODE' environment variable. - Remove all test specific wimpy variable. --------- Co-authored-by: Kévin Petit <kpet@free.fr>
2056 lines
70 KiB
C++
2056 lines
70 KiB
C++
//
|
|
// Copyright (c) 2017 The Khronos Group Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
#include "harness/compat.h"
|
|
#include "harness/kernelHelpers.h"
|
|
#include "harness/testHarness.h"
|
|
#include "harness/parseParameters.h"
|
|
|
|
#include <string.h>
|
|
|
|
#include <algorithm>
|
|
|
|
#include "cl_utils.h"
|
|
#include "tests.h"
|
|
|
|
#include <CL/cl_half.h>
|
|
|
|
typedef struct ComputeReferenceInfoF_
|
|
{
|
|
float *x;
|
|
cl_ushort *r;
|
|
f2h f;
|
|
cl_ulong i;
|
|
cl_uint lim;
|
|
cl_uint count;
|
|
} ComputeReferenceInfoF;
|
|
|
|
typedef struct ComputeReferenceInfoD_
|
|
{
|
|
double *x;
|
|
cl_ushort *r;
|
|
d2h f;
|
|
cl_ulong i;
|
|
cl_uint lim;
|
|
cl_uint count;
|
|
} ComputeReferenceInfoD;
|
|
|
|
typedef struct CheckResultInfoF_
|
|
{
|
|
const float *x;
|
|
const cl_ushort *r;
|
|
const cl_ushort *s;
|
|
f2h f;
|
|
const char *aspace;
|
|
cl_uint lim;
|
|
cl_uint count;
|
|
int vsz;
|
|
} CheckResultInfoF;
|
|
|
|
typedef struct CheckResultInfoD_
|
|
{
|
|
const double *x;
|
|
const cl_ushort *r;
|
|
const cl_ushort *s;
|
|
d2h f;
|
|
const char *aspace;
|
|
cl_uint lim;
|
|
cl_uint count;
|
|
int vsz;
|
|
} CheckResultInfoD;
|
|
|
|
static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
|
|
{
|
|
ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
|
|
cl_uint lim = cri->lim;
|
|
cl_uint count = cri->count;
|
|
cl_uint off = jid * count;
|
|
float *x = cri->x + off;
|
|
cl_ushort *r = cri->r + off;
|
|
f2h f = cri->f;
|
|
cl_ulong i = cri->i + off;
|
|
cl_uint j;
|
|
|
|
if (off + count > lim) count = lim - off;
|
|
|
|
for (j = 0; j < count; ++j)
|
|
{
|
|
x[j] = as_float((cl_uint)(i + j));
|
|
r[j] = f(x[j]);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static cl_int CheckF(cl_uint jid, cl_uint tid, void *userInfo)
|
|
{
|
|
CheckResultInfoF *cri = (CheckResultInfoF *)userInfo;
|
|
cl_uint lim = cri->lim;
|
|
cl_uint count = cri->count;
|
|
cl_uint off = jid * count;
|
|
const float *x = cri->x + off;
|
|
const cl_ushort *r = cri->r + off;
|
|
const cl_ushort *s = cri->s + off;
|
|
f2h f = cri->f;
|
|
cl_uint j;
|
|
cl_ushort correct2 = f(0.0f);
|
|
cl_ushort correct3 = f(-0.0f);
|
|
cl_int ret = 0;
|
|
|
|
if (off + count > lim) count = lim - off;
|
|
|
|
if (!memcmp(r, s, count * sizeof(cl_ushort))) return 0;
|
|
|
|
for (j = 0; j < count; j++)
|
|
{
|
|
if (s[j] == r[j]) continue;
|
|
|
|
// Pass any NaNs
|
|
if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00) continue;
|
|
|
|
// retry per section 6.5.3.3
|
|
if (IsFloatSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3))
|
|
continue;
|
|
|
|
// if reference result is subnormal, pass any zero
|
|
if (gIsEmbedded && IsHalfSubnormal(r[j])
|
|
&& (s[j] == 0x0000 || s[j] == 0x8000))
|
|
continue;
|
|
|
|
vlog_error("\nFailure at [%u] with %.6a: *0x%04x vs 0x%04x, "
|
|
"vector_size = %d, address_space = %s\n",
|
|
j + off, x[j], r[j], s[j], cri->vsz, cri->aspace);
|
|
|
|
ret = 1;
|
|
break;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
|
|
{
|
|
ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo;
|
|
cl_uint lim = cri->lim;
|
|
cl_uint count = cri->count;
|
|
cl_uint off = jid * count;
|
|
double *x = cri->x + off;
|
|
cl_ushort *r = cri->r + off;
|
|
d2h f = cri->f;
|
|
cl_uint j;
|
|
cl_ulong i = cri->i + off;
|
|
|
|
if (off + count > lim) count = lim - off;
|
|
|
|
for (j = 0; j < count; ++j)
|
|
{
|
|
x[j] = as_double(DoubleFromUInt((cl_uint)(i + j)));
|
|
r[j] = f(x[j]);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static cl_int CheckD(cl_uint jid, cl_uint tid, void *userInfo)
|
|
{
|
|
CheckResultInfoD *cri = (CheckResultInfoD *)userInfo;
|
|
cl_uint lim = cri->lim;
|
|
cl_uint count = cri->count;
|
|
cl_uint off = jid * count;
|
|
const double *x = cri->x + off;
|
|
const cl_ushort *r = cri->r + off;
|
|
const cl_ushort *s = cri->s + off;
|
|
d2h f = cri->f;
|
|
cl_uint j;
|
|
cl_ushort correct2 = f(0.0);
|
|
cl_ushort correct3 = f(-0.0);
|
|
cl_int ret = 0;
|
|
|
|
if (off + count > lim) count = lim - off;
|
|
|
|
if (!memcmp(r, s, count * sizeof(cl_ushort))) return 0;
|
|
|
|
for (j = 0; j < count; j++)
|
|
{
|
|
if (s[j] == r[j]) continue;
|
|
|
|
// Pass any NaNs
|
|
if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00) continue;
|
|
|
|
if (IsDoubleSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3))
|
|
continue;
|
|
|
|
// if reference result is subnormal, pass any zero result
|
|
if (gIsEmbedded && IsHalfSubnormal(r[j])
|
|
&& (s[j] == 0x0000 || s[j] == 0x8000))
|
|
continue;
|
|
|
|
vlog_error("\nFailure at [%u] with %.13la: *0x%04x vs 0x%04x, "
|
|
"vector_size = %d, address space = %s (double precision)\n",
|
|
j + off, x[j], r[j], s[j], cri->vsz, cri->aspace);
|
|
|
|
ret = 1;
|
|
break;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
static cl_half float2half_rte(float f)
|
|
{
|
|
return cl_half_from_float(f, CL_HALF_RTE);
|
|
}
|
|
|
|
static cl_half float2half_rtz(float f)
|
|
{
|
|
return cl_half_from_float(f, CL_HALF_RTZ);
|
|
}
|
|
|
|
static cl_half float2half_rtp(float f)
|
|
{
|
|
return cl_half_from_float(f, CL_HALF_RTP);
|
|
}
|
|
|
|
static cl_half float2half_rtn(float f)
|
|
{
|
|
return cl_half_from_float(f, CL_HALF_RTN);
|
|
}
|
|
|
|
static cl_half double2half_rte(double f)
|
|
{
|
|
return cl_half_from_double(f, CL_HALF_RTE);
|
|
}
|
|
|
|
static cl_half double2half_rtz(double f)
|
|
{
|
|
return cl_half_from_double(f, CL_HALF_RTZ);
|
|
}
|
|
|
|
static cl_half double2half_rtp(double f)
|
|
{
|
|
return cl_half_from_double(f, CL_HALF_RTP);
|
|
}
|
|
|
|
static cl_half double2half_rtn(double f)
|
|
{
|
|
return cl_half_from_double(f, CL_HALF_RTN);
|
|
}
|
|
|
|
REGISTER_TEST(vstore_half)
|
|
{
|
|
switch (get_default_rounding_mode(device))
|
|
{
|
|
case CL_FP_ROUND_TO_ZERO:
|
|
return Test_vStoreHalf_private(device, float2half_rtz,
|
|
double2half_rte, "");
|
|
case 0: return -1;
|
|
default:
|
|
return Test_vStoreHalf_private(device, float2half_rte,
|
|
double2half_rte, "");
|
|
}
|
|
}
|
|
|
|
REGISTER_TEST(vstore_half_rte)
|
|
{
|
|
return Test_vStoreHalf_private(device, float2half_rte, double2half_rte,
|
|
"_rte");
|
|
}
|
|
|
|
REGISTER_TEST(vstore_half_rtz)
|
|
{
|
|
return Test_vStoreHalf_private(device, float2half_rtz, double2half_rtz,
|
|
"_rtz");
|
|
}
|
|
|
|
REGISTER_TEST(vstore_half_rtp)
|
|
{
|
|
return Test_vStoreHalf_private(device, float2half_rtp, double2half_rtp,
|
|
"_rtp");
|
|
}
|
|
|
|
REGISTER_TEST(vstore_half_rtn)
|
|
{
|
|
return Test_vStoreHalf_private(device, float2half_rtn, double2half_rtn,
|
|
"_rtn");
|
|
}
|
|
|
|
REGISTER_TEST(vstorea_half)
|
|
{
|
|
switch (get_default_rounding_mode(device))
|
|
{
|
|
case CL_FP_ROUND_TO_ZERO:
|
|
return Test_vStoreaHalf_private(device, float2half_rtz,
|
|
double2half_rte, "");
|
|
case 0: return -1;
|
|
default:
|
|
return Test_vStoreaHalf_private(device, float2half_rte,
|
|
double2half_rte, "");
|
|
}
|
|
}
|
|
|
|
REGISTER_TEST(vstorea_half_rte)
|
|
{
|
|
return Test_vStoreaHalf_private(device, float2half_rte, double2half_rte,
|
|
"_rte");
|
|
}
|
|
|
|
REGISTER_TEST(vstorea_half_rtz)
|
|
{
|
|
return Test_vStoreaHalf_private(device, float2half_rtz, double2half_rtz,
|
|
"_rtz");
|
|
}
|
|
|
|
REGISTER_TEST(vstorea_half_rtp)
|
|
{
|
|
return Test_vStoreaHalf_private(device, float2half_rtp, double2half_rtp,
|
|
"_rtp");
|
|
}
|
|
|
|
REGISTER_TEST(vstorea_half_rtn)
|
|
{
|
|
return Test_vStoreaHalf_private(device, float2half_rtn, double2half_rtn,
|
|
"_rtn");
|
|
}
|
|
|
|
#pragma mark -
|
|
|
|
int Test_vStoreHalf_private(cl_device_id device, f2h referenceFunc,
|
|
d2h doubleReferenceFunc, const char *roundName)
|
|
{
|
|
int vectorSize, error;
|
|
cl_program programs[kVectorSizeCount + kStrangeVectorSizeCount][3];
|
|
cl_kernel kernels[kVectorSizeCount + kStrangeVectorSizeCount][3];
|
|
cl_program resetProgram = nullptr;
|
|
cl_kernel resetKernel = nullptr;
|
|
|
|
uint64_t time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
|
|
uint64_t min_time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
|
|
memset(min_time, -1, sizeof(min_time));
|
|
cl_program doublePrograms[kVectorSizeCount + kStrangeVectorSizeCount][3];
|
|
cl_kernel doubleKernels[kVectorSizeCount + kStrangeVectorSizeCount][3];
|
|
uint64_t doubleTime[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
|
|
uint64_t min_double_time[kVectorSizeCount + kStrangeVectorSizeCount] = {
|
|
0
|
|
};
|
|
memset(min_double_time, -1, sizeof(min_double_time));
|
|
|
|
bool aligned = false;
|
|
|
|
for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
|
|
vectorSize++)
|
|
{
|
|
const char *source[] = { "__kernel void test( __global float",
|
|
vector_size_name_extensions[vectorSize],
|
|
" *p, __global half *f )\n"
|
|
"{\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" vstore_half",
|
|
vector_size_name_extensions[vectorSize],
|
|
roundName,
|
|
"( p[i], i, f );\n"
|
|
"}\n" };
|
|
|
|
const char *source_v3[] = {
|
|
"__kernel void test( __global float *p, __global half *f,\n"
|
|
" uint extra_last_thread)\n"
|
|
"{\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" size_t last_i = get_global_size(0)-1;\n"
|
|
" size_t adjust = 0;\n"
|
|
" if(last_i == i && extra_last_thread != 0) {\n"
|
|
" adjust = 3-extra_last_thread;\n"
|
|
" } "
|
|
" vstore_half3",
|
|
roundName,
|
|
"( vload3(i, p-adjust), i, f-adjust );\n"
|
|
"}\n"
|
|
};
|
|
|
|
const char *source_private_store[] = {
|
|
"__kernel void test( __global float",
|
|
vector_size_name_extensions[vectorSize],
|
|
" *p, __global half *f )\n"
|
|
"{\n"
|
|
" __private ushort data[16];\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" size_t offset = 0;\n"
|
|
" size_t vecsize = vec_step(p[i]);\n"
|
|
" vstore_half",
|
|
vector_size_name_extensions[vectorSize],
|
|
roundName,
|
|
"( p[i], 0, (__private half *)(&data[0]) );\n"
|
|
" for(offset = 0; offset < vecsize; offset++)\n"
|
|
" {\n"
|
|
" vstore_half(vload_half(offset, (__private half *)data), 0, "
|
|
"&f[vecsize*i+offset]);\n"
|
|
" }\n"
|
|
"}\n"
|
|
};
|
|
|
|
|
|
const char *source_private_store_v3[] = {
|
|
"__kernel void test( __global float *p, __global half *f,\n"
|
|
" uint extra_last_thread )\n"
|
|
"{\n"
|
|
" __private ushort data[4];\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" size_t last_i = get_global_size(0)-1;\n"
|
|
" size_t adjust = 0;\n"
|
|
" size_t offset = 0;\n"
|
|
" if(last_i == i && extra_last_thread != 0) {\n"
|
|
" adjust = 3-extra_last_thread;\n"
|
|
" } "
|
|
" vstore_half3",
|
|
roundName,
|
|
"( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
|
|
" for(offset = 0; offset < 3; offset++)\n"
|
|
" {\n"
|
|
" vstore_half(vload_half(offset, (__private half *) data), "
|
|
"0, &f[3*i+offset-adjust]);\n"
|
|
" }\n"
|
|
"}\n"
|
|
};
|
|
|
|
char local_buf_size[10];
|
|
sprintf(local_buf_size, "%zu", gWorkGroupSize);
|
|
|
|
|
|
const char *source_local_store[] = {
|
|
"__kernel void test( __global float",
|
|
vector_size_name_extensions[vectorSize],
|
|
" *p, __global half *f )\n"
|
|
"{\n"
|
|
" __local ushort data[16*",
|
|
local_buf_size,
|
|
"];\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" size_t lid = get_local_id(0);\n"
|
|
" size_t lsize = get_local_size(0);\n"
|
|
" size_t vecsize = vec_step(p[0]);\n"
|
|
" event_t async_event;\n"
|
|
" vstore_half",
|
|
vector_size_name_extensions[vectorSize],
|
|
roundName,
|
|
"( p[i], lid, (__local half *)(&data[0]) );\n"
|
|
" barrier( CLK_LOCAL_MEM_FENCE ); \n"
|
|
" async_event = async_work_group_copy((__global ushort "
|
|
"*)f+vecsize*(i-lid), (__local ushort *)(&data[0]), vecsize*lsize, "
|
|
"0);\n" // investigate later
|
|
" wait_group_events(1, &async_event);\n"
|
|
"}\n"
|
|
};
|
|
|
|
const char *source_local_store_v3[] = {
|
|
"__kernel void test( __global float *p, __global half *f,\n"
|
|
" uint extra_last_thread )\n"
|
|
"{\n"
|
|
" __local ushort data[3*(",
|
|
local_buf_size,
|
|
"+1)];\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" size_t lid = get_local_id(0);\n"
|
|
" size_t last_i = get_global_size(0)-1;\n"
|
|
" size_t adjust = 0;\n"
|
|
" size_t lsize = get_local_size(0);\n"
|
|
" event_t async_event;\n"
|
|
" if(last_i == i && extra_last_thread != 0) {\n"
|
|
" adjust = 3-extra_last_thread;\n"
|
|
" } "
|
|
" vstore_half3",
|
|
roundName,
|
|
"( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
|
|
" barrier( CLK_LOCAL_MEM_FENCE ); \n"
|
|
" if (get_group_id(0) == (get_num_groups(0) - 1) &&\n"
|
|
" extra_last_thread != 0) {\n"
|
|
" adjust = 3-extra_last_thread;\n"
|
|
" }\n"
|
|
" async_event = async_work_group_copy(\n"
|
|
" (__global ushort*)(f+3*(i-lid)),\n"
|
|
" (__local ushort *)(&data[adjust]),\n"
|
|
" lsize*3-adjust, 0);\n" // investigate later
|
|
" wait_group_events(1, &async_event);\n"
|
|
"}\n"
|
|
};
|
|
|
|
const char *double_source[] = {
|
|
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
|
|
"__kernel void test( __global double",
|
|
vector_size_name_extensions[vectorSize],
|
|
" *p, __global half *f )\n"
|
|
"{\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" vstore_half",
|
|
vector_size_name_extensions[vectorSize],
|
|
roundName,
|
|
"( p[i], i, f );\n"
|
|
"}\n"
|
|
};
|
|
|
|
const char *double_source_private_store[] = {
|
|
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
|
|
"__kernel void test( __global double",
|
|
vector_size_name_extensions[vectorSize],
|
|
" *p, __global half *f )\n"
|
|
"{\n"
|
|
" __private ushort data[16];\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" size_t offset = 0;\n"
|
|
" size_t vecsize = vec_step(p[i]);\n"
|
|
" vstore_half",
|
|
vector_size_name_extensions[vectorSize],
|
|
roundName,
|
|
"( p[i], 0, (__private half *)(&data[0]) );\n"
|
|
" for(offset = 0; offset < vecsize; offset++)\n"
|
|
" {\n"
|
|
" vstore_half(vload_half(offset, (__private half *)data), 0, "
|
|
"&f[vecsize*i+offset]);\n"
|
|
" }\n"
|
|
"}\n"
|
|
};
|
|
|
|
|
|
const char *double_source_local_store[] = {
|
|
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
|
|
"__kernel void test( __global double",
|
|
vector_size_name_extensions[vectorSize],
|
|
" *p, __global half *f )\n"
|
|
"{\n"
|
|
" __local ushort data[16*",
|
|
local_buf_size,
|
|
"];\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" size_t lid = get_local_id(0);\n"
|
|
" size_t vecsize = vec_step(p[0]);\n"
|
|
" size_t lsize = get_local_size(0);\n"
|
|
" event_t async_event;\n"
|
|
" vstore_half",
|
|
vector_size_name_extensions[vectorSize],
|
|
roundName,
|
|
"( p[i], lid, (__local half *)(&data[0]) );\n"
|
|
" barrier( CLK_LOCAL_MEM_FENCE ); \n"
|
|
" async_event = async_work_group_copy((__global ushort "
|
|
"*)(f+vecsize*(i-lid)), (__local ushort *)(&data[0]), "
|
|
"vecsize*lsize, 0);\n" // investigate later
|
|
" wait_group_events(1, &async_event);\n"
|
|
"}\n"
|
|
};
|
|
|
|
|
|
const char *double_source_v3[] = {
|
|
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
|
|
"__kernel void test( __global double *p, __global half *f ,\n"
|
|
" uint extra_last_thread)\n"
|
|
"{\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" size_t last_i = get_global_size(0)-1;\n"
|
|
" size_t adjust = 0;\n"
|
|
" if(last_i == i && extra_last_thread != 0) {\n"
|
|
" adjust = 3-extra_last_thread;\n"
|
|
" } "
|
|
" vstore_half3",
|
|
roundName,
|
|
"( vload3(i,p-adjust), i, f -adjust);\n"
|
|
"}\n"
|
|
};
|
|
|
|
const char *double_source_private_store_v3[] = {
|
|
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
|
|
"__kernel void test( __global double *p, __global half *f,\n"
|
|
" uint extra_last_thread )\n"
|
|
"{\n"
|
|
" __private ushort data[4];\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" size_t last_i = get_global_size(0)-1;\n"
|
|
" size_t adjust = 0;\n"
|
|
" size_t offset = 0;\n"
|
|
" if(last_i == i && extra_last_thread != 0) {\n"
|
|
" adjust = 3-extra_last_thread;\n"
|
|
" } "
|
|
" vstore_half3",
|
|
roundName,
|
|
"( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
|
|
" for(offset = 0; offset < 3; offset++)\n"
|
|
" {\n"
|
|
" vstore_half(vload_half(offset, (__private half *)data), 0, "
|
|
"&f[3*i+offset-adjust]);\n"
|
|
" }\n"
|
|
"}\n"
|
|
};
|
|
|
|
const char *double_source_local_store_v3[] = {
|
|
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
|
|
"__kernel void test( __global double *p, __global half *f,\n"
|
|
" uint extra_last_thread )\n"
|
|
"{\n"
|
|
" __local ushort data[3*(",
|
|
local_buf_size,
|
|
"+1)];\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" size_t lid = get_local_id(0);\n"
|
|
" size_t last_i = get_global_size(0)-1;\n"
|
|
" size_t adjust = 0;\n"
|
|
" size_t lsize = get_local_size(0);\n"
|
|
" event_t async_event;\n"
|
|
" if(last_i == i && extra_last_thread != 0) {\n"
|
|
" adjust = 3-extra_last_thread;\n"
|
|
" }\n "
|
|
" vstore_half3",
|
|
roundName,
|
|
"( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
|
|
" barrier( CLK_LOCAL_MEM_FENCE ); \n"
|
|
" if (get_group_id(0) == (get_num_groups(0) - 1) &&\n"
|
|
" extra_last_thread != 0) {\n"
|
|
" adjust = 3-extra_last_thread;\n"
|
|
" }\n"
|
|
" async_event = async_work_group_copy(\n"
|
|
" (__global ushort *)(f+3*(i-lid)),\n"
|
|
" (__local ushort *)(&data[adjust]),\n"
|
|
" lsize*3-adjust, 0);\n" // investigate later
|
|
" wait_group_events(1, &async_event);\n"
|
|
"}\n"
|
|
};
|
|
|
|
|
|
if (g_arrVecSizes[vectorSize] == 3)
|
|
{
|
|
programs[vectorSize][0] = MakeProgram(
|
|
device, source_v3, sizeof(source_v3) / sizeof(source_v3[0]));
|
|
}
|
|
else
|
|
{
|
|
programs[vectorSize][0] =
|
|
MakeProgram(device, source, sizeof(source) / sizeof(source[0]));
|
|
}
|
|
if (NULL == programs[vectorSize][0])
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
|
|
kernels[vectorSize][0] =
|
|
clCreateKernel(programs[vectorSize][0], "test", &error);
|
|
if (NULL == kernels[vectorSize][0])
|
|
{
|
|
gFailCount++;
|
|
vlog_error("\t\tFAILED -- Failed to create kernel. (%d)\n", error);
|
|
return error;
|
|
}
|
|
|
|
if (g_arrVecSizes[vectorSize] == 3)
|
|
{
|
|
programs[vectorSize][1] =
|
|
MakeProgram(device, source_private_store_v3,
|
|
sizeof(source_private_store_v3)
|
|
/ sizeof(source_private_store_v3[0]));
|
|
}
|
|
else
|
|
{
|
|
programs[vectorSize][1] = MakeProgram(
|
|
device, source_private_store,
|
|
sizeof(source_private_store) / sizeof(source_private_store[0]));
|
|
}
|
|
if (NULL == programs[vectorSize][1])
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
|
|
kernels[vectorSize][1] =
|
|
clCreateKernel(programs[vectorSize][1], "test", &error);
|
|
if (NULL == kernels[vectorSize][1])
|
|
{
|
|
gFailCount++;
|
|
vlog_error("\t\tFAILED -- Failed to create private kernel. (%d)\n",
|
|
error);
|
|
return error;
|
|
}
|
|
|
|
if (g_arrVecSizes[vectorSize] == 3)
|
|
{
|
|
programs[vectorSize][2] =
|
|
MakeProgram(device, source_local_store_v3,
|
|
sizeof(source_local_store_v3)
|
|
/ sizeof(source_local_store_v3[0]));
|
|
if (NULL == programs[vectorSize][2])
|
|
{
|
|
unsigned q;
|
|
for (q = 0; q < sizeof(source_local_store_v3)
|
|
/ sizeof(source_local_store_v3[0]);
|
|
q++)
|
|
vlog_error("%s", source_local_store_v3[q]);
|
|
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
programs[vectorSize][2] = MakeProgram(
|
|
device, source_local_store,
|
|
sizeof(source_local_store) / sizeof(source_local_store[0]));
|
|
if (NULL == programs[vectorSize][2])
|
|
{
|
|
unsigned q;
|
|
for (q = 0; q < sizeof(source_local_store)
|
|
/ sizeof(source_local_store[0]);
|
|
q++)
|
|
vlog_error("%s", source_local_store[q]);
|
|
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
kernels[vectorSize][2] =
|
|
clCreateKernel(programs[vectorSize][2], "test", &error);
|
|
if (NULL == kernels[vectorSize][2])
|
|
{
|
|
gFailCount++;
|
|
vlog_error("\t\tFAILED -- Failed to create local kernel. (%d)\n",
|
|
error);
|
|
return error;
|
|
}
|
|
|
|
if (gTestDouble)
|
|
{
|
|
if (g_arrVecSizes[vectorSize] == 3)
|
|
{
|
|
doublePrograms[vectorSize][0] = MakeProgram(
|
|
device, double_source_v3,
|
|
sizeof(double_source_v3) / sizeof(double_source_v3[0]));
|
|
}
|
|
else
|
|
{
|
|
doublePrograms[vectorSize][0] = MakeProgram(
|
|
device, double_source,
|
|
sizeof(double_source) / sizeof(double_source[0]));
|
|
}
|
|
if (NULL == doublePrograms[vectorSize][0])
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
|
|
doubleKernels[vectorSize][0] =
|
|
clCreateKernel(doublePrograms[vectorSize][0], "test", &error);
|
|
if (NULL == kernels[vectorSize][0])
|
|
{
|
|
gFailCount++;
|
|
vlog_error(
|
|
"\t\tFAILED -- Failed to create double kernel. (%d)\n",
|
|
error);
|
|
return error;
|
|
}
|
|
|
|
if (g_arrVecSizes[vectorSize] == 3)
|
|
doublePrograms[vectorSize][1] = MakeProgram(
|
|
device, double_source_private_store_v3,
|
|
sizeof(double_source_private_store_v3)
|
|
/ sizeof(double_source_private_store_v3[0]));
|
|
else
|
|
doublePrograms[vectorSize][1] =
|
|
MakeProgram(device, double_source_private_store,
|
|
sizeof(double_source_private_store)
|
|
/ sizeof(double_source_private_store[0]));
|
|
|
|
if (NULL == doublePrograms[vectorSize][1])
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
|
|
doubleKernels[vectorSize][1] =
|
|
clCreateKernel(doublePrograms[vectorSize][1], "test", &error);
|
|
if (NULL == kernels[vectorSize][1])
|
|
{
|
|
gFailCount++;
|
|
vlog_error("\t\tFAILED -- Failed to create double private "
|
|
"kernel. (%d)\n",
|
|
error);
|
|
return error;
|
|
}
|
|
|
|
if (g_arrVecSizes[vectorSize] == 3)
|
|
{
|
|
doublePrograms[vectorSize][2] =
|
|
MakeProgram(device, double_source_local_store_v3,
|
|
sizeof(double_source_local_store_v3)
|
|
/ sizeof(double_source_local_store_v3[0]));
|
|
}
|
|
else
|
|
{
|
|
doublePrograms[vectorSize][2] =
|
|
MakeProgram(device, double_source_local_store,
|
|
sizeof(double_source_local_store)
|
|
/ sizeof(double_source_local_store[0]));
|
|
}
|
|
if (NULL == doublePrograms[vectorSize][2])
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
|
|
doubleKernels[vectorSize][2] =
|
|
clCreateKernel(doublePrograms[vectorSize][2], "test", &error);
|
|
if (NULL == kernels[vectorSize][2])
|
|
{
|
|
gFailCount++;
|
|
vlog_error("\t\tFAILED -- Failed to create double local "
|
|
"kernel. (%d)\n",
|
|
error);
|
|
return error;
|
|
}
|
|
}
|
|
} // end for vector size
|
|
|
|
const char *reset[] = {
|
|
"__kernel void reset( __global float *p, __global ushort *f,\n"
|
|
" uint extra_last_thread)\n"
|
|
"{\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" *(f + i) = 0xdead;"
|
|
"}\n"
|
|
};
|
|
|
|
if (!gHostReset)
|
|
{
|
|
resetProgram =
|
|
MakeProgram(device, reset, sizeof(reset) / sizeof(reset[0]));
|
|
if (NULL == resetProgram)
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
resetKernel = clCreateKernel(resetProgram, "reset", &error);
|
|
if (NULL == resetKernel)
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
// Figure out how many elements are in a work block
|
|
size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float));
|
|
size_t blockCount = BUFFER_SIZE / elementSize; // elementSize is power of 2
|
|
uint64_t lastCase = 1ULL << (8 * sizeof(float)); // number of floats.
|
|
size_t stride = blockCount;
|
|
|
|
if (gWimpyMode)
|
|
stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;
|
|
|
|
// we handle 64-bit types a bit differently.
|
|
if (lastCase == 0) lastCase = 0x100000000ULL;
|
|
|
|
uint64_t i, j;
|
|
error = 0;
|
|
uint64_t printMask = (lastCase >> 4) - 1;
|
|
cl_uint count = 0;
|
|
int addressSpace;
|
|
size_t loopCount;
|
|
cl_uint threadCount = GetThreadCount();
|
|
|
|
ComputeReferenceInfoF fref;
|
|
fref.x = (float *)gIn_single;
|
|
fref.r = (cl_half *)gOut_half_reference;
|
|
fref.f = referenceFunc;
|
|
fref.lim = blockCount;
|
|
fref.count = (blockCount + threadCount - 1) / threadCount;
|
|
|
|
CheckResultInfoF fchk;
|
|
fchk.x = (const float *)gIn_single;
|
|
fchk.r = (const cl_half *)gOut_half_reference;
|
|
fchk.s = (const cl_half *)gOut_half;
|
|
fchk.f = referenceFunc;
|
|
fchk.lim = blockCount;
|
|
fchk.count = (blockCount + threadCount - 1) / threadCount;
|
|
|
|
ComputeReferenceInfoD dref;
|
|
dref.x = (double *)gIn_double;
|
|
dref.r = (cl_half *)gOut_half_reference_double;
|
|
dref.f = doubleReferenceFunc;
|
|
dref.lim = blockCount;
|
|
dref.count = (blockCount + threadCount - 1) / threadCount;
|
|
|
|
CheckResultInfoD dchk;
|
|
dchk.x = (const double *)gIn_double;
|
|
dchk.r = (const cl_half *)gOut_half_reference_double;
|
|
dchk.s = (const cl_half *)gOut_half;
|
|
dchk.f = doubleReferenceFunc;
|
|
dchk.lim = blockCount;
|
|
dchk.count = (blockCount + threadCount - 1) / threadCount;
|
|
|
|
for (i = 0; i < lastCase; i += stride)
|
|
{
|
|
count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i);
|
|
fref.i = i;
|
|
dref.i = i;
|
|
|
|
// Compute the input and reference
|
|
ThreadPool_Do(ReferenceF, threadCount, &fref);
|
|
|
|
error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0,
|
|
count * sizeof(float), gIn_single, 0, NULL,
|
|
NULL);
|
|
if (error)
|
|
{
|
|
vlog_error("Failure in clWriteBuffer\n");
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
if (gTestDouble)
|
|
{
|
|
ThreadPool_Do(ReferenceD, threadCount, &dref);
|
|
|
|
error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0,
|
|
count * sizeof(double), gIn_double, 0,
|
|
NULL, NULL);
|
|
if (error)
|
|
{
|
|
vlog_error("Failure in clWriteBuffer\n");
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
}
|
|
|
|
for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
|
|
vectorSize++)
|
|
{
|
|
// Loop through vector sizes
|
|
fchk.vsz = g_arrVecSizes[vectorSize];
|
|
dchk.vsz = g_arrVecSizes[vectorSize];
|
|
|
|
for (addressSpace = 0; addressSpace < 3; addressSpace++)
|
|
{
|
|
// Loop over address spaces
|
|
fchk.aspace = addressSpaceNames[addressSpace];
|
|
dchk.aspace = addressSpaceNames[addressSpace];
|
|
|
|
if (!gHostReset)
|
|
{
|
|
error = RunKernel(device, resetKernel, gInBuffer_single,
|
|
gOutBuffer_half, count, 0);
|
|
}
|
|
else
|
|
{
|
|
cl_uint pattern = 0xdeaddead;
|
|
memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2);
|
|
|
|
error = clEnqueueWriteBuffer(
|
|
gQueue, gOutBuffer_half, CL_FALSE, 0,
|
|
count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
|
|
}
|
|
if (error)
|
|
{
|
|
vlog_error("Failure in clWriteArray\n");
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
error = RunKernel(device, kernels[vectorSize][addressSpace],
|
|
gInBuffer_single, gOutBuffer_half,
|
|
numVecs(count, vectorSize, aligned),
|
|
runsOverBy(count, vectorSize, aligned));
|
|
if (error)
|
|
{
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0,
|
|
count * sizeof(cl_half), gOut_half,
|
|
0, NULL, NULL);
|
|
if (error)
|
|
{
|
|
vlog_error("Failure in clReadArray\n");
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
error = ThreadPool_Do(CheckF, threadCount, &fchk);
|
|
if (error)
|
|
{
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
if (gTestDouble)
|
|
{
|
|
|
|
if (!gHostReset)
|
|
{
|
|
error = RunKernel(device, resetKernel, gInBuffer_double,
|
|
gOutBuffer_half, count, 0);
|
|
}
|
|
else
|
|
{
|
|
cl_uint pattern = 0xdeaddead;
|
|
memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2);
|
|
|
|
error = clEnqueueWriteBuffer(
|
|
gQueue, gOutBuffer_half, CL_FALSE, 0,
|
|
count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
|
|
}
|
|
if (error)
|
|
{
|
|
vlog_error("Failure in clWriteArray\n");
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
error = RunKernel(device,
|
|
doubleKernels[vectorSize][addressSpace],
|
|
gInBuffer_double, gOutBuffer_half,
|
|
numVecs(count, vectorSize, aligned),
|
|
runsOverBy(count, vectorSize, aligned));
|
|
if (error)
|
|
{
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
error = clEnqueueReadBuffer(
|
|
gQueue, gOutBuffer_half, CL_TRUE, 0,
|
|
count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
|
|
if (error)
|
|
{
|
|
vlog_error("Failure in clReadArray\n");
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
|
|
error = ThreadPool_Do(CheckD, threadCount, &dchk);
|
|
if (error)
|
|
{
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (((i + blockCount) & ~printMask) == (i + blockCount))
|
|
{
|
|
vlog(".");
|
|
fflush(stdout);
|
|
}
|
|
} // end last case
|
|
|
|
loopCount = count == blockCount ? 1 : 100;
|
|
if (gReportTimes)
|
|
{
|
|
// Init the input stream
|
|
cl_float *p = (cl_float *)gIn_single;
|
|
for (j = 0; j < count; j++)
|
|
p[j] = (float)((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2));
|
|
|
|
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0,
|
|
count * sizeof(float), gIn_single, 0,
|
|
NULL, NULL)))
|
|
{
|
|
vlog_error("Failure in clWriteArray\n");
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
if (gTestDouble)
|
|
{
|
|
// Init the input stream
|
|
cl_double *q = (cl_double *)gIn_double;
|
|
for (j = 0; j < count; j++)
|
|
q[j] = ((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2));
|
|
|
|
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE,
|
|
0, count * sizeof(double),
|
|
gIn_double, 0, NULL, NULL)))
|
|
{
|
|
vlog_error("Failure in clWriteArray\n");
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
}
|
|
|
|
// Run again for timing
|
|
for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
|
|
vectorSize++)
|
|
{
|
|
uint64_t bestTime = -1ULL;
|
|
for (j = 0; j < loopCount; j++)
|
|
{
|
|
uint64_t startTime = ReadTime();
|
|
|
|
|
|
if ((error = RunKernel(device, kernels[vectorSize][0],
|
|
gInBuffer_single, gOutBuffer_half,
|
|
numVecs(count, vectorSize, aligned),
|
|
runsOverBy(count, vectorSize, aligned))))
|
|
{
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
if ((error = clFinish(gQueue)))
|
|
{
|
|
vlog_error("Failure in clFinish\n");
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
uint64_t currentTime = ReadTime() - startTime;
|
|
if (currentTime < bestTime) bestTime = currentTime;
|
|
time[vectorSize] += currentTime;
|
|
}
|
|
if (bestTime < min_time[vectorSize])
|
|
min_time[vectorSize] = bestTime;
|
|
|
|
if (gTestDouble)
|
|
{
|
|
bestTime = -1ULL;
|
|
for (j = 0; j < loopCount; j++)
|
|
{
|
|
uint64_t startTime = ReadTime();
|
|
if ((error =
|
|
RunKernel(device, doubleKernels[vectorSize][0],
|
|
gInBuffer_double, gOutBuffer_half,
|
|
numVecs(count, vectorSize, aligned),
|
|
runsOverBy(count, vectorSize, aligned))))
|
|
{
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
if ((error = clFinish(gQueue)))
|
|
{
|
|
vlog_error("Failure in clFinish\n");
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
uint64_t currentTime = ReadTime() - startTime;
|
|
if (currentTime < bestTime) bestTime = currentTime;
|
|
doubleTime[vectorSize] += currentTime;
|
|
}
|
|
if (bestTime < min_double_time[vectorSize])
|
|
min_double_time[vectorSize] = bestTime;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (gReportTimes)
|
|
{
|
|
for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
|
|
vectorSize++)
|
|
vlog_perf(SubtractTime(time[vectorSize], 0) * 1e6 * gDeviceFrequency
|
|
* gComputeDevices / (double)(count * loopCount),
|
|
0, "average us/elem",
|
|
"vStoreHalf%s avg. (%s vector size: %d)", roundName,
|
|
addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
|
|
for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
|
|
vectorSize++)
|
|
vlog_perf(SubtractTime(min_time[vectorSize], 0) * 1e6
|
|
* gDeviceFrequency * gComputeDevices / (double)count,
|
|
0, "best us/elem",
|
|
"vStoreHalf%s best (%s vector size: %d)", roundName,
|
|
addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
|
|
if (gTestDouble)
|
|
{
|
|
for (vectorSize = kMinVectorSize;
|
|
vectorSize < kLastVectorSizeToTest; vectorSize++)
|
|
vlog_perf(SubtractTime(doubleTime[vectorSize], 0) * 1e6
|
|
* gDeviceFrequency * gComputeDevices
|
|
/ (double)(count * loopCount),
|
|
0, "average us/elem (double)",
|
|
"vStoreHalf%s avg. d (%s vector size: %d)", roundName,
|
|
addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
|
|
for (vectorSize = kMinVectorSize;
|
|
vectorSize < kLastVectorSizeToTest; vectorSize++)
|
|
vlog_perf(SubtractTime(min_double_time[vectorSize], 0) * 1e6
|
|
* gDeviceFrequency * gComputeDevices
|
|
/ (double)count,
|
|
0, "best us/elem (double)",
|
|
"vStoreHalf%s best d (%s vector size: %d)", roundName,
|
|
addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
|
|
}
|
|
}
|
|
|
|
exit:
|
|
// clean up
|
|
if (!gHostReset)
|
|
{
|
|
clReleaseKernel(resetKernel);
|
|
clReleaseProgram(resetProgram);
|
|
}
|
|
|
|
for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
|
|
vectorSize++)
|
|
{
|
|
for (addressSpace = 0; addressSpace < 3; addressSpace++)
|
|
{
|
|
clReleaseKernel(kernels[vectorSize][addressSpace]);
|
|
clReleaseProgram(programs[vectorSize][addressSpace]);
|
|
if (gTestDouble)
|
|
{
|
|
clReleaseKernel(doubleKernels[vectorSize][addressSpace]);
|
|
clReleaseProgram(doublePrograms[vectorSize][addressSpace]);
|
|
}
|
|
}
|
|
}
|
|
|
|
return error;
|
|
}
|
|
|
|
int Test_vStoreaHalf_private(cl_device_id device, f2h referenceFunc,
|
|
d2h doubleReferenceFunc, const char *roundName)
|
|
{
|
|
int vectorSize, error;
|
|
cl_program programs[kVectorSizeCount + kStrangeVectorSizeCount][3];
|
|
cl_kernel kernels[kVectorSizeCount + kStrangeVectorSizeCount][3];
|
|
cl_program resetProgram = nullptr;
|
|
cl_kernel resetKernel = nullptr;
|
|
|
|
uint64_t time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
|
|
uint64_t min_time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
|
|
memset(min_time, -1, sizeof(min_time));
|
|
cl_program doublePrograms[kVectorSizeCount + kStrangeVectorSizeCount][3];
|
|
cl_kernel doubleKernels[kVectorSizeCount + kStrangeVectorSizeCount][3];
|
|
uint64_t doubleTime[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
|
|
uint64_t min_double_time[kVectorSizeCount + kStrangeVectorSizeCount] = {
|
|
0
|
|
};
|
|
memset(min_double_time, -1, sizeof(min_double_time));
|
|
|
|
bool aligned = true;
|
|
|
|
int minVectorSize = kMinVectorSize;
|
|
// There is no aligned scalar vstorea_half
|
|
if (0 == minVectorSize) minVectorSize = 1;
|
|
|
|
// Loop over vector sizes
|
|
for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
|
|
vectorSize++)
|
|
{
|
|
const char *source[] = { "__kernel void test( __global float",
|
|
vector_size_name_extensions[vectorSize],
|
|
" *p, __global half *f )\n"
|
|
"{\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" vstorea_half",
|
|
vector_size_name_extensions[vectorSize],
|
|
roundName,
|
|
"( p[i], i, f );\n"
|
|
"}\n" };
|
|
|
|
const char *source_v3[] = {
|
|
"__kernel void test( __global float3 *p, __global half *f )\n"
|
|
"{\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" vstorea_half3",
|
|
roundName,
|
|
"( p[i], i, f );\n"
|
|
" vstore_half",
|
|
roundName,
|
|
"( ((__global float *)p)[4*i+3], 4*i+3, f);\n"
|
|
"}\n"
|
|
};
|
|
|
|
const char *source_private[] = {
|
|
"__kernel void test( __global float",
|
|
vector_size_name_extensions[vectorSize],
|
|
" *p, __global half *f )\n"
|
|
"{\n"
|
|
" __private float",
|
|
vector_size_name_extensions[vectorSize],
|
|
" data;\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" data = p[i];\n"
|
|
" vstorea_half",
|
|
vector_size_name_extensions[vectorSize],
|
|
roundName,
|
|
"( data, i, f );\n"
|
|
"}\n"
|
|
};
|
|
|
|
const char *source_private_v3[] = {
|
|
"__kernel void test( __global float3 *p, __global half *f )\n"
|
|
"{\n"
|
|
" __private float",
|
|
vector_size_name_extensions[vectorSize],
|
|
" data;\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" data = p[i];\n"
|
|
" vstorea_half3",
|
|
roundName,
|
|
"( data, i, f );\n"
|
|
" vstore_half",
|
|
roundName,
|
|
"( ((__global float *)p)[4*i+3], 4*i+3, f);\n"
|
|
"}\n"
|
|
};
|
|
|
|
char local_buf_size[10];
|
|
sprintf(local_buf_size, "%zu", gWorkGroupSize);
|
|
const char *source_local[] = { "__kernel void test( __global float",
|
|
vector_size_name_extensions[vectorSize],
|
|
" *p, __global half *f )\n"
|
|
"{\n"
|
|
" __local float",
|
|
vector_size_name_extensions[vectorSize],
|
|
" data[",
|
|
local_buf_size,
|
|
"];\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" size_t lid = get_local_id(0);\n"
|
|
" data[lid] = p[i];\n"
|
|
" vstorea_half",
|
|
vector_size_name_extensions[vectorSize],
|
|
roundName,
|
|
"( data[lid], i, f );\n"
|
|
"}\n" };
|
|
|
|
const char *source_local_v3[] = {
|
|
"__kernel void test( __global float",
|
|
vector_size_name_extensions[vectorSize],
|
|
" *p, __global half *f )\n"
|
|
"{\n"
|
|
" __local float",
|
|
vector_size_name_extensions[vectorSize],
|
|
" data[",
|
|
local_buf_size,
|
|
"];\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" size_t lid = get_local_id(0);\n"
|
|
" data[lid] = p[i];\n"
|
|
" vstorea_half",
|
|
vector_size_name_extensions[vectorSize],
|
|
roundName,
|
|
"( data[lid], i, f );\n"
|
|
" vstore_half",
|
|
roundName,
|
|
"( ((__global float *)p)[4*i+3], 4*i+3, f);\n"
|
|
"}\n"
|
|
};
|
|
|
|
const char *double_source[] = {
|
|
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
|
|
"__kernel void test( __global double",
|
|
vector_size_name_extensions[vectorSize],
|
|
" *p, __global half *f )\n"
|
|
"{\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" vstorea_half",
|
|
vector_size_name_extensions[vectorSize],
|
|
roundName,
|
|
"( p[i], i, f );\n"
|
|
"}\n"
|
|
};
|
|
|
|
const char *double_source_v3[] = {
|
|
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
|
|
"__kernel void test( __global double",
|
|
vector_size_name_extensions[vectorSize],
|
|
" *p, __global half *f )\n"
|
|
"{\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" vstorea_half",
|
|
vector_size_name_extensions[vectorSize],
|
|
roundName,
|
|
"( p[i], i, f );\n"
|
|
" vstore_half",
|
|
roundName,
|
|
"( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
|
|
"}\n"
|
|
};
|
|
|
|
const char *double_source_private[] = {
|
|
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
|
|
"__kernel void test( __global double",
|
|
vector_size_name_extensions[vectorSize],
|
|
" *p, __global half *f )\n"
|
|
"{\n"
|
|
" __private double",
|
|
vector_size_name_extensions[vectorSize],
|
|
" data;\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" data = p[i];\n"
|
|
" vstorea_half",
|
|
vector_size_name_extensions[vectorSize],
|
|
roundName,
|
|
"( data, i, f );\n"
|
|
"}\n"
|
|
};
|
|
|
|
const char *double_source_private_v3[] = {
|
|
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
|
|
"__kernel void test( __global double",
|
|
vector_size_name_extensions[vectorSize],
|
|
" *p, __global half *f )\n"
|
|
"{\n"
|
|
" __private double",
|
|
vector_size_name_extensions[vectorSize],
|
|
" data;\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" data = p[i];\n"
|
|
" vstorea_half",
|
|
vector_size_name_extensions[vectorSize],
|
|
roundName,
|
|
"( data, i, f );\n"
|
|
" vstore_half",
|
|
roundName,
|
|
"( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
|
|
"}\n"
|
|
};
|
|
|
|
const char *double_source_local[] = {
|
|
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
|
|
"__kernel void test( __global double",
|
|
vector_size_name_extensions[vectorSize],
|
|
" *p, __global half *f )\n"
|
|
"{\n"
|
|
" __local double",
|
|
vector_size_name_extensions[vectorSize],
|
|
" data[",
|
|
local_buf_size,
|
|
"];\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" size_t lid = get_local_id(0);\n"
|
|
" data[lid] = p[i];\n"
|
|
" vstorea_half",
|
|
vector_size_name_extensions[vectorSize],
|
|
roundName,
|
|
"( data[lid], i, f );\n"
|
|
"}\n"
|
|
};
|
|
|
|
const char *double_source_local_v3[] = {
|
|
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
|
|
"__kernel void test( __global double",
|
|
vector_size_name_extensions[vectorSize],
|
|
" *p, __global half *f )\n"
|
|
"{\n"
|
|
" __local double",
|
|
vector_size_name_extensions[vectorSize],
|
|
" data[",
|
|
local_buf_size,
|
|
"];\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" size_t lid = get_local_id(0);\n"
|
|
" data[lid] = p[i];\n"
|
|
" vstorea_half",
|
|
vector_size_name_extensions[vectorSize],
|
|
roundName,
|
|
"( data[lid], i, f );\n"
|
|
" vstore_half",
|
|
roundName,
|
|
"( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
|
|
"}\n"
|
|
};
|
|
|
|
if (g_arrVecSizes[vectorSize] == 3)
|
|
{
|
|
programs[vectorSize][0] = MakeProgram(
|
|
device, source_v3, sizeof(source_v3) / sizeof(source_v3[0]));
|
|
if (NULL == programs[vectorSize][0])
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
programs[vectorSize][0] =
|
|
MakeProgram(device, source, sizeof(source) / sizeof(source[0]));
|
|
if (NULL == programs[vectorSize][0])
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
kernels[vectorSize][0] =
|
|
clCreateKernel(programs[vectorSize][0], "test", &error);
|
|
if (NULL == kernels[vectorSize][0])
|
|
{
|
|
gFailCount++;
|
|
vlog_error("\t\tFAILED -- Failed to create kernel. (%d)\n", error);
|
|
return error;
|
|
}
|
|
|
|
if (g_arrVecSizes[vectorSize] == 3)
|
|
{
|
|
programs[vectorSize][1] = MakeProgram(
|
|
device, source_private_v3,
|
|
sizeof(source_private_v3) / sizeof(source_private_v3[0]));
|
|
if (NULL == programs[vectorSize][1])
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
programs[vectorSize][1] =
|
|
MakeProgram(device, source_private,
|
|
sizeof(source_private) / sizeof(source_private[0]));
|
|
if (NULL == programs[vectorSize][1])
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
kernels[vectorSize][1] =
|
|
clCreateKernel(programs[vectorSize][1], "test", &error);
|
|
if (NULL == kernels[vectorSize][1])
|
|
{
|
|
gFailCount++;
|
|
vlog_error("\t\tFAILED -- Failed to create private kernel. (%d)\n",
|
|
error);
|
|
return error;
|
|
}
|
|
|
|
if (g_arrVecSizes[vectorSize] == 3)
|
|
{
|
|
programs[vectorSize][2] = MakeProgram(
|
|
device, source_local_v3,
|
|
sizeof(source_local_v3) / sizeof(source_local_v3[0]));
|
|
if (NULL == programs[vectorSize][2])
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
programs[vectorSize][2] =
|
|
MakeProgram(device, source_local,
|
|
sizeof(source_local) / sizeof(source_local[0]));
|
|
if (NULL == programs[vectorSize][2])
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
kernels[vectorSize][2] =
|
|
clCreateKernel(programs[vectorSize][2], "test", &error);
|
|
if (NULL == kernels[vectorSize][2])
|
|
{
|
|
gFailCount++;
|
|
vlog_error("\t\tFAILED -- Failed to create local kernel. (%d)\n",
|
|
error);
|
|
return error;
|
|
}
|
|
|
|
if (gTestDouble)
|
|
{
|
|
if (g_arrVecSizes[vectorSize] == 3)
|
|
{
|
|
doublePrograms[vectorSize][0] = MakeProgram(
|
|
device, double_source_v3,
|
|
sizeof(double_source_v3) / sizeof(double_source_v3[0]));
|
|
if (NULL == doublePrograms[vectorSize][0])
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
doublePrograms[vectorSize][0] = MakeProgram(
|
|
device, double_source,
|
|
sizeof(double_source) / sizeof(double_source[0]));
|
|
if (NULL == doublePrograms[vectorSize][0])
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
doubleKernels[vectorSize][0] =
|
|
clCreateKernel(doublePrograms[vectorSize][0], "test", &error);
|
|
if (NULL == kernels[vectorSize][0])
|
|
{
|
|
gFailCount++;
|
|
vlog_error(
|
|
"\t\tFAILED -- Failed to create double kernel. (%d)\n",
|
|
error);
|
|
return error;
|
|
}
|
|
|
|
if (g_arrVecSizes[vectorSize] == 3)
|
|
{
|
|
doublePrograms[vectorSize][1] =
|
|
MakeProgram(device, double_source_private_v3,
|
|
sizeof(double_source_private_v3)
|
|
/ sizeof(double_source_private_v3[0]));
|
|
if (NULL == doublePrograms[vectorSize][1])
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
doublePrograms[vectorSize][1] =
|
|
MakeProgram(device, double_source_private,
|
|
sizeof(double_source_private)
|
|
/ sizeof(double_source_private[0]));
|
|
if (NULL == doublePrograms[vectorSize][1])
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
doubleKernels[vectorSize][1] =
|
|
clCreateKernel(doublePrograms[vectorSize][1], "test", &error);
|
|
if (NULL == kernels[vectorSize][1])
|
|
{
|
|
gFailCount++;
|
|
vlog_error("\t\tFAILED -- Failed to create double private "
|
|
"kernel. (%d)\n",
|
|
error);
|
|
return error;
|
|
}
|
|
|
|
if (g_arrVecSizes[vectorSize] == 3)
|
|
{
|
|
doublePrograms[vectorSize][2] =
|
|
MakeProgram(device, double_source_local_v3,
|
|
sizeof(double_source_local_v3)
|
|
/ sizeof(double_source_local_v3[0]));
|
|
if (NULL == doublePrograms[vectorSize][2])
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
doublePrograms[vectorSize][2] =
|
|
MakeProgram(device, double_source_local,
|
|
sizeof(double_source_local)
|
|
/ sizeof(double_source_local[0]));
|
|
if (NULL == doublePrograms[vectorSize][2])
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
doubleKernels[vectorSize][2] =
|
|
clCreateKernel(doublePrograms[vectorSize][2], "test", &error);
|
|
if (NULL == kernels[vectorSize][2])
|
|
{
|
|
gFailCount++;
|
|
vlog_error("\t\tFAILED -- Failed to create double local "
|
|
"kernel. (%d)\n",
|
|
error);
|
|
return error;
|
|
}
|
|
}
|
|
}
|
|
|
|
const char *reset[] = {
|
|
"__kernel void reset( __global float *p, __global ushort *f,\n"
|
|
" uint extra_last_thread)\n"
|
|
"{\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" *(f + i) = 0xdead;"
|
|
"}\n"
|
|
};
|
|
|
|
if (!gHostReset)
|
|
{
|
|
resetProgram =
|
|
MakeProgram(device, reset, sizeof(reset) / sizeof(reset[0]));
|
|
if (NULL == resetProgram)
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
resetKernel = clCreateKernel(resetProgram, "reset", &error);
|
|
if (NULL == resetKernel)
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
// Figure out how many elements are in a work block
|
|
size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float));
|
|
size_t blockCount = BUFFER_SIZE / elementSize;
|
|
uint64_t lastCase = 1ULL << (8 * sizeof(float));
|
|
size_t stride = blockCount;
|
|
|
|
if (gWimpyMode)
|
|
stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;
|
|
|
|
// we handle 64-bit types a bit differently.
|
|
if (lastCase == 0) lastCase = 0x100000000ULL;
|
|
uint64_t i, j;
|
|
error = 0;
|
|
uint64_t printMask = (lastCase >> 4) - 1;
|
|
cl_uint count = 0;
|
|
int addressSpace;
|
|
size_t loopCount;
|
|
cl_uint threadCount = GetThreadCount();
|
|
|
|
ComputeReferenceInfoF fref;
|
|
fref.x = (float *)gIn_single;
|
|
fref.r = (cl_half *)gOut_half_reference;
|
|
fref.f = referenceFunc;
|
|
fref.lim = blockCount;
|
|
fref.count = (blockCount + threadCount - 1) / threadCount;
|
|
|
|
CheckResultInfoF fchk;
|
|
fchk.x = (const float *)gIn_single;
|
|
fchk.r = (const cl_half *)gOut_half_reference;
|
|
fchk.s = (const cl_half *)gOut_half;
|
|
fchk.f = referenceFunc;
|
|
fchk.lim = blockCount;
|
|
fchk.count = (blockCount + threadCount - 1) / threadCount;
|
|
|
|
ComputeReferenceInfoD dref;
|
|
dref.x = (double *)gIn_double;
|
|
dref.r = (cl_half *)gOut_half_reference_double;
|
|
dref.f = doubleReferenceFunc;
|
|
dref.lim = blockCount;
|
|
dref.count = (blockCount + threadCount - 1) / threadCount;
|
|
|
|
CheckResultInfoD dchk;
|
|
dchk.x = (const double *)gIn_double;
|
|
dchk.r = (const cl_half *)gOut_half_reference_double;
|
|
dchk.s = (const cl_half *)gOut_half;
|
|
dchk.f = doubleReferenceFunc;
|
|
dchk.lim = blockCount;
|
|
dchk.count = (blockCount + threadCount - 1) / threadCount;
|
|
|
|
for (i = 0; i < (uint64_t)lastCase; i += stride)
|
|
{
|
|
count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i);
|
|
fref.i = i;
|
|
dref.i = i;
|
|
|
|
// Create the input and reference
|
|
ThreadPool_Do(ReferenceF, threadCount, &fref);
|
|
|
|
error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0,
|
|
count * sizeof(float), gIn_single, 0, NULL,
|
|
NULL);
|
|
if (error)
|
|
{
|
|
vlog_error("Failure in clWriteArray\n");
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
if (gTestDouble)
|
|
{
|
|
ThreadPool_Do(ReferenceD, threadCount, &dref);
|
|
|
|
error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0,
|
|
count * sizeof(double), gIn_double, 0,
|
|
NULL, NULL);
|
|
if (error)
|
|
{
|
|
vlog_error("Failure in clWriteArray\n");
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
}
|
|
|
|
for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
|
|
vectorSize++)
|
|
{
|
|
// Loop over vector legths
|
|
fchk.vsz = g_arrVecSizes[vectorSize];
|
|
dchk.vsz = g_arrVecSizes[vectorSize];
|
|
|
|
for (addressSpace = 0; addressSpace < 3; addressSpace++)
|
|
{
|
|
// Loop over address spaces
|
|
fchk.aspace = addressSpaceNames[addressSpace];
|
|
dchk.aspace = addressSpaceNames[addressSpace];
|
|
|
|
if (!gHostReset)
|
|
{
|
|
error = RunKernel(device, resetKernel, gInBuffer_single,
|
|
gOutBuffer_half, count, 0);
|
|
}
|
|
else
|
|
{
|
|
cl_uint pattern = 0xdeaddead;
|
|
memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2);
|
|
|
|
error = clEnqueueWriteBuffer(
|
|
gQueue, gOutBuffer_half, CL_FALSE, 0,
|
|
count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
|
|
}
|
|
if (error)
|
|
{
|
|
vlog_error("Failure in clWriteArray\n");
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
error = RunKernel(device, kernels[vectorSize][addressSpace],
|
|
gInBuffer_single, gOutBuffer_half,
|
|
numVecs(count, vectorSize, aligned),
|
|
runsOverBy(count, vectorSize, aligned));
|
|
if (error)
|
|
{
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0,
|
|
count * sizeof(cl_half), gOut_half,
|
|
0, NULL, NULL);
|
|
if (error)
|
|
{
|
|
vlog_error("Failure in clReadArray\n");
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
error = ThreadPool_Do(CheckF, threadCount, &fchk);
|
|
if (error)
|
|
{
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
if (gTestDouble)
|
|
{
|
|
|
|
if (!gHostReset)
|
|
{
|
|
error = RunKernel(device, resetKernel, gInBuffer_single,
|
|
gOutBuffer_half, count, 0);
|
|
}
|
|
else
|
|
{
|
|
cl_uint pattern = 0xdeaddead;
|
|
memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2);
|
|
|
|
error = clEnqueueWriteBuffer(
|
|
gQueue, gOutBuffer_half, CL_FALSE, 0,
|
|
count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
|
|
}
|
|
if (error)
|
|
{
|
|
vlog_error("Failure in clWriteArray\n");
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
error = RunKernel(device,
|
|
doubleKernels[vectorSize][addressSpace],
|
|
gInBuffer_double, gOutBuffer_half,
|
|
numVecs(count, vectorSize, aligned),
|
|
runsOverBy(count, vectorSize, aligned));
|
|
if (error)
|
|
{
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
error = clEnqueueReadBuffer(
|
|
gQueue, gOutBuffer_half, CL_TRUE, 0,
|
|
count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
|
|
if (error)
|
|
{
|
|
vlog_error("Failure in clReadArray\n");
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
error = ThreadPool_Do(CheckD, threadCount, &dchk);
|
|
if (error)
|
|
{
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
}
|
|
}
|
|
} // end for vector size
|
|
|
|
if (((i + blockCount) & ~printMask) == (i + blockCount))
|
|
{
|
|
vlog(".");
|
|
fflush(stdout);
|
|
}
|
|
} // for end lastcase
|
|
|
|
loopCount = count == blockCount ? 1 : 100;
|
|
if (gReportTimes)
|
|
{
|
|
// Init the input stream
|
|
cl_float *p = (cl_float *)gIn_single;
|
|
for (j = 0; j < count; j++)
|
|
p[j] = (float)((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2));
|
|
|
|
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0,
|
|
count * sizeof(float), gIn_single, 0,
|
|
NULL, NULL)))
|
|
{
|
|
vlog_error("Failure in clWriteArray\n");
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
if (gTestDouble)
|
|
{
|
|
// Init the input stream
|
|
cl_double *q = (cl_double *)gIn_double;
|
|
for (j = 0; j < count; j++)
|
|
q[j] = ((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2));
|
|
|
|
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE,
|
|
0, count * sizeof(double),
|
|
gIn_double, 0, NULL, NULL)))
|
|
{
|
|
vlog_error("Failure in clWriteArray\n");
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
}
|
|
|
|
// Run again for timing
|
|
for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
|
|
vectorSize++)
|
|
{
|
|
uint64_t bestTime = -1ULL;
|
|
for (j = 0; j < loopCount; j++)
|
|
{
|
|
uint64_t startTime = ReadTime();
|
|
if ((error = RunKernel(device, kernels[vectorSize][0],
|
|
gInBuffer_single, gOutBuffer_half,
|
|
numVecs(count, vectorSize, aligned),
|
|
runsOverBy(count, vectorSize, aligned))))
|
|
{
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
if ((error = clFinish(gQueue)))
|
|
{
|
|
vlog_error("Failure in clFinish\n");
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
uint64_t currentTime = ReadTime() - startTime;
|
|
if (currentTime < bestTime) bestTime = currentTime;
|
|
time[vectorSize] += currentTime;
|
|
}
|
|
if (bestTime < min_time[vectorSize])
|
|
min_time[vectorSize] = bestTime;
|
|
|
|
if (gTestDouble)
|
|
{
|
|
bestTime = -1ULL;
|
|
for (j = 0; j < loopCount; j++)
|
|
{
|
|
uint64_t startTime = ReadTime();
|
|
if ((error =
|
|
RunKernel(device, doubleKernels[vectorSize][0],
|
|
gInBuffer_double, gOutBuffer_half,
|
|
numVecs(count, vectorSize, aligned),
|
|
runsOverBy(count, vectorSize, aligned))))
|
|
{
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
if ((error = clFinish(gQueue)))
|
|
{
|
|
vlog_error("Failure in clFinish\n");
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
uint64_t currentTime = ReadTime() - startTime;
|
|
if (currentTime < bestTime) bestTime = currentTime;
|
|
doubleTime[vectorSize] += currentTime;
|
|
}
|
|
if (bestTime < min_double_time[vectorSize])
|
|
min_double_time[vectorSize] = bestTime;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (gReportTimes)
|
|
{
|
|
for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
|
|
vectorSize++)
|
|
vlog_perf(SubtractTime(time[vectorSize], 0) * 1e6 * gDeviceFrequency
|
|
* gComputeDevices / (double)(count * loopCount),
|
|
0, "average us/elem",
|
|
"vStoreaHalf%s avg. (%s vector size: %d)", roundName,
|
|
addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
|
|
for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
|
|
vectorSize++)
|
|
vlog_perf(SubtractTime(min_time[vectorSize], 0) * 1e6
|
|
* gDeviceFrequency * gComputeDevices / (double)count,
|
|
0, "best us/elem",
|
|
"vStoreaHalf%s best (%s vector size: %d)", roundName,
|
|
addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
|
|
if (gTestDouble)
|
|
{
|
|
for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
|
|
vectorSize++)
|
|
vlog_perf(SubtractTime(doubleTime[vectorSize], 0) * 1e6
|
|
* gDeviceFrequency * gComputeDevices
|
|
/ (double)(count * loopCount),
|
|
0, "average us/elem (double)",
|
|
"vStoreaHalf%s avg. d (%s vector size: %d)",
|
|
roundName, addressSpaceNames[0],
|
|
(g_arrVecSizes[vectorSize]));
|
|
for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
|
|
vectorSize++)
|
|
vlog_perf(
|
|
SubtractTime(min_double_time[vectorSize], 0) * 1e6
|
|
* gDeviceFrequency * gComputeDevices / (double)count,
|
|
0, "best us/elem (double)",
|
|
"vStoreaHalf%s best d (%s vector size: %d)", roundName,
|
|
addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
|
|
}
|
|
}
|
|
|
|
exit:
|
|
// clean up
|
|
if (!gHostReset)
|
|
{
|
|
clReleaseKernel(resetKernel);
|
|
clReleaseProgram(resetProgram);
|
|
}
|
|
|
|
for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
|
|
vectorSize++)
|
|
{
|
|
for (addressSpace = 0; addressSpace < 3; addressSpace++)
|
|
{
|
|
clReleaseKernel(kernels[vectorSize][addressSpace]);
|
|
clReleaseProgram(programs[vectorSize][addressSpace]);
|
|
if (gTestDouble)
|
|
{
|
|
clReleaseKernel(doubleKernels[vectorSize][addressSpace]);
|
|
clReleaseProgram(doublePrograms[vectorSize][addressSpace]);
|
|
}
|
|
}
|
|
}
|
|
|
|
return error;
|
|
}
|