mirror of
https://github.com/KhronosGroup/OpenCL-CTS.git
synced 2026-03-26 08:49:02 +00:00
Apply clang-format on math_brute_force (#1104)
Signed-off-by: Marco Antognini <marco.antognini@arm.com>
This commit is contained in:
@@ -105,11 +105,14 @@
|
||||
extern const vtbl _unary; // float foo( float )
|
||||
extern const vtbl _unary_u; // float foo( uint ), double foo( ulong )
|
||||
extern const vtbl _i_unary; // int foo( float )
|
||||
extern const vtbl _macro_unary; // int foo( float ), returns {0,1} for scalar, { 0, -1 } for vector
|
||||
extern const vtbl _macro_unary; // int foo( float ), returns {0,1} for scalar,
|
||||
// { 0, -1 } for vector
|
||||
extern const vtbl _binary; // float foo( float, float )
|
||||
extern const vtbl _binary_nextafter; // float foo( float, float ), special handling for nextafter
|
||||
extern const vtbl _binary_nextafter; // float foo( float, float ), special
|
||||
// handling for nextafter
|
||||
extern const vtbl _binary_operator; // float .op. float
|
||||
extern const vtbl _macro_binary; // int foo( float, float ), returns {0,1} for scalar, { 0, -1 } for vector
|
||||
extern const vtbl _macro_binary; // int foo( float, float ), returns {0,1} for
|
||||
// scalar, { 0, -1 } for vector
|
||||
extern const vtbl _binary_i; // float foo( float, int )
|
||||
extern const vtbl _ternary; // float foo( float, float, float )
|
||||
extern const vtbl _unary_two_results; // float foo( float, float * )
|
||||
|
||||
@@ -30,8 +30,7 @@
|
||||
|
||||
#include "harness/mt19937.h"
|
||||
|
||||
typedef union fptr
|
||||
{
|
||||
typedef union fptr {
|
||||
void *p;
|
||||
double (*f_f)(double);
|
||||
double (*f_u)(cl_uint);
|
||||
@@ -48,8 +47,7 @@ typedef union fptr
|
||||
float (*f_fma)(float, float, float, int);
|
||||
} fptr;
|
||||
|
||||
typedef union dptr
|
||||
{
|
||||
typedef union dptr {
|
||||
void *p;
|
||||
long double (*f_f)(long double);
|
||||
long double (*f_u)(cl_ulong);
|
||||
@@ -77,7 +75,8 @@ typedef struct vtbl
|
||||
typedef struct Func
|
||||
{
|
||||
const char *name; // common name, to be used as an argument in the shell
|
||||
const char *nameInCode; // name as it appears in the __kernel, usually the same as name, but different for multiplication
|
||||
const char *nameInCode; // name as it appears in the __kernel, usually the
|
||||
// same as name, but different for multiplication
|
||||
fptr func;
|
||||
dptr dfunc;
|
||||
fptr rfunc;
|
||||
@@ -97,5 +96,3 @@ extern const Func functionList[];
|
||||
extern const size_t functionListCount;
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
@@ -27,22 +27,19 @@
|
||||
io_object_t iterator;
|
||||
} sleepInfo;
|
||||
|
||||
void sleepCallback( void * refcon,
|
||||
io_service_t service,
|
||||
natural_t messageType,
|
||||
void sleepCallback(void* refcon, io_service_t service, natural_t messageType,
|
||||
void* messageArgument);
|
||||
|
||||
void sleepCallback( void * refcon UNUSED,
|
||||
io_service_t service UNUSED,
|
||||
natural_t messageType,
|
||||
void * messageArgument )
|
||||
void sleepCallback(void* refcon UNUSED, io_service_t service UNUSED,
|
||||
natural_t messageType, void* messageArgument)
|
||||
{
|
||||
|
||||
IOReturn result;
|
||||
/*
|
||||
service -- The IOService whose state has changed.
|
||||
messageType -- A messageType enum, defined by IOKit/IOMessage.h or by the IOService's family.
|
||||
messageArgument -- An argument for the message, dependent on the messageType.
|
||||
messageType -- A messageType enum, defined by IOKit/IOMessage.h or by the
|
||||
IOService's family. messageArgument -- An argument for the message,
|
||||
dependent on the messageType.
|
||||
*/
|
||||
switch (messageType)
|
||||
{
|
||||
@@ -59,7 +56,8 @@
|
||||
// this notification. Important: if you don’t respond, there will
|
||||
// be a 30-second timeout before the computer sleeps.
|
||||
// IOCancelPowerChange(root_port,(long)messageArgument);
|
||||
result = IOCancelPowerChange(sleepInfo.connection,(long)messageArgument);
|
||||
result = IOCancelPowerChange(sleepInfo.connection,
|
||||
(long)messageArgument);
|
||||
if (kIOReturnSuccess != result)
|
||||
vlog("sleep prevention failed. (%d)\n", result);
|
||||
break;
|
||||
@@ -71,17 +69,13 @@
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
void PreventSleep(void)
|
||||
{
|
||||
#if defined(__APPLE__)
|
||||
vlog("Disabling sleep... ");
|
||||
sleepInfo.iterator = (io_object_t)0;
|
||||
sleepInfo.port = NULL;
|
||||
sleepInfo.connection = IORegisterForSystemPower
|
||||
(
|
||||
sleepInfo.connection = IORegisterForSystemPower(
|
||||
&sleepInfo, // void * refcon,
|
||||
&sleepInfo.port, // IONotificationPortRef * thePortRef,
|
||||
sleepCallback, // IOServiceInterestCallback callback,
|
||||
@@ -113,6 +107,3 @@ void ResumeSleep( void )
|
||||
vlog("*** ResumeSleep() is not implemented on this platform.\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -20,5 +20,3 @@ void PreventSleep( void );
|
||||
void ResumeSleep(void);
|
||||
|
||||
#endif /* SLEEP_H */
|
||||
|
||||
|
||||
|
||||
@@ -17,9 +17,9 @@
|
||||
#include "FunctionList.h"
|
||||
|
||||
#if defined(__PPC__)
|
||||
// Global varaiable used to hold the FPU control register state. The FPSCR register can not
|
||||
// be used because not all Power implementations retain or observed the NI (non-IEEE
|
||||
// mode) bit.
|
||||
// Global varaiable used to hold the FPU control register state. The FPSCR
|
||||
// register can not be used because not all Power implementations retain or
|
||||
// observed the NI (non-IEEE mode) bit.
|
||||
__thread fpu_control_t fpu_control = 0;
|
||||
#endif
|
||||
|
||||
@@ -47,11 +47,13 @@ void AddD(double *rhi, double *rlo, double a, double b)
|
||||
{
|
||||
double zhi, zlo;
|
||||
zhi = a + b;
|
||||
if(fabs(a) > fabs(b)) {
|
||||
if (fabs(a) > fabs(b))
|
||||
{
|
||||
zlo = zhi - a;
|
||||
zlo = b - zlo;
|
||||
}
|
||||
else {
|
||||
else
|
||||
{
|
||||
zlo = zhi - b;
|
||||
zlo = a - zlo;
|
||||
}
|
||||
@@ -86,7 +88,8 @@ void AddDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl)
|
||||
{
|
||||
double r, s;
|
||||
r = xh + yh;
|
||||
s = (fabs(xh) > fabs(yh)) ? (xh - r + yh + yl + xl) : (yh - r + xh + xl + yl);
|
||||
s = (fabs(xh) > fabs(yh)) ? (xh - r + yh + yl + xl)
|
||||
: (yh - r + xh + xl + yl);
|
||||
*rhi = r + s;
|
||||
*rlo = (r - (*rhi)) + s;
|
||||
}
|
||||
@@ -100,69 +103,58 @@ void DivideDD(double *chi, double *clo, double a, double b)
|
||||
*clo = rhi / b;
|
||||
}
|
||||
|
||||
// These functions comapre two floats/doubles. Since some platforms may choose to
|
||||
// flush denormals to zeros before comparison, comparison like a < b may give wrong
|
||||
// result in "certain cases" where we do need correct compasion result when operands
|
||||
// are denormals .... these functions comapre floats/doubles using signed integer/long int
|
||||
// rep. In other cases, when flushing to zeros is fine, these should not be used.
|
||||
// Also these doesn't check for nans and assume nans are handled separately as special edge case
|
||||
// by the caller which calls these functions
|
||||
// return 0 if both are equal, 1 if x > y and -1 if x < y.
|
||||
// These functions comapre two floats/doubles. Since some platforms may choose
|
||||
// to flush denormals to zeros before comparison, comparison like a < b may give
|
||||
// wrong result in "certain cases" where we do need correct compasion result
|
||||
// when operands are denormals .... these functions comapre floats/doubles using
|
||||
// signed integer/long int rep. In other cases, when flushing to zeros is fine,
|
||||
// these should not be used. Also these doesn't check for nans and assume nans
|
||||
// are handled separately as special edge case by the caller which calls these
|
||||
// functions return 0 if both are equal, 1 if x > y and -1 if x < y.
|
||||
|
||||
inline
|
||||
int compareFloats(float x, float y)
|
||||
inline int compareFloats(float x, float y)
|
||||
{
|
||||
int32f_t a, b;
|
||||
|
||||
a.f = x;
|
||||
b.f = y;
|
||||
|
||||
if( a.i & 0x80000000 )
|
||||
a.i = 0x80000000 - a.i;
|
||||
if( b.i & 0x80000000 )
|
||||
b.i = 0x80000000 - b.i;
|
||||
if (a.i & 0x80000000) a.i = 0x80000000 - a.i;
|
||||
if (b.i & 0x80000000) b.i = 0x80000000 - b.i;
|
||||
|
||||
if( a.i == b.i )
|
||||
return 0;
|
||||
if (a.i == b.i) return 0;
|
||||
|
||||
return a.i < b.i ? -1 : 1;
|
||||
}
|
||||
|
||||
inline
|
||||
int compareDoubles(double x, double y)
|
||||
inline int compareDoubles(double x, double y)
|
||||
{
|
||||
int64d_t a, b;
|
||||
|
||||
a.d = x;
|
||||
b.d = y;
|
||||
|
||||
if( a.l & 0x8000000000000000LL )
|
||||
a.l = 0x8000000000000000LL - a.l;
|
||||
if( b.l & 0x8000000000000000LL )
|
||||
b.l = 0x8000000000000000LL - b.l;
|
||||
if (a.l & 0x8000000000000000LL) a.l = 0x8000000000000000LL - a.l;
|
||||
if (b.l & 0x8000000000000000LL) b.l = 0x8000000000000000LL - b.l;
|
||||
|
||||
if( a.l == b.l )
|
||||
return 0;
|
||||
if (a.l == b.l) return 0;
|
||||
|
||||
return a.l < b.l ? -1 : 1;
|
||||
}
|
||||
|
||||
void logFunctionInfo(const char *fname, unsigned int float_size, unsigned int isFastRelaxed)
|
||||
void logFunctionInfo(const char *fname, unsigned int float_size,
|
||||
unsigned int isFastRelaxed)
|
||||
{
|
||||
char const *fpSizeStr = NULL;
|
||||
char const *fpFastRelaxedStr = "";
|
||||
switch (float_size) {
|
||||
case sizeof(cl_double):
|
||||
fpSizeStr = "fp64";
|
||||
break;
|
||||
case sizeof(cl_float):
|
||||
fpSizeStr = "fp32";
|
||||
break;
|
||||
case sizeof(cl_half):
|
||||
fpSizeStr = "fp16";
|
||||
break;
|
||||
switch (float_size)
|
||||
{
|
||||
case sizeof(cl_double): fpSizeStr = "fp64"; break;
|
||||
case sizeof(cl_float): fpSizeStr = "fp32"; break;
|
||||
case sizeof(cl_half): fpSizeStr = "fp16"; break;
|
||||
}
|
||||
if (isFastRelaxed) {
|
||||
if (isFastRelaxed)
|
||||
{
|
||||
fpFastRelaxedStr = "rlx";
|
||||
}
|
||||
vlog("%15s %4s %4s", fname, fpSizeStr, fpFastRelaxedStr);
|
||||
|
||||
@@ -110,13 +110,17 @@ int MakeKernels(const char **c, cl_uint count, const char *name,
|
||||
static inline double DoubleFromUInt32(uint32_t bits);
|
||||
static inline double DoubleFromUInt32(uint32_t bits)
|
||||
{
|
||||
union{ uint64_t u; double d;} u;
|
||||
union {
|
||||
uint64_t u;
|
||||
double d;
|
||||
} u;
|
||||
|
||||
// split 0x89abcdef to 0x89abc00000000def
|
||||
u.u = bits & 0xfffU;
|
||||
u.u |= (uint64_t)(bits & ~0xfffU) << 32;
|
||||
|
||||
// sign extend the leading bit of def segment as sign bit so that the middle region consists of either all 1s or 0s
|
||||
// sign extend the leading bit of def segment as sign bit so that the middle
|
||||
// region consists of either all 1s or 0s
|
||||
u.u -= (bits & 0x800U) << 1;
|
||||
|
||||
// return result
|
||||
@@ -128,8 +132,10 @@ void _LogBuildError( cl_program p, int line, const char *file );
|
||||
|
||||
#define PERF_LOOP_COUNT 100
|
||||
|
||||
//The spec is fairly clear that we may enforce a hard cutoff to prevent premature flushing to zero.
|
||||
// However, to avoid conflict for 1.0, we are letting results at TYPE_MIN + ulp_limit to be flushed to zero.
|
||||
// The spec is fairly clear that we may enforce a hard cutoff to prevent
|
||||
// premature flushing to zero.
|
||||
// However, to avoid conflict for 1.0, we are letting results at TYPE_MIN +
|
||||
// ulp_limit to be flushed to zero.
|
||||
static inline int IsFloatResultSubnormal(double x, float ulps)
|
||||
{
|
||||
x = fabs(x) - MAKE_HEX_DOUBLE(0x1.0p-149, 0x1, -149) * (double)ulps;
|
||||
@@ -150,21 +156,30 @@ static inline int IsDoubleResultSubnormal( long double x, float ulps )
|
||||
|
||||
static inline int IsFloatInfinity(double x)
|
||||
{
|
||||
union { cl_float d; cl_uint u; } u;
|
||||
union {
|
||||
cl_float d;
|
||||
cl_uint u;
|
||||
} u;
|
||||
u.d = (cl_float)x;
|
||||
return ((u.u & 0x7fffffffU) == 0x7F800000U);
|
||||
}
|
||||
|
||||
static inline int IsFloatMaxFloat(double x)
|
||||
{
|
||||
union { cl_float d; cl_uint u; } u;
|
||||
union {
|
||||
cl_float d;
|
||||
cl_uint u;
|
||||
} u;
|
||||
u.d = (cl_float)x;
|
||||
return ((u.u & 0x7fffffffU) == 0x7F7FFFFFU);
|
||||
}
|
||||
|
||||
static inline int IsFloatNaN(double x)
|
||||
{
|
||||
union { cl_float d; cl_uint u; } u;
|
||||
union {
|
||||
cl_float d;
|
||||
cl_uint u;
|
||||
} u;
|
||||
u.d = (cl_float)x;
|
||||
return ((u.u & 0x7fffffffU) > 0x7F800000U);
|
||||
}
|
||||
@@ -190,42 +205,46 @@ static inline void Force64BitFPUPrecision(void)
|
||||
new_cw = orig_cw | 0x0300; // set precision to 64-bit
|
||||
__asm__ __volatile__("fldcw %0" ::"m"(new_cw));
|
||||
#elif defined(_WIN32) && defined(__INTEL_COMPILER)
|
||||
// Unfortunately, usual method (`_controlfp( _PC_64, _MCW_PC );') does *not* work on win.x64:
|
||||
// > On the x64 architecture, changing the floating point precision is not supported.
|
||||
// (Taken from http://msdn.microsoft.com/en-us/library/e9b52ceh%28v=vs.100%29.aspx)
|
||||
// Unfortunately, usual method (`_controlfp( _PC_64, _MCW_PC );') does *not*
|
||||
// work on win.x64: > On the x64 architecture, changing the floating point
|
||||
// precision is not supported. (Taken from
|
||||
// http://msdn.microsoft.com/en-us/library/e9b52ceh%28v=vs.100%29.aspx)
|
||||
int cw;
|
||||
__asm { fnstcw cw }; // Get current value of FPU control word.
|
||||
cw = cw & 0xfffffcff | ( 3 << 8 ); // Set Precision Control to Double Extended Precision.
|
||||
__asm { fldcw cw }; // Set new value of FPU control word.
|
||||
__asm { fnstcw cw }
|
||||
; // Get current value of FPU control word.
|
||||
cw = cw & 0xfffffcff
|
||||
| (3 << 8); // Set Precision Control to Double Extended Precision.
|
||||
__asm { fldcw cw }
|
||||
; // Set new value of FPU control word.
|
||||
#else
|
||||
/* Implement for other platforms if needed */
|
||||
#endif
|
||||
}
|
||||
|
||||
extern
|
||||
void memset_pattern4(void *dest, const void *src_pattern, size_t bytes );
|
||||
extern void memset_pattern4(void *dest, const void *src_pattern, size_t bytes);
|
||||
|
||||
typedef union
|
||||
{
|
||||
typedef union {
|
||||
int32_t i;
|
||||
float f;
|
||||
} int32f_t;
|
||||
|
||||
typedef union
|
||||
{
|
||||
typedef union {
|
||||
int64_t l;
|
||||
double d;
|
||||
} int64d_t;
|
||||
|
||||
void MulD(double *rhi, double *rlo, double u, double v);
|
||||
void AddD(double *rhi, double *rlo, double a, double b);
|
||||
void MulDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl);
|
||||
void AddDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl);
|
||||
void MulDD(double *rhi, double *rlo, double xh, double xl, double yh,
|
||||
double yl);
|
||||
void AddDD(double *rhi, double *rlo, double xh, double xl, double yh,
|
||||
double yl);
|
||||
void DivideDD(double *chi, double *clo, double a, double b);
|
||||
int compareFloats(float x, float y);
|
||||
int compareDoubles(double x, double y);
|
||||
|
||||
void logFunctionInfo(const char *fname, unsigned int float_size, unsigned int isFastRelaxed);
|
||||
void logFunctionInfo(const char *fname, unsigned int float_size,
|
||||
unsigned int isFastRelaxed);
|
||||
|
||||
float getAllowedUlpError(const Func *f, const bool relaxed);
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -33,24 +33,38 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
|
||||
static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
|
||||
cl_program *p, bool relaxedMode)
|
||||
{
|
||||
const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in)\n"
|
||||
const char *c[] = { "__kernel void math_kernel",
|
||||
sizeNames[vectorSize],
|
||||
"( __global int",
|
||||
sizeNames[vectorSize],
|
||||
"* out, __global float",
|
||||
sizeNames[vectorSize],
|
||||
"* in)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" out[i] = ", name, "( in[i] );\n"
|
||||
"}\n"
|
||||
};
|
||||
const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global float* in)\n"
|
||||
" out[i] = ",
|
||||
name,
|
||||
"( in[i] );\n"
|
||||
"}\n" };
|
||||
const char *c3[] = {
|
||||
"__kernel void math_kernel",
|
||||
sizeNames[vectorSize],
|
||||
"( __global int* out, __global float* in)\n"
|
||||
"{\n"
|
||||
" size_t i = get_global_id(0);\n"
|
||||
" if( i + 1 < get_global_size(0) )\n"
|
||||
" {\n"
|
||||
" float3 f0 = vload3( 0, in + 3 * i );\n"
|
||||
" int3 i0 = ", name, "( f0 );\n"
|
||||
" int3 i0 = ",
|
||||
name,
|
||||
"( f0 );\n"
|
||||
" vstore3( i0, 0, out + 3*i );\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
|
||||
" size_t parity = i & 1; // Figure out how many elements are "
|
||||
"left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
|
||||
"buffer size \n"
|
||||
" float3 f0;\n"
|
||||
" switch( parity )\n"
|
||||
" {\n"
|
||||
@@ -61,7 +75,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
|
||||
" f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" int3 i0 = ", name, "( f0 );\n"
|
||||
" int3 i0 = ",
|
||||
name,
|
||||
"( f0 );\n"
|
||||
" switch( parity )\n"
|
||||
" {\n"
|
||||
" case 0:\n"
|
||||
@@ -86,7 +102,8 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
|
||||
}
|
||||
|
||||
char testName[32];
|
||||
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
|
||||
snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
|
||||
sizeNames[vectorSize]);
|
||||
|
||||
return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
|
||||
}
|
||||
@@ -95,26 +112,40 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
|
||||
cl_program *p, bool relaxedMode)
|
||||
{
|
||||
const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
|
||||
"__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in)\n"
|
||||
"__kernel void math_kernel",
|
||||
sizeNames[vectorSize],
|
||||
"( __global int",
|
||||
sizeNames[vectorSize],
|
||||
"* out, __global double",
|
||||
sizeNames[vectorSize],
|
||||
"* in)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" out[i] = ", name, "( in[i] );\n"
|
||||
"}\n"
|
||||
};
|
||||
" out[i] = ",
|
||||
name,
|
||||
"( in[i] );\n"
|
||||
"}\n" };
|
||||
|
||||
const char *c3[] = {"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
|
||||
"__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global double* in)\n"
|
||||
const char *c3[] = {
|
||||
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
|
||||
"__kernel void math_kernel",
|
||||
sizeNames[vectorSize],
|
||||
"( __global int* out, __global double* in)\n"
|
||||
"{\n"
|
||||
" size_t i = get_global_id(0);\n"
|
||||
" if( i + 1 < get_global_size(0) )\n"
|
||||
" {\n"
|
||||
" double3 f0 = vload3( 0, in + 3 * i );\n"
|
||||
" int3 i0 = ", name, "( f0 );\n"
|
||||
" int3 i0 = ",
|
||||
name,
|
||||
"( f0 );\n"
|
||||
" vstore3( i0, 0, out + 3*i );\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
|
||||
" size_t parity = i & 1; // Figure out how many elements are "
|
||||
"left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
|
||||
"buffer size \n"
|
||||
" double3 f0;\n"
|
||||
" switch( parity )\n"
|
||||
" {\n"
|
||||
@@ -125,7 +156,9 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
|
||||
" f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" int3 i0 = ", name, "( f0 );\n"
|
||||
" int3 i0 = ",
|
||||
name,
|
||||
"( f0 );\n"
|
||||
" switch( parity )\n"
|
||||
" {\n"
|
||||
" case 0:\n"
|
||||
@@ -150,7 +183,8 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
|
||||
|
||||
|
||||
char testName[32];
|
||||
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
|
||||
snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
|
||||
sizeNames[vectorSize]);
|
||||
|
||||
return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
|
||||
}
|
||||
@@ -164,8 +198,10 @@ typedef struct BuildKernelInfo
|
||||
bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
|
||||
} BuildKernelInfo;
|
||||
|
||||
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
|
||||
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
|
||||
static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
|
||||
void *p);
|
||||
static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
|
||||
void *p)
|
||||
{
|
||||
BuildKernelInfo *info = (BuildKernelInfo *)p;
|
||||
cl_uint i = info->offset + job_id;
|
||||
@@ -173,8 +209,10 @@ static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, voi
|
||||
info->programs + i, info->relaxedMode);
|
||||
}
|
||||
|
||||
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
|
||||
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
|
||||
static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
|
||||
void *p);
|
||||
static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
|
||||
void *p)
|
||||
{
|
||||
BuildKernelInfo *info = (BuildKernelInfo *)p;
|
||||
cl_uint i = info->offset + job_id;
|
||||
@@ -206,12 +244,14 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
// Init the kernels
|
||||
BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
|
||||
f->nameInCode, relaxedMode };
|
||||
if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
|
||||
if ((error = ThreadPool_Do(BuildKernel_FloatFn,
|
||||
gMaxVectorSizeIndex - gMinVectorSizeIndex,
|
||||
&build_info)))
|
||||
return error;
|
||||
/*
|
||||
for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
|
||||
if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) )
|
||||
return error;
|
||||
if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i,
|
||||
programs + i) ) ) return error;
|
||||
*/
|
||||
|
||||
for (i = 0; i < (1ULL << 32); i += step)
|
||||
@@ -228,7 +268,8 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
for (j = 0; j < bufferSize / sizeof(float); j++)
|
||||
p[j] = (uint32_t)i + j;
|
||||
}
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
|
||||
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
|
||||
bufferSize, gIn, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
|
||||
return error;
|
||||
@@ -239,9 +280,12 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
uint32_t pattern = 0xffffdead;
|
||||
memset_pattern4(gOut[j], &pattern, bufferSize);
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
|
||||
if ((error =
|
||||
clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
|
||||
bufferSize, gOut[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
|
||||
error, j);
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
@@ -251,10 +295,22 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
size_t vectorSize = sizeValues[j] * sizeof(cl_float);
|
||||
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
|
||||
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
|
||||
&gOutBuffer[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
|
||||
&gInBuffer)))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
|
||||
if ((error =
|
||||
clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
|
||||
&localCount, NULL, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("FAILED -- could not execute kernel\n");
|
||||
goto exit;
|
||||
@@ -262,8 +318,7 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
|
||||
// Get that moving
|
||||
if( (error = clFlush(gQueue) ))
|
||||
vlog( "clFlush failed\n" );
|
||||
if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
|
||||
|
||||
// Calculate the correctly rounded reference result
|
||||
int *r = (int *)gOut_Ref;
|
||||
@@ -274,15 +329,16 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
// Read the data back
|
||||
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
|
||||
{
|
||||
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
|
||||
if ((error =
|
||||
clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
|
||||
bufferSize, gOut[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("ReadArray failed %d\n", error);
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
if( gSkipCorrectnessTesting )
|
||||
break;
|
||||
if (gSkipCorrectnessTesting) break;
|
||||
|
||||
// Verify data
|
||||
uint32_t *t = (uint32_t *)gOut_Ref;
|
||||
@@ -298,14 +354,15 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
unsigned int correct0 = f->func.i_f(0.0);
|
||||
unsigned int correct1 = f->func.i_f(-0.0);
|
||||
if( q[j] == correct0 || q[j] == correct1 )
|
||||
continue;
|
||||
if (q[j] == correct0 || q[j] == correct1) continue;
|
||||
}
|
||||
|
||||
uint32_t err = t[j] - q[j];
|
||||
if( q[j] > t[j] )
|
||||
err = q[j] - t[j];
|
||||
vlog_error( "\nERROR: %s%s: %d ulp error at %a (0x%8.8x): *%d vs. %d\n", f->name, sizeNames[k], err, ((float*) gIn)[j], ((cl_uint*) gIn)[j], t[j], q[j] );
|
||||
if (q[j] > t[j]) err = q[j] - t[j];
|
||||
vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%8.8x): "
|
||||
"*%d vs. %d\n",
|
||||
f->name, sizeNames[k], err, ((float *)gIn)[j],
|
||||
((cl_uint *)gIn)[j], t[j], q[j]);
|
||||
error = -1;
|
||||
goto exit;
|
||||
}
|
||||
@@ -316,8 +373,10 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
if (gVerboseBruteForce)
|
||||
{
|
||||
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize);
|
||||
} else
|
||||
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step,
|
||||
bufferSize);
|
||||
}
|
||||
else
|
||||
{
|
||||
vlog(".");
|
||||
}
|
||||
@@ -339,7 +398,8 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
uint32_t *p = (uint32_t *)gIn;
|
||||
for (j = 0; j < bufferSize / sizeof(float); j++)
|
||||
p[j] = genrand_int32(d);
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
|
||||
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
|
||||
bufferSize, gIn, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
|
||||
return error;
|
||||
@@ -351,15 +411,27 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
size_t vectorSize = sizeValues[j] * sizeof(cl_float);
|
||||
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
|
||||
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
|
||||
&gOutBuffer[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
|
||||
&gInBuffer)))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
double sum = 0.0;
|
||||
double bestTime = INFINITY;
|
||||
for (k = 0; k < PERF_LOOP_COUNT; k++)
|
||||
{
|
||||
uint64_t startTime = GetTime();
|
||||
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
|
||||
if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
|
||||
&localCount, NULL, 0, NULL,
|
||||
NULL)))
|
||||
{
|
||||
vlog_error("FAILED -- could not execute kernel\n");
|
||||
goto exit;
|
||||
@@ -375,14 +447,15 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
uint64_t endTime = GetTime();
|
||||
double time = SubtractTime(endTime, startTime);
|
||||
sum += time;
|
||||
if( time < bestTime )
|
||||
bestTime = time;
|
||||
if (time < bestTime) bestTime = time;
|
||||
}
|
||||
|
||||
if( gReportAverageTimes )
|
||||
bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
|
||||
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
|
||||
if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double)gDeviceFrequency
|
||||
* gComputeDevices * gSimdSize * 1e6
|
||||
/ (bufferSize / sizeof(float));
|
||||
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
|
||||
f->name, sizeNames[j]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -431,8 +504,8 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
/*
|
||||
for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
|
||||
if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) )
|
||||
return error;
|
||||
if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels +
|
||||
i, programs + i) ) ) return error;
|
||||
*/
|
||||
|
||||
for (i = 0; i < (1ULL << 32); i += step)
|
||||
@@ -449,7 +522,8 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
for (j = 0; j < bufferSize / sizeof(cl_double); j++)
|
||||
p[j] = DoubleFromUInt32((uint32_t)i + j);
|
||||
}
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
|
||||
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
|
||||
bufferSize, gIn, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
|
||||
return error;
|
||||
@@ -460,9 +534,12 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
uint32_t pattern = 0xffffdead;
|
||||
memset_pattern4(gOut[j], &pattern, bufferSize);
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
|
||||
if ((error =
|
||||
clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
|
||||
bufferSize, gOut[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
|
||||
error, j);
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
@@ -472,10 +549,22 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
size_t vectorSize = sizeValues[j] * sizeof(cl_double);
|
||||
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
|
||||
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
|
||||
&gOutBuffer[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
|
||||
&gInBuffer)))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
|
||||
if ((error =
|
||||
clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
|
||||
&localCount, NULL, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("FAILED -- could not execute kernel\n");
|
||||
goto exit;
|
||||
@@ -483,8 +572,7 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
|
||||
// Get that moving
|
||||
if( (error = clFlush(gQueue) ))
|
||||
vlog( "clFlush failed\n" );
|
||||
if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
|
||||
|
||||
// Calculate the correctly rounded reference result
|
||||
int *r = (int *)gOut_Ref;
|
||||
@@ -495,15 +583,16 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
// Read the data back
|
||||
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
|
||||
{
|
||||
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
|
||||
if ((error =
|
||||
clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
|
||||
bufferSize, gOut[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("ReadArray failed %d\n", error);
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
if( gSkipCorrectnessTesting )
|
||||
break;
|
||||
if (gSkipCorrectnessTesting) break;
|
||||
|
||||
// Verify data
|
||||
uint32_t *t = (uint32_t *)gOut_Ref;
|
||||
@@ -519,14 +608,15 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
unsigned int correct0 = f->dfunc.i_f(0.0);
|
||||
unsigned int correct1 = f->dfunc.i_f(-0.0);
|
||||
if( q[j] == correct0 || q[j] == correct1 )
|
||||
continue;
|
||||
if (q[j] == correct0 || q[j] == correct1) continue;
|
||||
}
|
||||
|
||||
uint32_t err = t[j] - q[j];
|
||||
if( q[j] > t[j] )
|
||||
err = q[j] - t[j];
|
||||
vlog_error( "\nERROR: %sD%s: %d ulp error at %.13la: *%d vs. %d\n", f->name, sizeNames[k], err, ((double*) gIn)[j], t[j], q[j] );
|
||||
if (q[j] > t[j]) err = q[j] - t[j];
|
||||
vlog_error(
|
||||
"\nERROR: %sD%s: %d ulp error at %.13la: *%d vs. %d\n",
|
||||
f->name, sizeNames[k], err, ((double *)gIn)[j], t[j],
|
||||
q[j]);
|
||||
error = -1;
|
||||
goto exit;
|
||||
}
|
||||
@@ -537,13 +627,14 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
if (gVerboseBruteForce)
|
||||
{
|
||||
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize);
|
||||
} else
|
||||
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step,
|
||||
bufferSize);
|
||||
}
|
||||
else
|
||||
{
|
||||
vlog(".");
|
||||
}
|
||||
fflush(stdout);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -561,7 +652,8 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
double *p = (double *)gIn;
|
||||
for (j = 0; j < bufferSize / sizeof(cl_double); j++)
|
||||
p[j] = DoubleFromUInt32(genrand_int32(d));
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
|
||||
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
|
||||
bufferSize, gIn, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
|
||||
return error;
|
||||
@@ -573,15 +665,27 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
size_t vectorSize = sizeValues[j] * sizeof(cl_double);
|
||||
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
|
||||
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
|
||||
&gOutBuffer[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
|
||||
&gInBuffer)))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
double sum = 0.0;
|
||||
double bestTime = INFINITY;
|
||||
for (k = 0; k < PERF_LOOP_COUNT; k++)
|
||||
{
|
||||
uint64_t startTime = GetTime();
|
||||
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
|
||||
if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
|
||||
&localCount, NULL, 0, NULL,
|
||||
NULL)))
|
||||
{
|
||||
vlog_error("FAILED -- could not execute kernel\n");
|
||||
goto exit;
|
||||
@@ -597,17 +701,17 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
uint64_t endTime = GetTime();
|
||||
double time = SubtractTime(endTime, startTime);
|
||||
sum += time;
|
||||
if( time < bestTime )
|
||||
bestTime = time;
|
||||
if (time < bestTime) bestTime = time;
|
||||
}
|
||||
|
||||
if( gReportAverageTimes )
|
||||
bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
|
||||
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
|
||||
if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double)gDeviceFrequency
|
||||
* gComputeDevices * gSimdSize * 1e6
|
||||
/ (bufferSize / sizeof(double));
|
||||
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
|
||||
f->name, sizeNames[j]);
|
||||
}
|
||||
for( ; j < gMaxVectorSizeIndex; j++ )
|
||||
vlog( "\t -- " );
|
||||
for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- ");
|
||||
}
|
||||
|
||||
vlog("\n");
|
||||
@@ -624,4 +728,3 @@ exit:
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -33,24 +33,38 @@ static int BuildKernelDouble(const char *name, int vectorSize,
|
||||
static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
|
||||
cl_kernel *k, cl_program *p, bool relaxedMode)
|
||||
{
|
||||
const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in)\n"
|
||||
const char *c[] = { "__kernel void math_kernel",
|
||||
sizeNames[vectorSize],
|
||||
"( __global int",
|
||||
sizeNames[vectorSize],
|
||||
"* out, __global float",
|
||||
sizeNames[vectorSize],
|
||||
"* in)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" out[i] = ", name, "( in[i] );\n"
|
||||
"}\n"
|
||||
};
|
||||
const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global float* in)\n"
|
||||
" out[i] = ",
|
||||
name,
|
||||
"( in[i] );\n"
|
||||
"}\n" };
|
||||
const char *c3[] = {
|
||||
"__kernel void math_kernel",
|
||||
sizeNames[vectorSize],
|
||||
"( __global int* out, __global float* in)\n"
|
||||
"{\n"
|
||||
" size_t i = get_global_id(0);\n"
|
||||
" if( i + 1 < get_global_size(0) )\n"
|
||||
" {\n"
|
||||
" float3 f0 = vload3( 0, in + 3 * i );\n"
|
||||
" int3 i0 = ", name, "( f0 );\n"
|
||||
" int3 i0 = ",
|
||||
name,
|
||||
"( f0 );\n"
|
||||
" vstore3( i0, 0, out + 3*i );\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
|
||||
" size_t parity = i & 1; // Figure out how many elements are "
|
||||
"left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
|
||||
"buffer size \n"
|
||||
" int3 i0;\n"
|
||||
" float3 f0;\n"
|
||||
" switch( parity )\n"
|
||||
@@ -62,7 +76,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
|
||||
" f0 = (float3)( in[3*i], in[3*i+1], 0xdead ); \n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" i0 = ", name, "( f0 );\n"
|
||||
" i0 = ",
|
||||
name,
|
||||
"( f0 );\n"
|
||||
" switch( parity )\n"
|
||||
" {\n"
|
||||
" case 0:\n"
|
||||
@@ -86,7 +102,8 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
|
||||
}
|
||||
|
||||
char testName[32];
|
||||
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
|
||||
snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
|
||||
sizeNames[vectorSize]);
|
||||
|
||||
return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
|
||||
relaxedMode);
|
||||
@@ -97,26 +114,40 @@ static int BuildKernelDouble(const char *name, int vectorSize,
|
||||
bool relaxedMode)
|
||||
{
|
||||
const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
|
||||
"__kernel void math_kernel", sizeNames[vectorSize], "( __global long", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in)\n"
|
||||
"__kernel void math_kernel",
|
||||
sizeNames[vectorSize],
|
||||
"( __global long",
|
||||
sizeNames[vectorSize],
|
||||
"* out, __global double",
|
||||
sizeNames[vectorSize],
|
||||
"* in)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" out[i] = ", name, "( in[i] );\n"
|
||||
"}\n"
|
||||
};
|
||||
" out[i] = ",
|
||||
name,
|
||||
"( in[i] );\n"
|
||||
"}\n" };
|
||||
|
||||
const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
|
||||
"__kernel void math_kernel", sizeNames[vectorSize], "( __global long* out, __global double* in)\n"
|
||||
const char *c3[] = {
|
||||
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
|
||||
"__kernel void math_kernel",
|
||||
sizeNames[vectorSize],
|
||||
"( __global long* out, __global double* in)\n"
|
||||
"{\n"
|
||||
" size_t i = get_global_id(0);\n"
|
||||
" if( i + 1 < get_global_size(0) )\n"
|
||||
" {\n"
|
||||
" double3 d0 = vload3( 0, in + 3 * i );\n"
|
||||
" long3 l0 = ", name, "( d0 );\n"
|
||||
" long3 l0 = ",
|
||||
name,
|
||||
"( d0 );\n"
|
||||
" vstore3( l0, 0, out + 3*i );\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
|
||||
" size_t parity = i & 1; // Figure out how many elements are "
|
||||
"left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
|
||||
"buffer size \n"
|
||||
" double3 d0;\n"
|
||||
" switch( parity )\n"
|
||||
" {\n"
|
||||
@@ -127,7 +158,9 @@ static int BuildKernelDouble(const char *name, int vectorSize,
|
||||
" d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" long3 l0 = ", name, "( d0 );\n"
|
||||
" long3 l0 = ",
|
||||
name,
|
||||
"( d0 );\n"
|
||||
" switch( parity )\n"
|
||||
" {\n"
|
||||
" case 0:\n"
|
||||
@@ -152,7 +185,8 @@ static int BuildKernelDouble(const char *name, int vectorSize,
|
||||
|
||||
|
||||
char testName[32];
|
||||
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
|
||||
snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
|
||||
sizeNames[vectorSize]);
|
||||
|
||||
return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
|
||||
relaxedMode);
|
||||
@@ -168,8 +202,10 @@ typedef struct BuildKernelInfo
|
||||
bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
|
||||
} BuildKernelInfo;
|
||||
|
||||
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
|
||||
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
|
||||
static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
|
||||
void *p);
|
||||
static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
|
||||
void *p)
|
||||
{
|
||||
BuildKernelInfo *info = (BuildKernelInfo *)p;
|
||||
cl_uint i = info->offset + job_id;
|
||||
@@ -177,8 +213,10 @@ static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, voi
|
||||
info->kernels[i], info->programs + i, info->relaxedMode);
|
||||
}
|
||||
|
||||
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
|
||||
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
|
||||
static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
|
||||
void *p);
|
||||
static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
|
||||
void *p)
|
||||
{
|
||||
BuildKernelInfo *info = (BuildKernelInfo *)p;
|
||||
cl_uint i = info->offset + job_id;
|
||||
@@ -200,8 +238,11 @@ typedef struct TestInfo
|
||||
size_t subBufferSize; // Size of the sub-buffer in elements
|
||||
const Func *f; // A pointer to the function info
|
||||
cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
|
||||
cl_kernel *k[VECTOR_SIZE_COUNT ]; // arrays of thread-specific kernels for each worker thread: k[vector_size][thread_id]
|
||||
ThreadInfo *tinfo; // An array of thread specific information for each worker thread
|
||||
cl_kernel
|
||||
*k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
|
||||
// worker thread: k[vector_size][thread_id]
|
||||
ThreadInfo *
|
||||
tinfo; // An array of thread specific information for each worker thread
|
||||
cl_uint threadCount; // Number of worker threads
|
||||
cl_uint jobCount; // Number of jobs
|
||||
cl_uint step; // step between each chunk and the next.
|
||||
@@ -223,11 +264,14 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
// Init test_info
|
||||
memset(&test_info, 0, sizeof(test_info));
|
||||
test_info.threadCount = GetThreadCount();
|
||||
test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
|
||||
test_info.subBufferSize = BUFFER_SIZE
|
||||
/ (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
|
||||
test_info.scale = getTestScale(sizeof(cl_float));
|
||||
if (gWimpyMode)
|
||||
{
|
||||
test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
|
||||
test_info.subBufferSize = gWimpyBufferSize
|
||||
/ (sizeof(cl_float)
|
||||
* RoundUpToNextPowerOfTwo(test_info.threadCount));
|
||||
}
|
||||
|
||||
test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
|
||||
@@ -242,8 +286,10 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
|
||||
test_info.f = f;
|
||||
test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
|
||||
// cl_kernels aren't thread safe, so we make one for each vector size for every thread
|
||||
test_info.ftz =
|
||||
f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
|
||||
// cl_kernels aren't thread safe, so we make one for each vector size for
|
||||
// every thread
|
||||
for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
|
||||
{
|
||||
size_t array_size = test_info.threadCount * sizeof(cl_kernel);
|
||||
@@ -256,34 +302,49 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
memset(test_info.k[i], 0, array_size);
|
||||
}
|
||||
test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
|
||||
test_info.tinfo =
|
||||
(ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
|
||||
if (NULL == test_info.tinfo)
|
||||
{
|
||||
vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
|
||||
vlog_error(
|
||||
"Error: Unable to allocate storage for thread specific data.\n");
|
||||
error = CL_OUT_OF_HOST_MEMORY;
|
||||
goto exit;
|
||||
}
|
||||
memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
|
||||
memset(test_info.tinfo, 0,
|
||||
test_info.threadCount * sizeof(*test_info.tinfo));
|
||||
for (i = 0; i < test_info.threadCount; i++)
|
||||
{
|
||||
cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) };
|
||||
test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
|
||||
cl_buffer_region region = {
|
||||
i * test_info.subBufferSize * sizeof(cl_float),
|
||||
test_info.subBufferSize * sizeof(cl_float)
|
||||
};
|
||||
test_info.tinfo[i].inBuf =
|
||||
clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
|
||||
if (error || NULL == test_info.tinfo[i].inBuf)
|
||||
{
|
||||
vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
|
||||
vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
|
||||
"region {%zd, %zd}\n",
|
||||
region.origin, region.size);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
|
||||
{
|
||||
test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
|
||||
test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
|
||||
gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion, &error);
|
||||
if (error || NULL == test_info.tinfo[i].outBuf[j])
|
||||
{
|
||||
vlog_error( "Error: Unable to create sub-buffer of gOutBuffer for region {%zd, %zd}\n", region.origin, region.size );
|
||||
vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
|
||||
"for region {%zd, %zd}\n",
|
||||
region.origin, region.size);
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
|
||||
test_info.tinfo[i].tQueue =
|
||||
clCreateCommandQueue(gContext, gDevice, 0, &error);
|
||||
if (NULL == test_info.tinfo[i].tQueue || error)
|
||||
{
|
||||
vlog_error("clCreateCommandQueue failed. (%d)\n", error);
|
||||
@@ -297,7 +358,9 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
gMinVectorSizeIndex, test_info.threadCount, test_info.k,
|
||||
test_info.programs, f->nameInCode, relaxedMode
|
||||
};
|
||||
if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
|
||||
if ((error = ThreadPool_Do(BuildKernel_FloatFn,
|
||||
gMaxVectorSizeIndex - gMinVectorSizeIndex,
|
||||
&build_info)))
|
||||
goto exit;
|
||||
}
|
||||
|
||||
@@ -305,8 +368,7 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
|
||||
|
||||
if( error )
|
||||
goto exit;
|
||||
if (error) goto exit;
|
||||
|
||||
if (gWimpyMode)
|
||||
vlog("Wimp pass");
|
||||
@@ -320,7 +382,8 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
cl_uint *p = (cl_uint *)gIn;
|
||||
for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
|
||||
p[j] = genrand_int32(d);
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
|
||||
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
|
||||
BUFFER_SIZE, gIn, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
|
||||
return error;
|
||||
@@ -331,16 +394,29 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
|
||||
{
|
||||
size_t vectorSize = sizeof(cl_float) * sizeValues[j];
|
||||
size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; // BUFFER_SIZE / vectorSize rounded up
|
||||
if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
|
||||
size_t localCount = (BUFFER_SIZE + vectorSize - 1)
|
||||
/ vectorSize; // BUFFER_SIZE / vectorSize rounded up
|
||||
if ((error = clSetKernelArg(test_info.k[j][0], 0,
|
||||
sizeof(gOutBuffer[j]), &gOutBuffer[j])))
|
||||
{
|
||||
LogBuildError(test_info.programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
|
||||
&gInBuffer)))
|
||||
{
|
||||
LogBuildError(test_info.programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
double sum = 0.0;
|
||||
double bestTime = INFINITY;
|
||||
for (i = 0; i < PERF_LOOP_COUNT; i++)
|
||||
{
|
||||
uint64_t startTime = GetTime();
|
||||
if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
|
||||
if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
|
||||
1, NULL, &localCount, NULL,
|
||||
0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("FAILED -- could not execute kernel\n");
|
||||
goto exit;
|
||||
@@ -356,14 +432,15 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
uint64_t endTime = GetTime();
|
||||
double time = SubtractTime(endTime, startTime);
|
||||
sum += time;
|
||||
if( time < bestTime )
|
||||
bestTime = time;
|
||||
if (time < bestTime) bestTime = time;
|
||||
}
|
||||
|
||||
if( gReportAverageTimes )
|
||||
bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) );
|
||||
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
|
||||
if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double)gDeviceFrequency
|
||||
* gComputeDevices * gSimdSize * 1e6
|
||||
/ (BUFFER_SIZE / sizeof(float));
|
||||
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
|
||||
f->name, sizeNames[j]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -413,8 +490,7 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
|
||||
const char *name = job->f->name;
|
||||
|
||||
int signbit_test = 0;
|
||||
if(!strcmp(name, "signbit"))
|
||||
signbit_test = 1;
|
||||
if (!strcmp(name, "signbit")) signbit_test = 1;
|
||||
|
||||
#define ref_func(s) (signbit_test ? func.i_f_f(s) : func.i_f(s))
|
||||
|
||||
@@ -423,25 +499,27 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
|
||||
cl_int *out[VECTOR_SIZE_COUNT];
|
||||
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
|
||||
{
|
||||
out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
|
||||
out[j] = (cl_int *)clEnqueueMapBuffer(
|
||||
tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
|
||||
buffer_size, 0, NULL, e + j, &error);
|
||||
if (error || NULL == out[j])
|
||||
{
|
||||
vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
|
||||
vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
|
||||
error);
|
||||
return error;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Get that moving
|
||||
if( (error = clFlush(tinfo->tQueue) ))
|
||||
vlog( "clFlush failed\n" );
|
||||
if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
|
||||
|
||||
// Write the new values to the input array
|
||||
cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
|
||||
for( j = 0; j < buffer_elements; j++ )
|
||||
p[j] = base + j * scale;
|
||||
for (j = 0; j < buffer_elements; j++) p[j] = base + j * scale;
|
||||
|
||||
if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
|
||||
if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
|
||||
buffer_size, p, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
|
||||
return error;
|
||||
@@ -461,24 +539,39 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
|
||||
return error;
|
||||
}
|
||||
|
||||
// Fill the result buffer with garbage, so that old results don't carry over
|
||||
// Fill the result buffer with garbage, so that old results don't carry
|
||||
// over
|
||||
uint32_t pattern = 0xffffdead;
|
||||
memset_pattern4(out[j], &pattern, buffer_size);
|
||||
if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
|
||||
if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
|
||||
out[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
|
||||
return error;
|
||||
}
|
||||
|
||||
// run the kernel
|
||||
size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
|
||||
cl_kernel kernel = job->k[j][thread_id]; //each worker thread has its own copy of the cl_kernel
|
||||
size_t vectorCount =
|
||||
(buffer_elements + sizeValues[j] - 1) / sizeValues[j];
|
||||
cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
|
||||
// own copy of the cl_kernel
|
||||
cl_program program = job->programs[j];
|
||||
|
||||
if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
|
||||
if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
|
||||
if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
|
||||
&tinfo->outBuf[j])))
|
||||
{
|
||||
LogBuildError(program);
|
||||
return error;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
|
||||
&tinfo->inBuf)))
|
||||
{
|
||||
LogBuildError(program);
|
||||
return error;
|
||||
}
|
||||
|
||||
if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
|
||||
if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
|
||||
&vectorCount, NULL, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("FAILED -- could not execute kernel\n");
|
||||
return error;
|
||||
@@ -487,30 +580,33 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
|
||||
|
||||
|
||||
// Get that moving
|
||||
if( (error = clFlush(tinfo->tQueue) ))
|
||||
vlog( "clFlush 2 failed\n" );
|
||||
if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
|
||||
|
||||
if( gSkipCorrectnessTesting )
|
||||
return CL_SUCCESS;
|
||||
if (gSkipCorrectnessTesting) return CL_SUCCESS;
|
||||
|
||||
// Calculate the correctly rounded reference result
|
||||
cl_int *r = (cl_int *)gOut_Ref + thread_id * buffer_elements;
|
||||
float *s = (float *)p;
|
||||
for( j = 0; j < buffer_elements; j++ )
|
||||
r[j] = ref_func( s[j] );
|
||||
for (j = 0; j < buffer_elements; j++) r[j] = ref_func(s[j]);
|
||||
|
||||
// Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
|
||||
// Read the data back -- no need to wait for the first N-1 buffers. This is
|
||||
// an in order queue.
|
||||
for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
|
||||
{
|
||||
out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
|
||||
out[j] = (cl_int *)clEnqueueMapBuffer(
|
||||
tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
|
||||
buffer_size, 0, NULL, NULL, &error);
|
||||
if (error || NULL == out[j])
|
||||
{
|
||||
vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
|
||||
vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
|
||||
error);
|
||||
return error;
|
||||
}
|
||||
}
|
||||
// Wait for the last buffer
|
||||
out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
|
||||
out[j] = (cl_int *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
|
||||
CL_TRUE, CL_MAP_READ, 0, buffer_size,
|
||||
0, NULL, NULL, &error);
|
||||
if (error || NULL == out[j])
|
||||
{
|
||||
vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
|
||||
@@ -535,15 +631,14 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
|
||||
{
|
||||
int correct = ref_func(+0.0f);
|
||||
int correct2 = ref_func(-0.0f);
|
||||
if( correct == q[j] || correct2 == q[j] )
|
||||
continue;
|
||||
if (correct == q[j] || correct2 == q[j]) continue;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t err = t[j] - q[j];
|
||||
if( q[j] > t[j] )
|
||||
err = q[j] - t[j];
|
||||
vlog_error( "\nERROR: %s: %d ulp error at %a: *%d vs. %d\n", name, err, ((float*) s)[j], t[j], q[j] );
|
||||
if (q[j] > t[j]) err = q[j] - t[j];
|
||||
vlog_error("\nERROR: %s: %d ulp error at %a: *%d vs. %d\n",
|
||||
name, err, ((float *)s)[j], t[j], q[j]);
|
||||
error = -1;
|
||||
goto exit;
|
||||
}
|
||||
@@ -561,15 +656,15 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
|
||||
{
|
||||
int correct = -ref_func(+0.0f);
|
||||
int correct2 = -ref_func(-0.0f);
|
||||
if( correct == q[j] || correct2 == q[j] )
|
||||
continue;
|
||||
if (correct == q[j] || correct2 == q[j]) continue;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t err = -t[j] - q[j];
|
||||
if( q[j] > -t[j] )
|
||||
err = q[j] + t[j];
|
||||
vlog_error( "\nERROR: %s%s: %d ulp error at %a: *%d vs. %d\n", name, sizeNames[k], err, ((float*) s)[j], -t[j], q[j] );
|
||||
if (q[j] > -t[j]) err = q[j] + t[j];
|
||||
vlog_error(
|
||||
"\nERROR: %s%s: %d ulp error at %a: *%d vs. %d\n", name,
|
||||
sizeNames[k], err, ((float *)s)[j], -t[j], q[j]);
|
||||
error = -1;
|
||||
goto exit;
|
||||
}
|
||||
@@ -581,9 +676,11 @@ exit:
|
||||
ret = error;
|
||||
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
|
||||
{
|
||||
if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
|
||||
if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
|
||||
out[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
|
||||
vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
|
||||
j, error);
|
||||
return error;
|
||||
}
|
||||
}
|
||||
@@ -599,8 +696,12 @@ exit:
|
||||
{
|
||||
if (gVerboseBruteForce)
|
||||
{
|
||||
vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->threadCount);
|
||||
} else
|
||||
vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
|
||||
"ThreadCount:%2u\n",
|
||||
base, job->step, job->scale, buffer_elements,
|
||||
job->threadCount);
|
||||
}
|
||||
else
|
||||
{
|
||||
vlog(".");
|
||||
}
|
||||
@@ -622,11 +723,14 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
// Init test_info
|
||||
memset(&test_info, 0, sizeof(test_info));
|
||||
test_info.threadCount = GetThreadCount();
|
||||
test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
|
||||
test_info.subBufferSize = BUFFER_SIZE
|
||||
/ (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
|
||||
test_info.scale = getTestScale(sizeof(cl_double));
|
||||
if (gWimpyMode)
|
||||
{
|
||||
test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
|
||||
test_info.subBufferSize = gWimpyBufferSize
|
||||
/ (sizeof(cl_double)
|
||||
* RoundUpToNextPowerOfTwo(test_info.threadCount));
|
||||
}
|
||||
|
||||
test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
|
||||
@@ -643,7 +747,8 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
test_info.f = f;
|
||||
test_info.ftz = f->ftz || gForceFTZ;
|
||||
|
||||
// cl_kernels aren't thread safe, so we make one for each vector size for every thread
|
||||
// cl_kernels aren't thread safe, so we make one for each vector size for
|
||||
// every thread
|
||||
for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
|
||||
{
|
||||
size_t array_size = test_info.threadCount * sizeof(cl_kernel);
|
||||
@@ -656,36 +761,52 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
memset(test_info.k[i], 0, array_size);
|
||||
}
|
||||
test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
|
||||
test_info.tinfo =
|
||||
(ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
|
||||
if (NULL == test_info.tinfo)
|
||||
{
|
||||
vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
|
||||
vlog_error(
|
||||
"Error: Unable to allocate storage for thread specific data.\n");
|
||||
error = CL_OUT_OF_HOST_MEMORY;
|
||||
goto exit;
|
||||
}
|
||||
memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
|
||||
memset(test_info.tinfo, 0,
|
||||
test_info.threadCount * sizeof(*test_info.tinfo));
|
||||
for (i = 0; i < test_info.threadCount; i++)
|
||||
{
|
||||
cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) };
|
||||
test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
|
||||
cl_buffer_region region = {
|
||||
i * test_info.subBufferSize * sizeof(cl_double),
|
||||
test_info.subBufferSize * sizeof(cl_double)
|
||||
};
|
||||
test_info.tinfo[i].inBuf =
|
||||
clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
|
||||
if (error || NULL == test_info.tinfo[i].inBuf)
|
||||
{
|
||||
vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
|
||||
vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
|
||||
"region {%zd, %zd}\n",
|
||||
region.origin, region.size);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
|
||||
{
|
||||
/* Qualcomm fix: 9461 read-write flags must be compatible with parent buffer */
|
||||
test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
|
||||
/* Qualcomm fix: 9461 read-write flags must be compatible with
|
||||
* parent buffer */
|
||||
test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
|
||||
gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion, &error);
|
||||
/* Qualcomm fix: end */
|
||||
if (error || NULL == test_info.tinfo[i].outBuf[j])
|
||||
{
|
||||
vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
|
||||
vlog_error("Error: Unable to create sub-buffer of gInBuffer "
|
||||
"for region {%zd, %zd}\n",
|
||||
region.origin, region.size);
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
|
||||
test_info.tinfo[i].tQueue =
|
||||
clCreateCommandQueue(gContext, gDevice, 0, &error);
|
||||
if (NULL == test_info.tinfo[i].tQueue || error)
|
||||
{
|
||||
vlog_error("clCreateCommandQueue failed. (%d)\n", error);
|
||||
@@ -699,7 +820,9 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
gMinVectorSizeIndex, test_info.threadCount, test_info.k,
|
||||
test_info.programs, f->nameInCode, relaxedMode
|
||||
};
|
||||
if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
|
||||
if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
|
||||
gMaxVectorSizeIndex - gMinVectorSizeIndex,
|
||||
&build_info)))
|
||||
goto exit;
|
||||
}
|
||||
|
||||
@@ -707,8 +830,7 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
|
||||
|
||||
if( error )
|
||||
goto exit;
|
||||
if (error) goto exit;
|
||||
|
||||
if (gWimpyMode)
|
||||
vlog("Wimp pass");
|
||||
@@ -722,7 +844,8 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
cl_ulong *p = (cl_ulong *)gIn;
|
||||
for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
|
||||
p[j] = DoubleFromUInt32(genrand_int32(d));
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
|
||||
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
|
||||
BUFFER_SIZE, gIn, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
|
||||
return error;
|
||||
@@ -734,15 +857,27 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
size_t vectorSize = sizeValues[j] * sizeof(cl_double);
|
||||
size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
|
||||
if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
|
||||
if ((error = clSetKernelArg(test_info.k[j][0], 0,
|
||||
sizeof(gOutBuffer[j]), &gOutBuffer[j])))
|
||||
{
|
||||
LogBuildError(test_info.programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
|
||||
&gInBuffer)))
|
||||
{
|
||||
LogBuildError(test_info.programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
double sum = 0.0;
|
||||
double bestTime = INFINITY;
|
||||
for (i = 0; i < PERF_LOOP_COUNT; i++)
|
||||
{
|
||||
uint64_t startTime = GetTime();
|
||||
if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
|
||||
if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
|
||||
1, NULL, &localCount, NULL,
|
||||
0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("FAILED -- could not execute kernel\n");
|
||||
goto exit;
|
||||
@@ -758,17 +893,17 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
uint64_t endTime = GetTime();
|
||||
double time = SubtractTime(endTime, startTime);
|
||||
sum += time;
|
||||
if( time < bestTime )
|
||||
bestTime = time;
|
||||
if (time < bestTime) bestTime = time;
|
||||
}
|
||||
|
||||
if( gReportAverageTimes )
|
||||
bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) );
|
||||
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
|
||||
if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double)gDeviceFrequency
|
||||
* gComputeDevices * gSimdSize * 1e6
|
||||
/ (BUFFER_SIZE / sizeof(double));
|
||||
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
|
||||
f->name, sizeNames[j]);
|
||||
}
|
||||
for( ; j < gMaxVectorSizeIndex; j++ )
|
||||
vlog( "\t -- " );
|
||||
for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- ");
|
||||
}
|
||||
|
||||
vlog("\n");
|
||||
@@ -822,24 +957,27 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
|
||||
cl_long *out[VECTOR_SIZE_COUNT];
|
||||
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
|
||||
{
|
||||
out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
|
||||
out[j] = (cl_long *)clEnqueueMapBuffer(
|
||||
tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
|
||||
buffer_size, 0, NULL, e + j, &error);
|
||||
if (error || NULL == out[j])
|
||||
{
|
||||
vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
|
||||
vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
|
||||
error);
|
||||
return error;
|
||||
}
|
||||
}
|
||||
|
||||
// Get that moving
|
||||
if( (error = clFlush(tinfo->tQueue) ))
|
||||
vlog( "clFlush failed\n" );
|
||||
if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
|
||||
|
||||
// Write the new values to the input array
|
||||
cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
|
||||
for (j = 0; j < buffer_elements; j++)
|
||||
p[j] = DoubleFromUInt32(base + j * scale);
|
||||
|
||||
if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
|
||||
if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
|
||||
buffer_size, p, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
|
||||
return error;
|
||||
@@ -859,24 +997,39 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
|
||||
return error;
|
||||
}
|
||||
|
||||
// Fill the result buffer with garbage, so that old results don't carry over
|
||||
// Fill the result buffer with garbage, so that old results don't carry
|
||||
// over
|
||||
uint32_t pattern = 0xffffdead;
|
||||
memset_pattern4(out[j], &pattern, buffer_size);
|
||||
if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
|
||||
if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
|
||||
out[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
|
||||
return error;
|
||||
}
|
||||
|
||||
// run the kernel
|
||||
size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
|
||||
cl_kernel kernel = job->k[j][thread_id]; //each worker thread has its own copy of the cl_kernel
|
||||
size_t vectorCount =
|
||||
(buffer_elements + sizeValues[j] - 1) / sizeValues[j];
|
||||
cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
|
||||
// own copy of the cl_kernel
|
||||
cl_program program = job->programs[j];
|
||||
|
||||
if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
|
||||
if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
|
||||
if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
|
||||
&tinfo->outBuf[j])))
|
||||
{
|
||||
LogBuildError(program);
|
||||
return error;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
|
||||
&tinfo->inBuf)))
|
||||
{
|
||||
LogBuildError(program);
|
||||
return error;
|
||||
}
|
||||
|
||||
if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
|
||||
if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
|
||||
&vectorCount, NULL, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("FAILED -- could not execute kernel\n");
|
||||
return error;
|
||||
@@ -885,30 +1038,33 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
|
||||
|
||||
|
||||
// Get that moving
|
||||
if( (error = clFlush(tinfo->tQueue) ))
|
||||
vlog( "clFlush 2 failed\n" );
|
||||
if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
|
||||
|
||||
if( gSkipCorrectnessTesting )
|
||||
return CL_SUCCESS;
|
||||
if (gSkipCorrectnessTesting) return CL_SUCCESS;
|
||||
|
||||
// Calculate the correctly rounded reference result
|
||||
cl_long *r = (cl_long *)gOut_Ref + thread_id * buffer_elements;
|
||||
cl_double *s = (cl_double *)p;
|
||||
for( j = 0; j < buffer_elements; j++ )
|
||||
r[j] = dfunc.i_f( s[j] );
|
||||
for (j = 0; j < buffer_elements; j++) r[j] = dfunc.i_f(s[j]);
|
||||
|
||||
// Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
|
||||
// Read the data back -- no need to wait for the first N-1 buffers. This is
|
||||
// an in order queue.
|
||||
for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
|
||||
{
|
||||
out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
|
||||
out[j] = (cl_long *)clEnqueueMapBuffer(
|
||||
tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
|
||||
buffer_size, 0, NULL, NULL, &error);
|
||||
if (error || NULL == out[j])
|
||||
{
|
||||
vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
|
||||
vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
|
||||
error);
|
||||
return error;
|
||||
}
|
||||
}
|
||||
// Wait for the last buffer
|
||||
out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
|
||||
out[j] = (cl_long *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
|
||||
CL_TRUE, CL_MAP_READ, 0, buffer_size,
|
||||
0, NULL, NULL, &error);
|
||||
if (error || NULL == out[j])
|
||||
{
|
||||
vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
|
||||
@@ -933,15 +1089,14 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
|
||||
{
|
||||
cl_long correct = dfunc.i_f(+0.0f);
|
||||
cl_long correct2 = dfunc.i_f(-0.0f);
|
||||
if( correct == q[j] || correct2 == q[j] )
|
||||
continue;
|
||||
if (correct == q[j] || correct2 == q[j]) continue;
|
||||
}
|
||||
}
|
||||
|
||||
cl_ulong err = t[j] - q[j];
|
||||
if( q[j] > t[j] )
|
||||
err = q[j] - t[j];
|
||||
vlog_error( "\nERROR: %sD: %zd ulp error at %.13la: *%zd vs. %zd\n", name, err, ((double*) gIn)[j], t[j], q[j] );
|
||||
if (q[j] > t[j]) err = q[j] - t[j];
|
||||
vlog_error("\nERROR: %sD: %zd ulp error at %.13la: *%zd vs. %zd\n",
|
||||
name, err, ((double *)gIn)[j], t[j], q[j]);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -958,40 +1113,44 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
|
||||
{
|
||||
int64_t correct = -dfunc.i_f(+0.0f);
|
||||
int64_t correct2 = -dfunc.i_f(-0.0f);
|
||||
if( correct == q[j] || correct2 == q[j] )
|
||||
continue;
|
||||
if (correct == q[j] || correct2 == q[j]) continue;
|
||||
}
|
||||
}
|
||||
|
||||
cl_ulong err = -t[j] - q[j];
|
||||
if( q[j] > -t[j] )
|
||||
err = q[j] + t[j];
|
||||
vlog_error( "\nERROR: %sD%s: %zd ulp error at %.13la: *%zd vs. %zd\n", name, sizeNames[k], err, ((double*) gIn)[j], -t[j], q[j] );
|
||||
if (q[j] > -t[j]) err = q[j] + t[j];
|
||||
vlog_error(
|
||||
"\nERROR: %sD%s: %zd ulp error at %.13la: *%zd vs. %zd\n",
|
||||
name, sizeNames[k], err, ((double *)gIn)[j], -t[j], q[j]);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
|
||||
{
|
||||
if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
|
||||
if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
|
||||
out[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
|
||||
vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
|
||||
j, error);
|
||||
return error;
|
||||
}
|
||||
}
|
||||
|
||||
if( (error = clFlush(tinfo->tQueue) ))
|
||||
vlog( "clFlush 3 failed\n" );
|
||||
if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
|
||||
|
||||
|
||||
if (0 == (base & 0x0fffffff))
|
||||
{
|
||||
if (gVerboseBruteForce)
|
||||
{
|
||||
vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->threadCount);
|
||||
} else
|
||||
vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
|
||||
"ThreadCount:%2u\n",
|
||||
base, job->step, job->scale, buffer_elements,
|
||||
job->threadCount);
|
||||
}
|
||||
else
|
||||
{
|
||||
vlog(".");
|
||||
}
|
||||
@@ -1000,7 +1159,3 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
|
||||
|
||||
return CL_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -234,5 +234,3 @@ long double reference_assignmentl( long double x );
|
||||
int reference_notl(long double x);
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -32,27 +32,43 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
|
||||
static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
|
||||
cl_program *p, bool relaxedMode)
|
||||
{
|
||||
const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* out2, __global float", sizeNames[vectorSize], "* in)\n"
|
||||
const char *c[] = { "__kernel void math_kernel",
|
||||
sizeNames[vectorSize],
|
||||
"( __global float",
|
||||
sizeNames[vectorSize],
|
||||
"* out, __global float",
|
||||
sizeNames[vectorSize],
|
||||
"* out2, __global float",
|
||||
sizeNames[vectorSize],
|
||||
"* in)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" out[i] = ", name, "( in[i], out2 + i );\n"
|
||||
"}\n"
|
||||
};
|
||||
" out[i] = ",
|
||||
name,
|
||||
"( in[i], out2 + i );\n"
|
||||
"}\n" };
|
||||
|
||||
const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* out2, __global float* in)\n"
|
||||
const char *c3[] = {
|
||||
"__kernel void math_kernel",
|
||||
sizeNames[vectorSize],
|
||||
"( __global float* out, __global float* out2, __global float* in)\n"
|
||||
"{\n"
|
||||
" size_t i = get_global_id(0);\n"
|
||||
" if( i + 1 < get_global_size(0) )\n"
|
||||
" {\n"
|
||||
" float3 f0 = vload3( 0, in + 3 * i );\n"
|
||||
" float3 iout = NAN;\n"
|
||||
" f0 = ", name, "( f0, &iout );\n"
|
||||
" f0 = ",
|
||||
name,
|
||||
"( f0, &iout );\n"
|
||||
" vstore3( f0, 0, out + 3*i );\n"
|
||||
" vstore3( iout, 0, out2 + 3*i );\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
|
||||
" size_t parity = i & 1; // Figure out how many elements are "
|
||||
"left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
|
||||
"buffer size \n"
|
||||
" float3 iout = NAN;\n"
|
||||
" float3 f0;\n"
|
||||
" switch( parity )\n"
|
||||
@@ -64,7 +80,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
|
||||
" f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" f0 = ", name, "( f0, &iout );\n"
|
||||
" f0 = ",
|
||||
name,
|
||||
"( f0, &iout );\n"
|
||||
" switch( parity )\n"
|
||||
" {\n"
|
||||
" case 0:\n"
|
||||
@@ -89,7 +107,8 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
|
||||
}
|
||||
|
||||
char testName[32];
|
||||
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
|
||||
snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
|
||||
sizeNames[vectorSize]);
|
||||
|
||||
return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
|
||||
}
|
||||
@@ -98,28 +117,44 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
|
||||
cl_program *p, bool relaxedMode)
|
||||
{
|
||||
const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
|
||||
"__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* out2, __global double", sizeNames[vectorSize], "* in)\n"
|
||||
"__kernel void math_kernel",
|
||||
sizeNames[vectorSize],
|
||||
"( __global double",
|
||||
sizeNames[vectorSize],
|
||||
"* out, __global double",
|
||||
sizeNames[vectorSize],
|
||||
"* out2, __global double",
|
||||
sizeNames[vectorSize],
|
||||
"* in)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" out[i] = ", name, "( in[i], out2 + i );\n"
|
||||
"}\n"
|
||||
};
|
||||
" out[i] = ",
|
||||
name,
|
||||
"( in[i], out2 + i );\n"
|
||||
"}\n" };
|
||||
|
||||
const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
|
||||
"__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global double* out2, __global double* in)\n"
|
||||
const char *c3[] = {
|
||||
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
|
||||
"__kernel void math_kernel",
|
||||
sizeNames[vectorSize],
|
||||
"( __global double* out, __global double* out2, __global double* in)\n"
|
||||
"{\n"
|
||||
" size_t i = get_global_id(0);\n"
|
||||
" if( i + 1 < get_global_size(0) )\n"
|
||||
" {\n"
|
||||
" double3 f0 = vload3( 0, in + 3 * i );\n"
|
||||
" double3 iout = NAN;\n"
|
||||
" f0 = ", name, "( f0, &iout );\n"
|
||||
" f0 = ",
|
||||
name,
|
||||
"( f0, &iout );\n"
|
||||
" vstore3( f0, 0, out + 3*i );\n"
|
||||
" vstore3( iout, 0, out2 + 3*i );\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
|
||||
" size_t parity = i & 1; // Figure out how many elements are "
|
||||
"left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
|
||||
"buffer size \n"
|
||||
" double3 iout = NAN;\n"
|
||||
" double3 f0;\n"
|
||||
" switch( parity )\n"
|
||||
@@ -131,7 +166,9 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
|
||||
" f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" f0 = ", name, "( f0, &iout );\n"
|
||||
" f0 = ",
|
||||
name,
|
||||
"( f0, &iout );\n"
|
||||
" switch( parity )\n"
|
||||
" {\n"
|
||||
" case 0:\n"
|
||||
@@ -156,7 +193,8 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
|
||||
}
|
||||
|
||||
char testName[32];
|
||||
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
|
||||
snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
|
||||
sizeNames[vectorSize]);
|
||||
|
||||
return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
|
||||
}
|
||||
@@ -170,8 +208,10 @@ typedef struct BuildKernelInfo
|
||||
bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
|
||||
} BuildKernelInfo;
|
||||
|
||||
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
|
||||
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
|
||||
static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
|
||||
void *p);
|
||||
static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
|
||||
void *p)
|
||||
{
|
||||
BuildKernelInfo *info = (BuildKernelInfo *)p;
|
||||
cl_uint i = info->offset + job_id;
|
||||
@@ -179,8 +219,10 @@ static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, voi
|
||||
info->programs + i, info->relaxedMode);
|
||||
}
|
||||
|
||||
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
|
||||
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
|
||||
static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
|
||||
void *p);
|
||||
static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
|
||||
void *p)
|
||||
{
|
||||
BuildKernelInfo *info = (BuildKernelInfo *)p;
|
||||
cl_uint i = info->offset + job_id;
|
||||
@@ -215,12 +257,14 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
// Init the kernels
|
||||
BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
|
||||
f->nameInCode, relaxedMode };
|
||||
if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
|
||||
if ((error = ThreadPool_Do(BuildKernel_FloatFn,
|
||||
gMaxVectorSizeIndex - gMinVectorSizeIndex,
|
||||
&build_info)))
|
||||
return error;
|
||||
/*
|
||||
for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
|
||||
if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) )
|
||||
return error;
|
||||
if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i,
|
||||
programs + i) ) ) return error;
|
||||
*/
|
||||
|
||||
for (i = 0; i < (1ULL << 32); i += step)
|
||||
@@ -251,7 +295,8 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
}
|
||||
}
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
|
||||
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
|
||||
bufferSize, gIn, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
|
||||
return error;
|
||||
@@ -262,16 +307,22 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
uint32_t pattern = 0xffffdead;
|
||||
memset_pattern4(gOut[j], &pattern, bufferSize);
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
|
||||
if ((error =
|
||||
clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
|
||||
bufferSize, gOut[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
|
||||
error, j);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
memset_pattern4(gOut2[j], &pattern, bufferSize);
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL)))
|
||||
if ((error =
|
||||
clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
|
||||
bufferSize, gOut2[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
|
||||
error, j);
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
@@ -281,11 +332,28 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
size_t vectorSize = sizeValues[j] * sizeof(cl_float);
|
||||
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
|
||||
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg(kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
|
||||
&gOutBuffer[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
|
||||
&gOutBuffer2[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
|
||||
&gInBuffer)))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
|
||||
if ((error =
|
||||
clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
|
||||
&localCount, NULL, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("FAILED -- could not execute kernel\n");
|
||||
goto exit;
|
||||
@@ -293,8 +361,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
|
||||
// Get that moving
|
||||
if( (error = clFlush(gQueue) ))
|
||||
vlog( "clFlush failed\n" );
|
||||
if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
|
||||
|
||||
FPU_mode_type oldMode;
|
||||
RoundingMode oldRoundMode = kRoundToNearestEven;
|
||||
@@ -302,8 +369,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
// Calculate the correctly rounded reference result
|
||||
memset(&oldMode, 0, sizeof(oldMode));
|
||||
if( ftz )
|
||||
ForceFTZ( &oldMode );
|
||||
if (ftz) ForceFTZ(&oldMode);
|
||||
|
||||
// Set the rounding mode to match the device
|
||||
if (gIsInRTZMode)
|
||||
@@ -328,7 +394,8 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
r[j] = (float)f->func.f_fpf(s[j], &dd);
|
||||
|
||||
r2[j] = (float)dd;
|
||||
overflow[j] = FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
|
||||
overflow[j] =
|
||||
FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -345,18 +412,21 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
}
|
||||
|
||||
if( isFract && ftz )
|
||||
RestoreFPState( &oldMode );
|
||||
if (isFract && ftz) RestoreFPState(&oldMode);
|
||||
|
||||
// Read the data back
|
||||
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
|
||||
{
|
||||
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
|
||||
if ((error =
|
||||
clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
|
||||
bufferSize, gOut[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("ReadArray failed %d\n", error);
|
||||
goto exit;
|
||||
}
|
||||
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
|
||||
if ((error =
|
||||
clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
|
||||
bufferSize, gOut2[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("ReadArray2 failed %d\n", error);
|
||||
goto exit;
|
||||
@@ -365,8 +435,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
|
||||
if (gSkipCorrectnessTesting)
|
||||
{
|
||||
if (isFract && gIsInRTZMode)
|
||||
(void)set_round(oldRoundMode, kfloat);
|
||||
if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -393,26 +462,31 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
else
|
||||
correct = f->func.f_fpf(s[j], &correct2);
|
||||
|
||||
// Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
|
||||
// Per section 10 paragraph 6, accept any result if an input
|
||||
// or output is a infinity or NaN or overflow
|
||||
if (relaxedMode || skipNanInf)
|
||||
{
|
||||
if (skipNanInf && overflow[j])
|
||||
continue;
|
||||
if (skipNanInf && overflow[j]) continue;
|
||||
|
||||
// Note: no double rounding here. Reference functions calculate in single precision.
|
||||
if( IsFloatInfinity(correct) || IsFloatNaN(correct) ||
|
||||
IsFloatInfinity(correct2)|| IsFloatNaN(correct2) ||
|
||||
IsFloatInfinity(s[j]) || IsFloatNaN(s[j]) )
|
||||
// Note: no double rounding here. Reference functions
|
||||
// calculate in single precision.
|
||||
if (IsFloatInfinity(correct) || IsFloatNaN(correct)
|
||||
|| IsFloatInfinity(correct2) || IsFloatNaN(correct2)
|
||||
|| IsFloatInfinity(s[j]) || IsFloatNaN(s[j]))
|
||||
continue;
|
||||
}
|
||||
|
||||
typedef int (*CheckForSubnormal) (double,float); // If we are in fast relaxed math, we have a different calculation for the subnormal threshold.
|
||||
typedef int (*CheckForSubnormal)(
|
||||
double, float); // If we are in fast relaxed math, we
|
||||
// have a different calculation for the
|
||||
// subnormal threshold.
|
||||
CheckForSubnormal isFloatResultSubnormalPtr;
|
||||
if (relaxedMode)
|
||||
{
|
||||
err = Abs_Error(test, correct);
|
||||
err2 = Abs_Error(test2, correct2);
|
||||
isFloatResultSubnormalPtr = &IsFloatResultSubnormalAbsError;
|
||||
isFloatResultSubnormalPtr =
|
||||
&IsFloatResultSubnormalAbsError;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -420,14 +494,16 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
err2 = Ulp_Error(test2, correct2);
|
||||
isFloatResultSubnormalPtr = &IsFloatResultSubnormal;
|
||||
}
|
||||
int fail = ! (fabsf(err) <= float_ulps && fabsf(err2) <= float_ulps);
|
||||
int fail = !(fabsf(err) <= float_ulps
|
||||
&& fabsf(err2) <= float_ulps);
|
||||
|
||||
if (ftz)
|
||||
{
|
||||
// retry per section 6.5.3.2
|
||||
if ((*isFloatResultSubnormalPtr)(correct, float_ulps))
|
||||
{
|
||||
if( (*isFloatResultSubnormalPtr) (correct2, float_ulps ))
|
||||
if ((*isFloatResultSubnormalPtr)(correct2,
|
||||
float_ulps))
|
||||
{
|
||||
fail = fail && !(test == 0.0f && test2 == 0.0f);
|
||||
if (!fail)
|
||||
@@ -438,16 +514,18 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
else
|
||||
{
|
||||
fail = fail && ! ( test == 0.0f && fabsf(err2) <= float_ulps);
|
||||
if( ! fail )
|
||||
err = 0.0f;
|
||||
fail = fail
|
||||
&& !(test == 0.0f
|
||||
&& fabsf(err2) <= float_ulps);
|
||||
if (!fail) err = 0.0f;
|
||||
}
|
||||
}
|
||||
else if( (*isFloatResultSubnormalPtr)(correct2, float_ulps ) )
|
||||
else if ((*isFloatResultSubnormalPtr)(correct2,
|
||||
float_ulps))
|
||||
{
|
||||
fail = fail && ! ( test2 == 0.0f && fabsf(err) <= float_ulps);
|
||||
if( ! fail )
|
||||
err2 = 0.0f;
|
||||
fail = fail
|
||||
&& !(test2 == 0.0f && fabsf(err) <= float_ulps);
|
||||
if (!fail) err2 = 0.0f;
|
||||
}
|
||||
|
||||
|
||||
@@ -458,8 +536,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
double correct2p, correct2n;
|
||||
float errp, err2p, errn, err2n;
|
||||
|
||||
if( skipNanInf )
|
||||
feclearexcept(FE_OVERFLOW);
|
||||
if (skipNanInf) feclearexcept(FE_OVERFLOW);
|
||||
if (relaxedMode)
|
||||
{
|
||||
correctp = f->rfunc.f_fpf(0.0, &correct2p);
|
||||
@@ -471,17 +548,23 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
correctn = f->func.f_fpf(-0.0, &correct2n);
|
||||
}
|
||||
|
||||
// Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
|
||||
// Per section 10 paragraph 6, accept any result if
|
||||
// an input or output is a infinity or NaN or
|
||||
// overflow
|
||||
if (skipNanInf)
|
||||
{
|
||||
if( fetestexcept(FE_OVERFLOW) )
|
||||
continue;
|
||||
if (fetestexcept(FE_OVERFLOW)) continue;
|
||||
|
||||
// Note: no double rounding here. Reference functions calculate in single precision.
|
||||
if( IsFloatInfinity(correctp) || IsFloatNaN(correctp) ||
|
||||
IsFloatInfinity(correctn) || IsFloatNaN(correctn) ||
|
||||
IsFloatInfinity(correct2p) || IsFloatNaN(correct2p) ||
|
||||
IsFloatInfinity(correct2n) || IsFloatNaN(correct2n) )
|
||||
// Note: no double rounding here. Reference
|
||||
// functions calculate in single precision.
|
||||
if (IsFloatInfinity(correctp)
|
||||
|| IsFloatNaN(correctp)
|
||||
|| IsFloatInfinity(correctn)
|
||||
|| IsFloatNaN(correctn)
|
||||
|| IsFloatInfinity(correct2p)
|
||||
|| IsFloatNaN(correct2p)
|
||||
|| IsFloatInfinity(correct2n)
|
||||
|| IsFloatNaN(correct2n))
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -500,38 +583,48 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
err2n = Ulp_Error(test, correct2n);
|
||||
}
|
||||
|
||||
fail = fail && ((!(fabsf(errp) <= float_ulps)) && (!(fabsf(err2p) <= float_ulps)) &&
|
||||
((!(fabsf(errn) <= float_ulps)) && (!(fabsf(err2n) <= float_ulps))) );
|
||||
if( fabsf( errp ) < fabsf(err ) )
|
||||
err = errp;
|
||||
if( fabsf( errn ) < fabsf(err ) )
|
||||
err = errn;
|
||||
if( fabsf( err2p ) < fabsf(err2 ) )
|
||||
err2 = err2p;
|
||||
if( fabsf( err2n ) < fabsf(err2 ) )
|
||||
err2 = err2n;
|
||||
fail = fail
|
||||
&& ((!(fabsf(errp) <= float_ulps))
|
||||
&& (!(fabsf(err2p) <= float_ulps))
|
||||
&& ((!(fabsf(errn) <= float_ulps))
|
||||
&& (!(fabsf(err2n) <= float_ulps))));
|
||||
if (fabsf(errp) < fabsf(err)) err = errp;
|
||||
if (fabsf(errn) < fabsf(err)) err = errn;
|
||||
if (fabsf(err2p) < fabsf(err2)) err2 = err2p;
|
||||
if (fabsf(err2n) < fabsf(err2)) err2 = err2n;
|
||||
|
||||
// retry per section 6.5.3.4
|
||||
if( (*isFloatResultSubnormalPtr)( correctp, float_ulps ) || (*isFloatResultSubnormalPtr)( correctn, float_ulps ) )
|
||||
if ((*isFloatResultSubnormalPtr)(correctp,
|
||||
float_ulps)
|
||||
|| (*isFloatResultSubnormalPtr)(correctn,
|
||||
float_ulps))
|
||||
{
|
||||
if( (*isFloatResultSubnormalPtr)( correct2p, float_ulps ) || (*isFloatResultSubnormalPtr)( correct2n, float_ulps ) )
|
||||
if ((*isFloatResultSubnormalPtr)(correct2p,
|
||||
float_ulps)
|
||||
|| (*isFloatResultSubnormalPtr)(correct2n,
|
||||
float_ulps))
|
||||
{
|
||||
fail = fail && !( test == 0.0f && test2 == 0.0f);
|
||||
if( ! fail )
|
||||
err = err2 = 0.0f;
|
||||
fail = fail
|
||||
&& !(test == 0.0f && test2 == 0.0f);
|
||||
if (!fail) err = err2 = 0.0f;
|
||||
}
|
||||
else
|
||||
{
|
||||
fail = fail && ! (test == 0.0f && fabsf(err2) <= float_ulps);
|
||||
if( ! fail )
|
||||
err = 0.0f;
|
||||
fail = fail
|
||||
&& !(test == 0.0f
|
||||
&& fabsf(err2) <= float_ulps);
|
||||
if (!fail) err = 0.0f;
|
||||
}
|
||||
}
|
||||
else if( (*isFloatResultSubnormalPtr)( correct2p, float_ulps ) || (*isFloatResultSubnormalPtr)( correct2n, float_ulps ) )
|
||||
else if ((*isFloatResultSubnormalPtr)(correct2p,
|
||||
float_ulps)
|
||||
|| (*isFloatResultSubnormalPtr)(
|
||||
correct2n, float_ulps))
|
||||
{
|
||||
fail = fail && ! (test2 == 0.0f && (fabsf(err) <= float_ulps));
|
||||
if( ! fail )
|
||||
err2 = 0.0f;
|
||||
fail = fail
|
||||
&& !(test2 == 0.0f
|
||||
&& (fabsf(err) <= float_ulps));
|
||||
if (!fail) err2 = 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -547,7 +640,11 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
if (fail)
|
||||
{
|
||||
vlog_error( "\nERROR: %s%s: {%f, %f} ulp error at %a: *{%a, %a} vs. {%a, %a}\n", f->name, sizeNames[k], err, err2, ((float*) gIn)[j], ((float*) gOut_Ref)[j], ((float*) gOut_Ref2)[j], test, test2 );
|
||||
vlog_error("\nERROR: %s%s: {%f, %f} ulp error at %a: "
|
||||
"*{%a, %a} vs. {%a, %a}\n",
|
||||
f->name, sizeNames[k], err, err2,
|
||||
((float *)gIn)[j], ((float *)gOut_Ref)[j],
|
||||
((float *)gOut_Ref2)[j], test, test2);
|
||||
error = -1;
|
||||
goto exit;
|
||||
}
|
||||
@@ -555,15 +652,16 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
}
|
||||
|
||||
if (isFract && gIsInRTZMode)
|
||||
(void)set_round(oldRoundMode, kfloat);
|
||||
if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
|
||||
|
||||
if (0 == (i & 0x0fffffff))
|
||||
{
|
||||
if (gVerboseBruteForce)
|
||||
{
|
||||
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize);
|
||||
} else
|
||||
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step,
|
||||
bufferSize);
|
||||
}
|
||||
else
|
||||
{
|
||||
vlog(".");
|
||||
}
|
||||
@@ -585,7 +683,8 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
uint32_t *p = (uint32_t *)gIn;
|
||||
for (j = 0; j < bufferSize / sizeof(float); j++)
|
||||
p[j] = genrand_int32(d);
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
|
||||
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
|
||||
bufferSize, gIn, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
|
||||
return error;
|
||||
@@ -597,9 +696,24 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
size_t vectorSize = sizeValues[j] * sizeof(cl_float);
|
||||
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
|
||||
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j]) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
|
||||
&gOutBuffer[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
|
||||
&gOutBuffer2[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
|
||||
&gInBuffer)))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
double sum = 0.0;
|
||||
double bestTime = INFINITY;
|
||||
@@ -607,7 +721,9 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
|
||||
uint64_t startTime = GetTime();
|
||||
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
|
||||
if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
|
||||
&localCount, NULL, 0, NULL,
|
||||
NULL)))
|
||||
{
|
||||
vlog_error("FAILED -- could not execute kernel\n");
|
||||
goto exit;
|
||||
@@ -623,19 +739,21 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
uint64_t endTime = GetTime();
|
||||
double time = SubtractTime(endTime, startTime);
|
||||
sum += time;
|
||||
if( time < bestTime )
|
||||
bestTime = time;
|
||||
if (time < bestTime) bestTime = time;
|
||||
}
|
||||
|
||||
if( gReportAverageTimes )
|
||||
bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
|
||||
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
|
||||
if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double)gDeviceFrequency
|
||||
* gComputeDevices * gSimdSize * 1e6
|
||||
/ (bufferSize / sizeof(float));
|
||||
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
|
||||
f->name, sizeNames[j]);
|
||||
}
|
||||
}
|
||||
|
||||
if (!gSkipCorrectnessTesting)
|
||||
vlog( "\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0, maxErrorVal1 );
|
||||
vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0,
|
||||
maxErrorVal1);
|
||||
vlog("\n");
|
||||
|
||||
exit:
|
||||
@@ -680,8 +798,8 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
/*
|
||||
for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
|
||||
if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) )
|
||||
return error;
|
||||
if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels +
|
||||
i, programs + i) ) ) return error;
|
||||
*/
|
||||
|
||||
for (i = 0; i < (1ULL << 32); i += step)
|
||||
@@ -698,7 +816,8 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
for (j = 0; j < bufferSize / sizeof(cl_double); j++)
|
||||
p[j] = DoubleFromUInt32((uint32_t)i + j);
|
||||
}
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
|
||||
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
|
||||
bufferSize, gIn, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
|
||||
return error;
|
||||
@@ -709,16 +828,22 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
uint32_t pattern = 0xffffdead;
|
||||
memset_pattern4(gOut[j], &pattern, bufferSize);
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
|
||||
if ((error =
|
||||
clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
|
||||
bufferSize, gOut[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
|
||||
error, j);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
memset_pattern4(gOut2[j], &pattern, bufferSize);
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL)))
|
||||
if ((error =
|
||||
clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
|
||||
bufferSize, gOut2[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
|
||||
error, j);
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
@@ -728,11 +853,28 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
size_t vectorSize = sizeValues[j] * sizeof(cl_double);
|
||||
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
|
||||
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg(kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
|
||||
&gOutBuffer[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
|
||||
&gOutBuffer2[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
|
||||
&gInBuffer)))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
|
||||
if ((error =
|
||||
clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
|
||||
&localCount, NULL, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("FAILED -- could not execute kernel\n");
|
||||
goto exit;
|
||||
@@ -740,8 +882,7 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
|
||||
// Get that moving
|
||||
if( (error = clFlush(gQueue) ))
|
||||
vlog( "clFlush failed\n" );
|
||||
if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
|
||||
|
||||
// Calculate the correctly rounded reference result
|
||||
double *r = (double *)gOut_Ref;
|
||||
@@ -757,20 +898,23 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
// Read the data back
|
||||
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
|
||||
{
|
||||
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
|
||||
if ((error =
|
||||
clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
|
||||
bufferSize, gOut[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("ReadArray failed %d\n", error);
|
||||
goto exit;
|
||||
}
|
||||
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
|
||||
if ((error =
|
||||
clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
|
||||
bufferSize, gOut2[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("ReadArray2 failed %d\n", error);
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
if( gSkipCorrectnessTesting )
|
||||
break;
|
||||
if (gSkipCorrectnessTesting) break;
|
||||
|
||||
// Verify data
|
||||
uint64_t *t = (uint64_t *)gOut_Ref;
|
||||
@@ -791,13 +935,15 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
long double correct = f->dfunc.f_fpf(s[j], &correct2);
|
||||
float err = Bruteforce_Ulp_Error_Double(test, correct);
|
||||
float err2 = Bruteforce_Ulp_Error_Double(test2, correct2);
|
||||
int fail = ! (fabsf(err) <= f->double_ulps && fabsf(err2) <= f->double_ulps);
|
||||
int fail = !(fabsf(err) <= f->double_ulps
|
||||
&& fabsf(err2) <= f->double_ulps);
|
||||
if (ftz)
|
||||
{
|
||||
// retry per section 6.5.3.2
|
||||
if (IsDoubleResultSubnormal(correct, f->double_ulps))
|
||||
{
|
||||
if( IsDoubleResultSubnormal( correct2, f->double_ulps ) )
|
||||
if (IsDoubleResultSubnormal(correct2,
|
||||
f->double_ulps))
|
||||
{
|
||||
fail = fail && !(test == 0.0f && test2 == 0.0f);
|
||||
if (!fail)
|
||||
@@ -808,60 +954,80 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
else
|
||||
{
|
||||
fail = fail && ! ( test == 0.0f && fabsf(err2) <= f->double_ulps);
|
||||
if( ! fail )
|
||||
err = 0.0f;
|
||||
fail = fail
|
||||
&& !(test == 0.0f
|
||||
&& fabsf(err2) <= f->double_ulps);
|
||||
if (!fail) err = 0.0f;
|
||||
}
|
||||
}
|
||||
else if( IsDoubleResultSubnormal( correct2, f->double_ulps ) )
|
||||
else if (IsDoubleResultSubnormal(correct2,
|
||||
f->double_ulps))
|
||||
{
|
||||
fail = fail && ! ( test2 == 0.0f && fabsf(err) <= f->double_ulps);
|
||||
if( ! fail )
|
||||
err2 = 0.0f;
|
||||
fail = fail
|
||||
&& !(test2 == 0.0f
|
||||
&& fabsf(err) <= f->double_ulps);
|
||||
if (!fail) err2 = 0.0f;
|
||||
}
|
||||
|
||||
// retry per section 6.5.3.3
|
||||
if (IsDoubleSubnormal(s[j]))
|
||||
{
|
||||
long double correct2p, correct2n;
|
||||
long double correctp = f->dfunc.f_fpf( 0.0, &correct2p );
|
||||
long double correctn = f->dfunc.f_fpf( -0.0, &correct2n );
|
||||
float errp = Bruteforce_Ulp_Error_Double( test, correctp );
|
||||
float err2p = Bruteforce_Ulp_Error_Double( test, correct2p );
|
||||
float errn = Bruteforce_Ulp_Error_Double( test, correctn );
|
||||
float err2n = Bruteforce_Ulp_Error_Double( test, correct2n );
|
||||
fail = fail && ((!(fabsf(errp) <= f->double_ulps)) && (!(fabsf(err2p) <= f->double_ulps)) &&
|
||||
((!(fabsf(errn) <= f->double_ulps)) && (!(fabsf(err2n) <= f->double_ulps))) );
|
||||
if( fabsf( errp ) < fabsf(err ) )
|
||||
err = errp;
|
||||
if( fabsf( errn ) < fabsf(err ) )
|
||||
err = errn;
|
||||
if( fabsf( err2p ) < fabsf(err2 ) )
|
||||
err2 = err2p;
|
||||
if( fabsf( err2n ) < fabsf(err2 ) )
|
||||
err2 = err2n;
|
||||
long double correctp =
|
||||
f->dfunc.f_fpf(0.0, &correct2p);
|
||||
long double correctn =
|
||||
f->dfunc.f_fpf(-0.0, &correct2n);
|
||||
float errp =
|
||||
Bruteforce_Ulp_Error_Double(test, correctp);
|
||||
float err2p =
|
||||
Bruteforce_Ulp_Error_Double(test, correct2p);
|
||||
float errn =
|
||||
Bruteforce_Ulp_Error_Double(test, correctn);
|
||||
float err2n =
|
||||
Bruteforce_Ulp_Error_Double(test, correct2n);
|
||||
fail = fail
|
||||
&& ((!(fabsf(errp) <= f->double_ulps))
|
||||
&& (!(fabsf(err2p) <= f->double_ulps))
|
||||
&& ((!(fabsf(errn) <= f->double_ulps))
|
||||
&& (!(fabsf(err2n)
|
||||
<= f->double_ulps))));
|
||||
if (fabsf(errp) < fabsf(err)) err = errp;
|
||||
if (fabsf(errn) < fabsf(err)) err = errn;
|
||||
if (fabsf(err2p) < fabsf(err2)) err2 = err2p;
|
||||
if (fabsf(err2n) < fabsf(err2)) err2 = err2n;
|
||||
|
||||
// retry per section 6.5.3.4
|
||||
if( IsDoubleResultSubnormal( correctp, f->double_ulps ) || IsDoubleResultSubnormal( correctn, f->double_ulps ) )
|
||||
if (IsDoubleResultSubnormal(correctp,
|
||||
f->double_ulps)
|
||||
|| IsDoubleResultSubnormal(correctn,
|
||||
f->double_ulps))
|
||||
{
|
||||
if( IsDoubleResultSubnormal( correct2p, f->double_ulps ) || IsDoubleResultSubnormal( correct2n, f->double_ulps ) )
|
||||
if (IsDoubleResultSubnormal(correct2p,
|
||||
f->double_ulps)
|
||||
|| IsDoubleResultSubnormal(correct2n,
|
||||
f->double_ulps))
|
||||
{
|
||||
fail = fail && !( test == 0.0f && test2 == 0.0f);
|
||||
if( ! fail )
|
||||
err = err2 = 0.0f;
|
||||
fail = fail
|
||||
&& !(test == 0.0f && test2 == 0.0f);
|
||||
if (!fail) err = err2 = 0.0f;
|
||||
}
|
||||
else
|
||||
{
|
||||
fail = fail && ! (test == 0.0f && fabsf(err2) <= f->double_ulps);
|
||||
if( ! fail )
|
||||
err = 0.0f;
|
||||
fail = fail
|
||||
&& !(test == 0.0f
|
||||
&& fabsf(err2) <= f->double_ulps);
|
||||
if (!fail) err = 0.0f;
|
||||
}
|
||||
}
|
||||
else if( IsDoubleResultSubnormal( correct2p, f->double_ulps ) || IsDoubleResultSubnormal( correct2n, f->double_ulps ) )
|
||||
else if (IsDoubleResultSubnormal(correct2p,
|
||||
f->double_ulps)
|
||||
|| IsDoubleResultSubnormal(correct2n,
|
||||
f->double_ulps))
|
||||
{
|
||||
fail = fail && ! (test2 == 0.0f && (fabsf(err) <= f->double_ulps));
|
||||
if( ! fail )
|
||||
err2 = 0.0f;
|
||||
fail = fail
|
||||
&& !(test2 == 0.0f
|
||||
&& (fabsf(err) <= f->double_ulps));
|
||||
if (!fail) err2 = 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -877,7 +1043,12 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
if (fail)
|
||||
{
|
||||
vlog_error( "\nERROR: %sD%s: {%f, %f} ulp error at %.13la: *{%.13la, %.13la} vs. {%.13la, %.13la}\n", f->name, sizeNames[k], err, err2, ((double*) gIn)[j], ((double*) gOut_Ref)[j], ((double*) gOut_Ref2)[j], test, test2 );
|
||||
vlog_error(
|
||||
"\nERROR: %sD%s: {%f, %f} ulp error at %.13la: "
|
||||
"*{%.13la, %.13la} vs. {%.13la, %.13la}\n",
|
||||
f->name, sizeNames[k], err, err2,
|
||||
((double *)gIn)[j], ((double *)gOut_Ref)[j],
|
||||
((double *)gOut_Ref2)[j], test, test2);
|
||||
error = -1;
|
||||
goto exit;
|
||||
}
|
||||
@@ -889,8 +1060,10 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
if (gVerboseBruteForce)
|
||||
{
|
||||
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize);
|
||||
} else
|
||||
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step,
|
||||
bufferSize);
|
||||
}
|
||||
else
|
||||
{
|
||||
vlog(".");
|
||||
}
|
||||
@@ -912,7 +1085,8 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
double *p = (double *)gIn;
|
||||
for (j = 0; j < bufferSize / sizeof(double); j++)
|
||||
p[j] = DoubleFromUInt32(genrand_int32(d));
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
|
||||
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
|
||||
bufferSize, gIn, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
|
||||
return error;
|
||||
@@ -924,9 +1098,24 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
size_t vectorSize = sizeValues[j] * sizeof(cl_double);
|
||||
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
|
||||
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j]) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
|
||||
&gOutBuffer[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
|
||||
&gOutBuffer2[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
|
||||
&gInBuffer)))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
double sum = 0.0;
|
||||
double bestTime = INFINITY;
|
||||
@@ -934,7 +1123,9 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
|
||||
uint64_t startTime = GetTime();
|
||||
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
|
||||
if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
|
||||
&localCount, NULL, 0, NULL,
|
||||
NULL)))
|
||||
{
|
||||
vlog_error("FAILED -- could not execute kernel\n");
|
||||
goto exit;
|
||||
@@ -950,21 +1141,22 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
uint64_t endTime = GetTime();
|
||||
double time = SubtractTime(endTime, startTime);
|
||||
sum += time;
|
||||
if( time < bestTime )
|
||||
bestTime = time;
|
||||
if (time < bestTime) bestTime = time;
|
||||
}
|
||||
|
||||
if( gReportAverageTimes )
|
||||
bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
|
||||
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
|
||||
if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double)gDeviceFrequency
|
||||
* gComputeDevices * gSimdSize * 1e6
|
||||
/ (bufferSize / sizeof(double));
|
||||
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
|
||||
f->name, sizeNames[j]);
|
||||
}
|
||||
for( ; j < gMaxVectorSizeIndex; j++ )
|
||||
vlog( "\t -- " );
|
||||
for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- ");
|
||||
}
|
||||
|
||||
if (!gSkipCorrectnessTesting)
|
||||
vlog( "\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0, maxErrorVal1 );
|
||||
vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0,
|
||||
maxErrorVal1);
|
||||
vlog("\n");
|
||||
|
||||
exit:
|
||||
@@ -977,6 +1169,3 @@ exit:
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -34,26 +34,42 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
|
||||
static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
|
||||
cl_program *p, bool relaxedMode)
|
||||
{
|
||||
const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global int", sizeNames[vectorSize], "* out2, __global float", sizeNames[vectorSize], "* in)\n"
|
||||
const char *c[] = { "__kernel void math_kernel",
|
||||
sizeNames[vectorSize],
|
||||
"( __global float",
|
||||
sizeNames[vectorSize],
|
||||
"* out, __global int",
|
||||
sizeNames[vectorSize],
|
||||
"* out2, __global float",
|
||||
sizeNames[vectorSize],
|
||||
"* in)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" out[i] = ", name, "( in[i], out2 + i );\n"
|
||||
"}\n"
|
||||
};
|
||||
const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global int* out2, __global float* in)\n"
|
||||
" out[i] = ",
|
||||
name,
|
||||
"( in[i], out2 + i );\n"
|
||||
"}\n" };
|
||||
const char *c3[] = {
|
||||
"__kernel void math_kernel",
|
||||
sizeNames[vectorSize],
|
||||
"( __global float* out, __global int* out2, __global float* in)\n"
|
||||
"{\n"
|
||||
" size_t i = get_global_id(0);\n"
|
||||
" if( i + 1 < get_global_size(0) )\n"
|
||||
" {\n"
|
||||
" float3 f0 = vload3( 0, in + 3 * i );\n"
|
||||
" int3 iout = INT_MIN;\n"
|
||||
" f0 = ", name, "( f0, &iout );\n"
|
||||
" f0 = ",
|
||||
name,
|
||||
"( f0, &iout );\n"
|
||||
" vstore3( f0, 0, out + 3*i );\n"
|
||||
" vstore3( iout, 0, out2 + 3*i );\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
|
||||
" size_t parity = i & 1; // Figure out how many elements are "
|
||||
"left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
|
||||
"buffer size \n"
|
||||
" int3 iout = INT_MIN;\n"
|
||||
" float3 f0;\n"
|
||||
" switch( parity )\n"
|
||||
@@ -65,7 +81,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
|
||||
" f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" f0 = ", name, "( f0, &iout );\n"
|
||||
" f0 = ",
|
||||
name,
|
||||
"( f0, &iout );\n"
|
||||
" switch( parity )\n"
|
||||
" {\n"
|
||||
" case 0:\n"
|
||||
@@ -90,7 +108,8 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
|
||||
}
|
||||
|
||||
char testName[32];
|
||||
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
|
||||
snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
|
||||
sizeNames[vectorSize]);
|
||||
|
||||
return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
|
||||
}
|
||||
@@ -99,27 +118,43 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
|
||||
cl_program *p, bool relaxedMode)
|
||||
{
|
||||
const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
|
||||
"__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global int", sizeNames[vectorSize], "* out2, __global double", sizeNames[vectorSize], "* in)\n"
|
||||
"__kernel void math_kernel",
|
||||
sizeNames[vectorSize],
|
||||
"( __global double",
|
||||
sizeNames[vectorSize],
|
||||
"* out, __global int",
|
||||
sizeNames[vectorSize],
|
||||
"* out2, __global double",
|
||||
sizeNames[vectorSize],
|
||||
"* in)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" out[i] = ", name, "( in[i], out2 + i );\n"
|
||||
"}\n"
|
||||
};
|
||||
const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
|
||||
"__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global int* out2, __global double* in)\n"
|
||||
" out[i] = ",
|
||||
name,
|
||||
"( in[i], out2 + i );\n"
|
||||
"}\n" };
|
||||
const char *c3[] = {
|
||||
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
|
||||
"__kernel void math_kernel",
|
||||
sizeNames[vectorSize],
|
||||
"( __global double* out, __global int* out2, __global double* in)\n"
|
||||
"{\n"
|
||||
" size_t i = get_global_id(0);\n"
|
||||
" if( i + 1 < get_global_size(0) )\n"
|
||||
" {\n"
|
||||
" double3 f0 = vload3( 0, in + 3 * i );\n"
|
||||
" int3 iout = INT_MIN;\n"
|
||||
" f0 = ", name, "( f0, &iout );\n"
|
||||
" f0 = ",
|
||||
name,
|
||||
"( f0, &iout );\n"
|
||||
" vstore3( f0, 0, out + 3*i );\n"
|
||||
" vstore3( iout, 0, out2 + 3*i );\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
|
||||
" size_t parity = i & 1; // Figure out how many elements are "
|
||||
"left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
|
||||
"buffer size \n"
|
||||
" int3 iout = INT_MIN;\n"
|
||||
" double3 f0;\n"
|
||||
" switch( parity )\n"
|
||||
@@ -131,7 +166,9 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
|
||||
" f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" f0 = ", name, "( f0, &iout );\n"
|
||||
" f0 = ",
|
||||
name,
|
||||
"( f0, &iout );\n"
|
||||
" switch( parity )\n"
|
||||
" {\n"
|
||||
" case 0:\n"
|
||||
@@ -156,7 +193,8 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
|
||||
}
|
||||
|
||||
char testName[32];
|
||||
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
|
||||
snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
|
||||
sizeNames[vectorSize]);
|
||||
|
||||
return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
|
||||
}
|
||||
@@ -170,8 +208,10 @@ typedef struct BuildKernelInfo
|
||||
bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
|
||||
} BuildKernelInfo;
|
||||
|
||||
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
|
||||
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
|
||||
static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
|
||||
void *p);
|
||||
static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
|
||||
void *p)
|
||||
{
|
||||
BuildKernelInfo *info = (BuildKernelInfo *)p;
|
||||
cl_uint i = info->offset + job_id;
|
||||
@@ -179,8 +219,10 @@ static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, voi
|
||||
info->programs + i, info->relaxedMode);
|
||||
}
|
||||
|
||||
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
|
||||
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
|
||||
static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
|
||||
void *p);
|
||||
static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
|
||||
void *p)
|
||||
{
|
||||
BuildKernelInfo *info = (BuildKernelInfo *)p;
|
||||
cl_uint i = info->offset + job_id;
|
||||
@@ -225,12 +267,14 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
// Init the kernels
|
||||
BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
|
||||
f->nameInCode, relaxedMode };
|
||||
if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
|
||||
if ((error = ThreadPool_Do(BuildKernel_FloatFn,
|
||||
gMaxVectorSizeIndex - gMinVectorSizeIndex,
|
||||
&build_info)))
|
||||
return error;
|
||||
/*
|
||||
for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
|
||||
if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) )
|
||||
return error;
|
||||
if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i,
|
||||
programs + i) ) ) return error;
|
||||
*/
|
||||
|
||||
for (i = 0; i < (1ULL << 32); i += step)
|
||||
@@ -247,7 +291,8 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
for (j = 0; j < bufferSize / sizeof(float); j++)
|
||||
p[j] = (uint32_t)i + j;
|
||||
}
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
|
||||
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
|
||||
bufferSize, gIn, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
|
||||
return error;
|
||||
@@ -258,16 +303,22 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
uint32_t pattern = 0xffffdead;
|
||||
memset_pattern4(gOut[j], &pattern, bufferSize);
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
|
||||
if ((error =
|
||||
clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
|
||||
bufferSize, gOut[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
|
||||
error, j);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
memset_pattern4(gOut2[j], &pattern, bufferSize);
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL) ))
|
||||
if ((error =
|
||||
clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
|
||||
bufferSize, gOut2[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
|
||||
error, j);
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
@@ -277,11 +328,28 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
size_t vectorSize = sizeValues[j] * sizeof(cl_float);
|
||||
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
|
||||
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
|
||||
&gOutBuffer[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
|
||||
&gOutBuffer2[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
|
||||
&gInBuffer)))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
|
||||
if ((error =
|
||||
clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
|
||||
&localCount, NULL, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("FAILED -- could not execute kernel\n");
|
||||
goto exit;
|
||||
@@ -289,8 +357,7 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
|
||||
// Get that moving
|
||||
if( (error = clFlush(gQueue) ))
|
||||
vlog( "clFlush failed\n" );
|
||||
if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
|
||||
|
||||
// Calculate the correctly rounded reference result
|
||||
float *r = (float *)gOut_Ref;
|
||||
@@ -302,20 +369,23 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
// Read the data back
|
||||
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
|
||||
{
|
||||
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
|
||||
if ((error =
|
||||
clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
|
||||
bufferSize, gOut[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("ReadArray failed %d\n", error);
|
||||
goto exit;
|
||||
}
|
||||
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
|
||||
if ((error =
|
||||
clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
|
||||
bufferSize, gOut2[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("ReadArray2 failed %d\n", error);
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
if( gSkipCorrectnessTesting )
|
||||
break;
|
||||
if (gSkipCorrectnessTesting) break;
|
||||
|
||||
// Verify data
|
||||
uint32_t *t = (uint32_t *)gOut_Ref;
|
||||
@@ -335,15 +405,15 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
double correct = f->func.f_fpI(s[j], &correct2);
|
||||
float err = Ulp_Error(test, correct);
|
||||
cl_long iErr = (int64_t)q2[j] - (int64_t)correct2;
|
||||
int fail = ! (fabsf(err) <= float_ulps && abs_cl_long( iErr ) <= maxiError );
|
||||
int fail = !(fabsf(err) <= float_ulps
|
||||
&& abs_cl_long(iErr) <= maxiError);
|
||||
if (ftz)
|
||||
{
|
||||
// retry per section 6.5.3.2
|
||||
if (IsFloatResultSubnormal(correct, float_ulps))
|
||||
{
|
||||
fail = fail && !(test == 0.0f && iErr == 0);
|
||||
if( ! fail )
|
||||
err = 0.0f;
|
||||
if (!fail) err = 0.0f;
|
||||
}
|
||||
|
||||
// retry per section 6.5.3.3
|
||||
@@ -354,18 +424,22 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
double correct4 = f->func.f_fpI(-0.0, &correct6);
|
||||
float err2 = Ulp_Error(test, correct3);
|
||||
float err3 = Ulp_Error(test, correct4);
|
||||
cl_long iErr2 = (long long) q2[j] - (long long) correct5;
|
||||
cl_long iErr3 = (long long) q2[j] - (long long) correct6;
|
||||
cl_long iErr2 =
|
||||
(long long)q2[j] - (long long)correct5;
|
||||
cl_long iErr3 =
|
||||
(long long)q2[j] - (long long)correct6;
|
||||
|
||||
// Did +0 work?
|
||||
if( fabsf(err2) <= float_ulps && abs_cl_long( iErr2 ) <= maxiError )
|
||||
if (fabsf(err2) <= float_ulps
|
||||
&& abs_cl_long(iErr2) <= maxiError)
|
||||
{
|
||||
err = err2;
|
||||
iErr = iErr2;
|
||||
fail = 0;
|
||||
}
|
||||
// Did -0 work?
|
||||
else if(fabsf(err3) <= float_ulps && abs_cl_long( iErr3 ) <= maxiError)
|
||||
else if (fabsf(err3) <= float_ulps
|
||||
&& abs_cl_long(iErr3) <= maxiError)
|
||||
{
|
||||
err = err3;
|
||||
iErr = iErr3;
|
||||
@@ -373,9 +447,16 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
|
||||
// retry per section 6.5.3.4
|
||||
if( fail && (IsFloatResultSubnormal(correct2, float_ulps ) || IsFloatResultSubnormal(correct3, float_ulps )) )
|
||||
if (fail
|
||||
&& (IsFloatResultSubnormal(correct2, float_ulps)
|
||||
|| IsFloatResultSubnormal(correct3,
|
||||
float_ulps)))
|
||||
{
|
||||
fail = fail && ! ( test == 0.0f && (abs_cl_long( iErr2 ) <= maxiError || abs_cl_long( iErr3 ) <= maxiError) );
|
||||
fail = fail
|
||||
&& !(test == 0.0f
|
||||
&& (abs_cl_long(iErr2) <= maxiError
|
||||
|| abs_cl_long(iErr3)
|
||||
<= maxiError));
|
||||
if (!fail)
|
||||
{
|
||||
err = 0.0f;
|
||||
@@ -397,7 +478,11 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
|
||||
if (fail)
|
||||
{
|
||||
vlog_error( "\nERROR: %s%s: {%f, %d} ulp error at %a: *{%a, %d} vs. {%a, %d}\n", f->name, sizeNames[k], err, (int) iErr, ((float*) gIn)[j], ((float*) gOut_Ref)[j], ((int*) gOut_Ref2)[j], test, q2[j] );
|
||||
vlog_error("\nERROR: %s%s: {%f, %d} ulp error at %a: "
|
||||
"*{%a, %d} vs. {%a, %d}\n",
|
||||
f->name, sizeNames[k], err, (int)iErr,
|
||||
((float *)gIn)[j], ((float *)gOut_Ref)[j],
|
||||
((int *)gOut_Ref2)[j], test, q2[j]);
|
||||
error = -1;
|
||||
goto exit;
|
||||
}
|
||||
@@ -409,8 +494,10 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
if (gVerboseBruteForce)
|
||||
{
|
||||
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize);
|
||||
} else
|
||||
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step,
|
||||
bufferSize);
|
||||
}
|
||||
else
|
||||
{
|
||||
vlog(".");
|
||||
}
|
||||
@@ -432,7 +519,8 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
uint32_t *p = (uint32_t *)gIn;
|
||||
for (j = 0; j < bufferSize / sizeof(float); j++)
|
||||
p[j] = genrand_int32(d);
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
|
||||
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
|
||||
bufferSize, gIn, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
|
||||
return error;
|
||||
@@ -444,16 +532,33 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
size_t vectorSize = sizeValues[j] * sizeof(cl_float);
|
||||
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
|
||||
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
|
||||
&gOutBuffer[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
|
||||
&gOutBuffer2[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
|
||||
&gInBuffer)))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
double sum = 0.0;
|
||||
double bestTime = INFINITY;
|
||||
for (k = 0; k < PERF_LOOP_COUNT; k++)
|
||||
{
|
||||
uint64_t startTime = GetTime();
|
||||
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
|
||||
if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
|
||||
&localCount, NULL, 0, NULL,
|
||||
NULL)))
|
||||
{
|
||||
vlog_error("FAILED -- could not execute kernel\n");
|
||||
goto exit;
|
||||
@@ -469,14 +574,15 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
|
||||
uint64_t endTime = GetTime();
|
||||
double time = SubtractTime(endTime, startTime);
|
||||
sum += time;
|
||||
if( time < bestTime )
|
||||
bestTime = time;
|
||||
if (time < bestTime) bestTime = time;
|
||||
}
|
||||
|
||||
if( gReportAverageTimes )
|
||||
bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
|
||||
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
|
||||
if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double)gDeviceFrequency
|
||||
* gComputeDevices * gSimdSize * 1e6
|
||||
/ (bufferSize / sizeof(float));
|
||||
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
|
||||
f->name, sizeNames[j]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -528,8 +634,8 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
/*
|
||||
for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
|
||||
if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) )
|
||||
return error;
|
||||
if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels +
|
||||
i, programs + i) ) ) return error;
|
||||
*/
|
||||
|
||||
for (i = 0; i < (1ULL << 32); i += step)
|
||||
@@ -546,7 +652,8 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
for (j = 0; j < bufferSize / sizeof(double); j++)
|
||||
p[j] = DoubleFromUInt32((uint32_t)i + j);
|
||||
}
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
|
||||
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
|
||||
bufferSize, gIn, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
|
||||
return error;
|
||||
@@ -557,16 +664,22 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
uint32_t pattern = 0xffffdead;
|
||||
memset_pattern4(gOut[j], &pattern, bufferSize);
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
|
||||
if ((error =
|
||||
clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
|
||||
bufferSize, gOut[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
|
||||
error, j);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
memset_pattern4(gOut2[j], &pattern, bufferSize);
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL) ))
|
||||
if ((error =
|
||||
clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
|
||||
bufferSize, gOut2[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
|
||||
error, j);
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
@@ -576,11 +689,28 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
size_t vectorSize = sizeValues[j] * sizeof(cl_double);
|
||||
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
|
||||
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
|
||||
&gOutBuffer[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
|
||||
&gOutBuffer2[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
|
||||
&gInBuffer)))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
|
||||
if ((error =
|
||||
clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
|
||||
&localCount, NULL, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("FAILED -- could not execute kernel\n");
|
||||
goto exit;
|
||||
@@ -588,8 +718,7 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
|
||||
// Get that moving
|
||||
if( (error = clFlush(gQueue) ))
|
||||
vlog( "clFlush failed\n" );
|
||||
if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
|
||||
|
||||
// Calculate the correctly rounded reference result
|
||||
double *r = (double *)gOut_Ref;
|
||||
@@ -601,20 +730,23 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
// Read the data back
|
||||
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
|
||||
{
|
||||
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
|
||||
if ((error =
|
||||
clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
|
||||
bufferSize, gOut[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("ReadArray failed %d\n", error);
|
||||
goto exit;
|
||||
}
|
||||
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
|
||||
if ((error =
|
||||
clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
|
||||
bufferSize, gOut2[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("ReadArray2 failed %d\n", error);
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
if( gSkipCorrectnessTesting )
|
||||
break;
|
||||
if (gSkipCorrectnessTesting) break;
|
||||
|
||||
// Verify data
|
||||
uint64_t *t = (uint64_t *)gOut_Ref;
|
||||
@@ -634,37 +766,45 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
long double correct = f->dfunc.f_fpI(s[j], &correct2);
|
||||
float err = Bruteforce_Ulp_Error_Double(test, correct);
|
||||
cl_long iErr = (long long)q2[j] - (long long)correct2;
|
||||
int fail = ! (fabsf(err) <= f->double_ulps && abs_cl_long( iErr ) <= maxiError );
|
||||
int fail = !(fabsf(err) <= f->double_ulps
|
||||
&& abs_cl_long(iErr) <= maxiError);
|
||||
if (ftz)
|
||||
{
|
||||
// retry per section 6.5.3.2
|
||||
if (IsDoubleResultSubnormal(correct, f->double_ulps))
|
||||
{
|
||||
fail = fail && !(test == 0.0f && iErr == 0);
|
||||
if( ! fail )
|
||||
err = 0.0f;
|
||||
if (!fail) err = 0.0f;
|
||||
}
|
||||
|
||||
// retry per section 6.5.3.3
|
||||
if (IsDoubleSubnormal(s[j]))
|
||||
{
|
||||
int correct5, correct6;
|
||||
long double correct3 = f->dfunc.f_fpI( 0.0, &correct5 );
|
||||
long double correct4 = f->dfunc.f_fpI( -0.0, &correct6 );
|
||||
float err2 = Bruteforce_Ulp_Error_Double( test, correct3 );
|
||||
float err3 = Bruteforce_Ulp_Error_Double( test, correct4 );
|
||||
cl_long iErr2 = (long long) q2[j] - (long long) correct5;
|
||||
cl_long iErr3 = (long long) q2[j] - (long long) correct6;
|
||||
long double correct3 =
|
||||
f->dfunc.f_fpI(0.0, &correct5);
|
||||
long double correct4 =
|
||||
f->dfunc.f_fpI(-0.0, &correct6);
|
||||
float err2 =
|
||||
Bruteforce_Ulp_Error_Double(test, correct3);
|
||||
float err3 =
|
||||
Bruteforce_Ulp_Error_Double(test, correct4);
|
||||
cl_long iErr2 =
|
||||
(long long)q2[j] - (long long)correct5;
|
||||
cl_long iErr3 =
|
||||
(long long)q2[j] - (long long)correct6;
|
||||
|
||||
// Did +0 work?
|
||||
if( fabsf(err2) <= f->double_ulps && abs_cl_long( iErr2 ) <= maxiError )
|
||||
if (fabsf(err2) <= f->double_ulps
|
||||
&& abs_cl_long(iErr2) <= maxiError)
|
||||
{
|
||||
err = err2;
|
||||
iErr = iErr2;
|
||||
fail = 0;
|
||||
}
|
||||
// Did -0 work?
|
||||
else if(fabsf(err3) <= f->double_ulps && abs_cl_long( iErr3 ) <= maxiError)
|
||||
else if (fabsf(err3) <= f->double_ulps
|
||||
&& abs_cl_long(iErr3) <= maxiError)
|
||||
{
|
||||
err = err3;
|
||||
iErr = iErr3;
|
||||
@@ -672,9 +812,17 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
|
||||
// retry per section 6.5.3.4
|
||||
if( fail && (IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )) )
|
||||
if (fail
|
||||
&& (IsDoubleResultSubnormal(correct2,
|
||||
f->double_ulps)
|
||||
|| IsDoubleResultSubnormal(correct3,
|
||||
f->double_ulps)))
|
||||
{
|
||||
fail = fail && ! ( test == 0.0f && (abs_cl_long( iErr2 ) <= maxiError || abs_cl_long( iErr3 ) <= maxiError) );
|
||||
fail = fail
|
||||
&& !(test == 0.0f
|
||||
&& (abs_cl_long(iErr2) <= maxiError
|
||||
|| abs_cl_long(iErr3)
|
||||
<= maxiError));
|
||||
if (!fail)
|
||||
{
|
||||
err = 0.0f;
|
||||
@@ -696,7 +844,11 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
|
||||
if (fail)
|
||||
{
|
||||
vlog_error( "\nERROR: %sD%s: {%f, %d} ulp error at %.13la: *{%.13la, %d} vs. {%.13la, %d}\n", f->name, sizeNames[k], err, (int) iErr, ((double*) gIn)[j], ((double*) gOut_Ref)[j], ((int*) gOut_Ref2)[j], test, q2[j] );
|
||||
vlog_error("\nERROR: %sD%s: {%f, %d} ulp error at "
|
||||
"%.13la: *{%.13la, %d} vs. {%.13la, %d}\n",
|
||||
f->name, sizeNames[k], err, (int)iErr,
|
||||
((double *)gIn)[j], ((double *)gOut_Ref)[j],
|
||||
((int *)gOut_Ref2)[j], test, q2[j]);
|
||||
error = -1;
|
||||
goto exit;
|
||||
}
|
||||
@@ -708,8 +860,10 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
if (gVerboseBruteForce)
|
||||
{
|
||||
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize);
|
||||
} else
|
||||
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step,
|
||||
bufferSize);
|
||||
}
|
||||
else
|
||||
{
|
||||
vlog(".");
|
||||
}
|
||||
@@ -732,7 +886,8 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
|
||||
for (j = 0; j < bufferSize / sizeof(double); j++)
|
||||
p[j] = DoubleFromUInt32(genrand_int32(d));
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
|
||||
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
|
||||
bufferSize, gIn, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
|
||||
return error;
|
||||
@@ -744,16 +899,33 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
size_t vectorSize = sizeValues[j] * sizeof(cl_double);
|
||||
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
|
||||
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
|
||||
&gOutBuffer[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
|
||||
&gOutBuffer2[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
|
||||
&gInBuffer)))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
double sum = 0.0;
|
||||
double bestTime = INFINITY;
|
||||
for (k = 0; k < PERF_LOOP_COUNT; k++)
|
||||
{
|
||||
uint64_t startTime = GetTime();
|
||||
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
|
||||
if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
|
||||
&localCount, NULL, 0, NULL,
|
||||
NULL)))
|
||||
{
|
||||
vlog_error("FAILED -- could not execute kernel\n");
|
||||
goto exit;
|
||||
@@ -769,17 +941,17 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
|
||||
uint64_t endTime = GetTime();
|
||||
double time = SubtractTime(endTime, startTime);
|
||||
sum += time;
|
||||
if( time < bestTime )
|
||||
bestTime = time;
|
||||
if (time < bestTime) bestTime = time;
|
||||
}
|
||||
|
||||
if( gReportAverageTimes )
|
||||
bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
|
||||
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sd%s", f->name, sizeNames[j] );
|
||||
if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double)gDeviceFrequency
|
||||
* gComputeDevices * gSimdSize * 1e6
|
||||
/ (bufferSize / sizeof(double));
|
||||
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sd%s",
|
||||
f->name, sizeNames[j]);
|
||||
}
|
||||
for( ; j < gMaxVectorSizeIndex; j++ )
|
||||
vlog( "\t -- " );
|
||||
for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- ");
|
||||
}
|
||||
|
||||
if (!gSkipCorrectnessTesting)
|
||||
@@ -796,6 +968,3 @@ exit:
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -33,25 +33,38 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
|
||||
static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
|
||||
cl_program *p, bool relaxedMode)
|
||||
{
|
||||
const char *c[] = {
|
||||
"__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global uint", sizeNames[vectorSize], "* in)\n"
|
||||
const char *c[] = { "__kernel void math_kernel",
|
||||
sizeNames[vectorSize],
|
||||
"( __global float",
|
||||
sizeNames[vectorSize],
|
||||
"* out, __global uint",
|
||||
sizeNames[vectorSize],
|
||||
"* in)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" out[i] = ", name, "( in[i] );\n"
|
||||
"}\n"
|
||||
};
|
||||
const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global uint* in)\n"
|
||||
" out[i] = ",
|
||||
name,
|
||||
"( in[i] );\n"
|
||||
"}\n" };
|
||||
const char *c3[] = {
|
||||
"__kernel void math_kernel",
|
||||
sizeNames[vectorSize],
|
||||
"( __global float* out, __global uint* in)\n"
|
||||
"{\n"
|
||||
" size_t i = get_global_id(0);\n"
|
||||
" if( i + 1 < get_global_size(0) )\n"
|
||||
" {\n"
|
||||
" uint3 u0 = vload3( 0, in + 3 * i );\n"
|
||||
" float3 f0 = ", name, "( u0 );\n"
|
||||
" float3 f0 = ",
|
||||
name,
|
||||
"( u0 );\n"
|
||||
" vstore3( f0, 0, out + 3*i );\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
|
||||
" size_t parity = i & 1; // Figure out how many elements are "
|
||||
"left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
|
||||
"buffer size \n"
|
||||
" uint3 u0;\n"
|
||||
" float3 f0;\n"
|
||||
" switch( parity )\n"
|
||||
@@ -63,7 +76,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
|
||||
" u0 = (uint3)( in[3*i], in[3*i+1], 0xdead ); \n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" f0 = ", name, "( u0 );\n"
|
||||
" f0 = ",
|
||||
name,
|
||||
"( u0 );\n"
|
||||
" switch( parity )\n"
|
||||
" {\n"
|
||||
" case 0:\n"
|
||||
@@ -87,7 +102,8 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
|
||||
}
|
||||
|
||||
char testName[32];
|
||||
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
|
||||
snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
|
||||
sizeNames[vectorSize]);
|
||||
|
||||
return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
|
||||
}
|
||||
@@ -95,39 +111,55 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
|
||||
static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
|
||||
cl_program *p, bool relaxedMode)
|
||||
{
|
||||
const char *c[] = {
|
||||
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
|
||||
"__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global ulong", sizeNames[vectorSize], "* in)\n"
|
||||
const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
|
||||
"__kernel void math_kernel",
|
||||
sizeNames[vectorSize],
|
||||
"( __global double",
|
||||
sizeNames[vectorSize],
|
||||
"* out, __global ulong",
|
||||
sizeNames[vectorSize],
|
||||
"* in)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" out[i] = ", name, "( in[i] );\n"
|
||||
"}\n"
|
||||
};
|
||||
" out[i] = ",
|
||||
name,
|
||||
"( in[i] );\n"
|
||||
"}\n" };
|
||||
|
||||
const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
|
||||
"__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global ulong* in)\n"
|
||||
"__kernel void math_kernel",
|
||||
sizeNames[vectorSize],
|
||||
"( __global double* out, __global ulong* in)\n"
|
||||
"{\n"
|
||||
" size_t i = get_global_id(0);\n"
|
||||
" if( i + 1 < get_global_size(0) )\n"
|
||||
" {\n"
|
||||
" ulong3 u0 = vload3( 0, in + 3 * i );\n"
|
||||
" double3 f0 = ", name, "( u0 );\n"
|
||||
" double3 f0 = ",
|
||||
name,
|
||||
"( u0 );\n"
|
||||
" vstore3( f0, 0, out + 3*i );\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
|
||||
" size_t parity = i & 1; // Figure out how "
|
||||
"many elements are left over after BUFFER_SIZE % "
|
||||
"(3*sizeof(float)). Assume power of two buffer size \n"
|
||||
" ulong3 u0;\n"
|
||||
" switch( parity )\n"
|
||||
" {\n"
|
||||
" case 1:\n"
|
||||
" u0 = (ulong3)( in[3*i], 0xdeaddeaddeaddeadUL, 0xdeaddeaddeaddeadUL ); \n"
|
||||
" u0 = (ulong3)( in[3*i], "
|
||||
"0xdeaddeaddeaddeadUL, 0xdeaddeaddeaddeadUL ); \n"
|
||||
" break;\n"
|
||||
" case 0:\n"
|
||||
" u0 = (ulong3)( in[3*i], in[3*i+1], 0xdeaddeaddeaddeadUL ); \n"
|
||||
" u0 = (ulong3)( in[3*i], in[3*i+1], "
|
||||
"0xdeaddeaddeaddeadUL ); \n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" double3 f0 = ", name, "( u0 );\n"
|
||||
" double3 f0 = ",
|
||||
name,
|
||||
"( u0 );\n"
|
||||
" switch( parity )\n"
|
||||
" {\n"
|
||||
" case 0:\n"
|
||||
@@ -138,8 +170,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
};
|
||||
"}\n" };
|
||||
|
||||
const char **kern = c;
|
||||
size_t kernSize = sizeof(c) / sizeof(c[0]);
|
||||
@@ -152,7 +183,8 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
|
||||
|
||||
|
||||
char testName[32];
|
||||
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
|
||||
snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
|
||||
sizeNames[vectorSize]);
|
||||
|
||||
return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
|
||||
}
|
||||
@@ -166,8 +198,10 @@ typedef struct BuildKernelInfo
|
||||
bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
|
||||
} BuildKernelInfo;
|
||||
|
||||
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
|
||||
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
|
||||
static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
|
||||
void *p);
|
||||
static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
|
||||
void *p)
|
||||
{
|
||||
BuildKernelInfo *info = (BuildKernelInfo *)p;
|
||||
cl_uint i = info->offset + job_id;
|
||||
@@ -175,8 +209,10 @@ static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, voi
|
||||
info->programs + i, info->relaxedMode);
|
||||
}
|
||||
|
||||
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
|
||||
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
|
||||
static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
|
||||
void *p);
|
||||
static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
|
||||
void *p)
|
||||
{
|
||||
BuildKernelInfo *info = (BuildKernelInfo *)p;
|
||||
cl_uint i = info->offset + job_id;
|
||||
@@ -212,23 +248,29 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
|
||||
// Init the kernels
|
||||
BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
|
||||
f->nameInCode, relaxedMode };
|
||||
if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
|
||||
if ((error = ThreadPool_Do(BuildKernel_FloatFn,
|
||||
gMaxVectorSizeIndex - gMinVectorSizeIndex,
|
||||
&build_info)))
|
||||
return error;
|
||||
/*
|
||||
for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
|
||||
if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) )
|
||||
return error;
|
||||
if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i,
|
||||
programs + i) ) ) return error;
|
||||
*/
|
||||
|
||||
if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
|
||||
{
|
||||
isRangeLimited = 1;
|
||||
half_sin_cos_tan_limit = 1.0f + float_ulps * (FLT_EPSILON/2.0f); // out of range results from finite inputs must be in [-1,1]
|
||||
half_sin_cos_tan_limit = 1.0f
|
||||
+ float_ulps
|
||||
* (FLT_EPSILON / 2.0f); // out of range results from finite
|
||||
// inputs must be in [-1,1]
|
||||
}
|
||||
else if (0 == strcmp(f->name, "half_tan"))
|
||||
{
|
||||
isRangeLimited = 1;
|
||||
half_sin_cos_tan_limit = INFINITY; // out of range resut from finite inputs must be numeric
|
||||
half_sin_cos_tan_limit =
|
||||
INFINITY; // out of range resut from finite inputs must be numeric
|
||||
}
|
||||
|
||||
|
||||
@@ -246,7 +288,8 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
|
||||
for (j = 0; j < bufferSize / sizeof(float); j++)
|
||||
p[j] = (uint32_t)i + j;
|
||||
}
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL)))
|
||||
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
|
||||
bufferSize, gIn, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
|
||||
return error;
|
||||
@@ -257,9 +300,12 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
uint32_t pattern = 0xffffdead;
|
||||
memset_pattern4(gOut[j], &pattern, bufferSize);
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL)))
|
||||
if ((error =
|
||||
clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
|
||||
bufferSize, gOut[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
|
||||
error, j);
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
@@ -269,10 +315,22 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
size_t vectorSize = sizeValues[j] * sizeof(cl_float);
|
||||
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
|
||||
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ))){ LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
|
||||
&gOutBuffer[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
|
||||
&gInBuffer)))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)))
|
||||
if ((error =
|
||||
clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
|
||||
&localCount, NULL, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("FAILURE -- could not execute kernel\n");
|
||||
goto exit;
|
||||
@@ -280,8 +338,7 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
|
||||
// Get that moving
|
||||
if( (error = clFlush(gQueue) ))
|
||||
vlog( "clFlush failed\n" );
|
||||
if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
|
||||
|
||||
// Calculate the correctly rounded reference result
|
||||
float *r = (float *)gOut_Ref;
|
||||
@@ -292,15 +349,16 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
|
||||
// Read the data back
|
||||
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
|
||||
{
|
||||
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)))
|
||||
if ((error =
|
||||
clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
|
||||
bufferSize, gOut[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("ReadArray failed %d\n", error);
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
if( gSkipCorrectnessTesting )
|
||||
break;
|
||||
if (gSkipCorrectnessTesting) break;
|
||||
|
||||
|
||||
// Verify data
|
||||
@@ -320,7 +378,9 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
|
||||
int fail = !(fabsf(err) <= float_ulps);
|
||||
|
||||
// half_sin/cos/tan are only valid between +-2**16, Inf, NaN
|
||||
if( isRangeLimited && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) && fabsf(s[j]) < INFINITY )
|
||||
if (isRangeLimited
|
||||
&& fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16)
|
||||
&& fabsf(s[j]) < INFINITY)
|
||||
{
|
||||
if (fabsf(test) <= half_sin_cos_tan_limit)
|
||||
{
|
||||
@@ -337,8 +397,7 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
|
||||
if (IsFloatResultSubnormal(correct, float_ulps))
|
||||
{
|
||||
fail = fail && (test != 0.0f);
|
||||
if( ! fail )
|
||||
err = 0.0f;
|
||||
if (!fail) err = 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -349,7 +408,10 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
if (fail)
|
||||
{
|
||||
vlog_error( "\n%s%s: %f ulp error at 0x%8.8x: *%a vs. %a\n", f->name, sizeNames[k], err, ((uint32_t*) gIn)[j], ((float*) gOut_Ref)[j], test );
|
||||
vlog_error(
|
||||
"\n%s%s: %f ulp error at 0x%8.8x: *%a vs. %a\n",
|
||||
f->name, sizeNames[k], err, ((uint32_t *)gIn)[j],
|
||||
((float *)gOut_Ref)[j], test);
|
||||
error = -1;
|
||||
goto exit;
|
||||
}
|
||||
@@ -361,8 +423,10 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
if (gVerboseBruteForce)
|
||||
{
|
||||
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize);
|
||||
} else
|
||||
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step,
|
||||
bufferSize);
|
||||
}
|
||||
else
|
||||
{
|
||||
vlog(".");
|
||||
}
|
||||
@@ -383,7 +447,8 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
// Init input array
|
||||
uint32_t *p = (uint32_t *)gIn;
|
||||
if( strstr( f->name, "exp" ) || strstr( f->name, "sin" ) || strstr( f->name, "cos" ) || strstr( f->name, "tan" ) )
|
||||
if (strstr(f->name, "exp") || strstr(f->name, "sin")
|
||||
|| strstr(f->name, "cos") || strstr(f->name, "tan"))
|
||||
for (j = 0; j < bufferSize / sizeof(float); j++)
|
||||
((float *)p)[j] = (float)genrand_real1(d);
|
||||
else if (strstr(f->name, "log"))
|
||||
@@ -392,7 +457,8 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
|
||||
else
|
||||
for (j = 0; j < bufferSize / sizeof(float); j++)
|
||||
p[j] = genrand_int32(d);
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
|
||||
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
|
||||
bufferSize, gIn, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
|
||||
return error;
|
||||
@@ -404,15 +470,27 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
size_t vectorSize = sizeValues[j] * sizeof(cl_float);
|
||||
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
|
||||
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
|
||||
&gOutBuffer[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
|
||||
&gInBuffer)))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
double sum = 0.0;
|
||||
double bestTime = INFINITY;
|
||||
for (k = 0; k < PERF_LOOP_COUNT; k++)
|
||||
{
|
||||
uint64_t startTime = GetTime();
|
||||
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
|
||||
if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
|
||||
&localCount, NULL, 0, NULL,
|
||||
NULL)))
|
||||
{
|
||||
vlog_error("FAILURE -- could not execute kernel\n");
|
||||
goto exit;
|
||||
@@ -428,19 +506,19 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
|
||||
uint64_t endTime = GetTime();
|
||||
double time = SubtractTime(endTime, startTime);
|
||||
sum += time;
|
||||
if( time < bestTime )
|
||||
bestTime = time;
|
||||
if (time < bestTime) bestTime = time;
|
||||
}
|
||||
|
||||
if( gReportAverageTimes )
|
||||
bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
|
||||
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
|
||||
if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double)gDeviceFrequency
|
||||
* gComputeDevices * gSimdSize * 1e6
|
||||
/ (bufferSize / sizeof(float));
|
||||
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
|
||||
f->name, sizeNames[j]);
|
||||
}
|
||||
}
|
||||
|
||||
if( ! gSkipCorrectnessTesting )
|
||||
vlog( "\t%8.2f @ %a", maxError, maxErrorVal );
|
||||
if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
|
||||
vlog("\n");
|
||||
|
||||
exit:
|
||||
@@ -487,18 +565,18 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
/*
|
||||
for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
|
||||
if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) )
|
||||
return error;
|
||||
if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels +
|
||||
i, programs + i) ) ) return error;
|
||||
*/
|
||||
|
||||
for (i = 0; i < (1ULL << 32); i += step)
|
||||
{
|
||||
// Init input array
|
||||
cl_ulong *p = (cl_ulong *)gIn;
|
||||
for( j = 0; j < bufferSize / sizeof( cl_ulong ); j++ )
|
||||
p[j] = random64(d);
|
||||
for (j = 0; j < bufferSize / sizeof(cl_ulong); j++) p[j] = random64(d);
|
||||
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL)))
|
||||
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
|
||||
bufferSize, gIn, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
|
||||
return error;
|
||||
@@ -509,9 +587,12 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
uint32_t pattern = 0xffffdead;
|
||||
memset_pattern4(gOut[j], &pattern, bufferSize);
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL)))
|
||||
if ((error =
|
||||
clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
|
||||
bufferSize, gOut[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
|
||||
error, j);
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
@@ -521,10 +602,22 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
size_t vectorSize = sizeValues[j] * sizeof(cl_double);
|
||||
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
|
||||
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ))){ LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
|
||||
&gOutBuffer[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
|
||||
&gInBuffer)))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)))
|
||||
if ((error =
|
||||
clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
|
||||
&localCount, NULL, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("FAILURE -- could not execute kernel\n");
|
||||
goto exit;
|
||||
@@ -532,8 +625,7 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
|
||||
// Get that moving
|
||||
if( (error = clFlush(gQueue) ))
|
||||
vlog( "clFlush failed\n" );
|
||||
if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
|
||||
|
||||
// Calculate the correctly rounded reference result
|
||||
double *r = (double *)gOut_Ref;
|
||||
@@ -544,15 +636,16 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
|
||||
// Read the data back
|
||||
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
|
||||
{
|
||||
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)))
|
||||
if ((error =
|
||||
clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
|
||||
bufferSize, gOut[j], 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("ReadArray failed %d\n", error);
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
if( gSkipCorrectnessTesting )
|
||||
break;
|
||||
if (gSkipCorrectnessTesting) break;
|
||||
|
||||
|
||||
// Verify data
|
||||
@@ -577,11 +670,11 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
|
||||
if (ftz)
|
||||
{
|
||||
// retry per section 6.5.3.2
|
||||
if( IsDoubleResultSubnormal(correct, f->double_ulps) )
|
||||
if (IsDoubleResultSubnormal(correct,
|
||||
f->double_ulps))
|
||||
{
|
||||
fail = fail && (test != 0.0);
|
||||
if( ! fail )
|
||||
err = 0.0f;
|
||||
if (!fail) err = 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -592,7 +685,11 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
|
||||
}
|
||||
if (fail)
|
||||
{
|
||||
vlog_error( "\n%s%sD: %f ulp error at 0x%16.16llx: *%.13la vs. %.13la\n", f->name, sizeNames[k], err, ((uint64_t*) gIn)[j], ((double*) gOut_Ref)[j], test );
|
||||
vlog_error("\n%s%sD: %f ulp error at 0x%16.16llx: "
|
||||
"*%.13la vs. %.13la\n",
|
||||
f->name, sizeNames[k], err,
|
||||
((uint64_t *)gIn)[j],
|
||||
((double *)gOut_Ref)[j], test);
|
||||
error = -1;
|
||||
goto exit;
|
||||
}
|
||||
@@ -604,8 +701,10 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
if (gVerboseBruteForce)
|
||||
{
|
||||
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize);
|
||||
} else
|
||||
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step,
|
||||
bufferSize);
|
||||
}
|
||||
else
|
||||
{
|
||||
vlog(".");
|
||||
}
|
||||
@@ -626,9 +725,9 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
|
||||
// Init input array
|
||||
double *p = (double *)gIn;
|
||||
|
||||
for( j = 0; j < bufferSize / sizeof( double ); j++ )
|
||||
p[j] = random64(d);
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
|
||||
for (j = 0; j < bufferSize / sizeof(double); j++) p[j] = random64(d);
|
||||
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
|
||||
bufferSize, gIn, 0, NULL, NULL)))
|
||||
{
|
||||
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
|
||||
return error;
|
||||
@@ -640,15 +739,27 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
|
||||
{
|
||||
size_t vectorSize = sizeValues[j] * sizeof(cl_double);
|
||||
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
|
||||
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
|
||||
if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
|
||||
&gOutBuffer[j])))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
|
||||
&gInBuffer)))
|
||||
{
|
||||
LogBuildError(programs[j]);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
double sum = 0.0;
|
||||
double bestTime = INFINITY;
|
||||
for (k = 0; k < PERF_LOOP_COUNT; k++)
|
||||
{
|
||||
uint64_t startTime = GetTime();
|
||||
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
|
||||
if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
|
||||
&localCount, NULL, 0, NULL,
|
||||
NULL)))
|
||||
{
|
||||
vlog_error("FAILURE -- could not execute kernel\n");
|
||||
goto exit;
|
||||
@@ -664,21 +775,20 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
|
||||
uint64_t endTime = GetTime();
|
||||
double time = SubtractTime(endTime, startTime);
|
||||
sum += time;
|
||||
if( time < bestTime )
|
||||
bestTime = time;
|
||||
if (time < bestTime) bestTime = time;
|
||||
}
|
||||
|
||||
if( gReportAverageTimes )
|
||||
bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
|
||||
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
|
||||
if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double)gDeviceFrequency
|
||||
* gComputeDevices * gSimdSize * 1e6
|
||||
/ (bufferSize / sizeof(double));
|
||||
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
|
||||
f->name, sizeNames[j]);
|
||||
}
|
||||
for( ; j < gMaxVectorSizeIndex; j++ )
|
||||
vlog( "\t -- " );
|
||||
for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- ");
|
||||
}
|
||||
|
||||
if( ! gSkipCorrectnessTesting )
|
||||
vlog( "\t%8.2f @ %a", maxError, maxErrorVal );
|
||||
if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
|
||||
vlog("\n");
|
||||
|
||||
exit:
|
||||
@@ -691,4 +801,3 @@ exit:
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user