Apply clang-format on math_brute_force (#1104)

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
This commit is contained in:
Marco Antognini
2021-01-14 13:27:18 +00:00
committed by GitHub
parent ffa75c37ce
commit e5f89249fa
22 changed files with 14745 additions and 10770 deletions

View File

@@ -105,11 +105,14 @@
extern const vtbl _unary; // float foo( float ) extern const vtbl _unary; // float foo( float )
extern const vtbl _unary_u; // float foo( uint ), double foo( ulong ) extern const vtbl _unary_u; // float foo( uint ), double foo( ulong )
extern const vtbl _i_unary; // int foo( float ) extern const vtbl _i_unary; // int foo( float )
extern const vtbl _macro_unary; // int foo( float ), returns {0,1} for scalar, { 0, -1 } for vector extern const vtbl _macro_unary; // int foo( float ), returns {0,1} for scalar,
// { 0, -1 } for vector
extern const vtbl _binary; // float foo( float, float ) extern const vtbl _binary; // float foo( float, float )
extern const vtbl _binary_nextafter; // float foo( float, float ), special handling for nextafter extern const vtbl _binary_nextafter; // float foo( float, float ), special
// handling for nextafter
extern const vtbl _binary_operator; // float .op. float extern const vtbl _binary_operator; // float .op. float
extern const vtbl _macro_binary; // int foo( float, float ), returns {0,1} for scalar, { 0, -1 } for vector extern const vtbl _macro_binary; // int foo( float, float ), returns {0,1} for
// scalar, { 0, -1 } for vector
extern const vtbl _binary_i; // float foo( float, int ) extern const vtbl _binary_i; // float foo( float, int )
extern const vtbl _ternary; // float foo( float, float, float ) extern const vtbl _ternary; // float foo( float, float, float )
extern const vtbl _unary_two_results; // float foo( float, float * ) extern const vtbl _unary_two_results; // float foo( float, float * )

View File

@@ -30,8 +30,7 @@
#include "harness/mt19937.h" #include "harness/mt19937.h"
typedef union fptr typedef union fptr {
{
void *p; void *p;
double (*f_f)(double); double (*f_f)(double);
double (*f_u)(cl_uint); double (*f_u)(cl_uint);
@@ -48,8 +47,7 @@ typedef union fptr
float (*f_fma)(float, float, float, int); float (*f_fma)(float, float, float, int);
} fptr; } fptr;
typedef union dptr typedef union dptr {
{
void *p; void *p;
long double (*f_f)(long double); long double (*f_f)(long double);
long double (*f_u)(cl_ulong); long double (*f_u)(cl_ulong);
@@ -77,7 +75,8 @@ typedef struct vtbl
typedef struct Func typedef struct Func
{ {
const char *name; // common name, to be used as an argument in the shell const char *name; // common name, to be used as an argument in the shell
const char *nameInCode; // name as it appears in the __kernel, usually the same as name, but different for multiplication const char *nameInCode; // name as it appears in the __kernel, usually the
// same as name, but different for multiplication
fptr func; fptr func;
dptr dfunc; dptr dfunc;
fptr rfunc; fptr rfunc;
@@ -97,5 +96,3 @@ extern const Func functionList[];
extern const size_t functionListCount; extern const size_t functionListCount;
#endif #endif

View File

@@ -27,22 +27,19 @@
io_object_t iterator; io_object_t iterator;
} sleepInfo; } sleepInfo;
void sleepCallback( void * refcon, void sleepCallback(void* refcon, io_service_t service, natural_t messageType,
io_service_t service,
natural_t messageType,
void* messageArgument); void* messageArgument);
void sleepCallback( void * refcon UNUSED, void sleepCallback(void* refcon UNUSED, io_service_t service UNUSED,
io_service_t service UNUSED, natural_t messageType, void* messageArgument)
natural_t messageType,
void * messageArgument )
{ {
IOReturn result; IOReturn result;
/* /*
service -- The IOService whose state has changed. service -- The IOService whose state has changed.
messageType -- A messageType enum, defined by IOKit/IOMessage.h or by the IOService's family. messageType -- A messageType enum, defined by IOKit/IOMessage.h or by the
messageArgument -- An argument for the message, dependent on the messageType. IOService's family. messageArgument -- An argument for the message,
dependent on the messageType.
*/ */
switch (messageType) switch (messageType)
{ {
@@ -59,7 +56,8 @@
// this notification. Important: if you dont respond, there will // this notification. Important: if you dont respond, there will
// be a 30-second timeout before the computer sleeps. // be a 30-second timeout before the computer sleeps.
// IOCancelPowerChange(root_port,(long)messageArgument); // IOCancelPowerChange(root_port,(long)messageArgument);
result = IOCancelPowerChange(sleepInfo.connection,(long)messageArgument); result = IOCancelPowerChange(sleepInfo.connection,
(long)messageArgument);
if (kIOReturnSuccess != result) if (kIOReturnSuccess != result)
vlog("sleep prevention failed. (%d)\n", result); vlog("sleep prevention failed. (%d)\n", result);
break; break;
@@ -71,17 +69,13 @@
#endif #endif
void PreventSleep(void) void PreventSleep(void)
{ {
#if defined(__APPLE__) #if defined(__APPLE__)
vlog("Disabling sleep... "); vlog("Disabling sleep... ");
sleepInfo.iterator = (io_object_t)0; sleepInfo.iterator = (io_object_t)0;
sleepInfo.port = NULL; sleepInfo.port = NULL;
sleepInfo.connection = IORegisterForSystemPower sleepInfo.connection = IORegisterForSystemPower(
(
&sleepInfo, // void * refcon, &sleepInfo, // void * refcon,
&sleepInfo.port, // IONotificationPortRef * thePortRef, &sleepInfo.port, // IONotificationPortRef * thePortRef,
sleepCallback, // IOServiceInterestCallback callback, sleepCallback, // IOServiceInterestCallback callback,
@@ -113,6 +107,3 @@ void ResumeSleep( void )
vlog("*** ResumeSleep() is not implemented on this platform.\n"); vlog("*** ResumeSleep() is not implemented on this platform.\n");
#endif #endif
} }

View File

@@ -20,5 +20,3 @@ void PreventSleep( void );
void ResumeSleep(void); void ResumeSleep(void);
#endif /* SLEEP_H */ #endif /* SLEEP_H */

View File

@@ -17,9 +17,9 @@
#include "FunctionList.h" #include "FunctionList.h"
#if defined(__PPC__) #if defined(__PPC__)
// Global varaiable used to hold the FPU control register state. The FPSCR register can not // Global varaiable used to hold the FPU control register state. The FPSCR
// be used because not all Power implementations retain or observed the NI (non-IEEE // register can not be used because not all Power implementations retain or
// mode) bit. // observed the NI (non-IEEE mode) bit.
__thread fpu_control_t fpu_control = 0; __thread fpu_control_t fpu_control = 0;
#endif #endif
@@ -47,11 +47,13 @@ void AddD(double *rhi, double *rlo, double a, double b)
{ {
double zhi, zlo; double zhi, zlo;
zhi = a + b; zhi = a + b;
if(fabs(a) > fabs(b)) { if (fabs(a) > fabs(b))
{
zlo = zhi - a; zlo = zhi - a;
zlo = b - zlo; zlo = b - zlo;
} }
else { else
{
zlo = zhi - b; zlo = zhi - b;
zlo = a - zlo; zlo = a - zlo;
} }
@@ -86,7 +88,8 @@ void AddDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl)
{ {
double r, s; double r, s;
r = xh + yh; r = xh + yh;
s = (fabs(xh) > fabs(yh)) ? (xh - r + yh + yl + xl) : (yh - r + xh + xl + yl); s = (fabs(xh) > fabs(yh)) ? (xh - r + yh + yl + xl)
: (yh - r + xh + xl + yl);
*rhi = r + s; *rhi = r + s;
*rlo = (r - (*rhi)) + s; *rlo = (r - (*rhi)) + s;
} }
@@ -100,69 +103,58 @@ void DivideDD(double *chi, double *clo, double a, double b)
*clo = rhi / b; *clo = rhi / b;
} }
// These functions comapre two floats/doubles. Since some platforms may choose to // These functions comapre two floats/doubles. Since some platforms may choose
// flush denormals to zeros before comparison, comparison like a < b may give wrong // to flush denormals to zeros before comparison, comparison like a < b may give
// result in "certain cases" where we do need correct compasion result when operands // wrong result in "certain cases" where we do need correct compasion result
// are denormals .... these functions comapre floats/doubles using signed integer/long int // when operands are denormals .... these functions comapre floats/doubles using
// rep. In other cases, when flushing to zeros is fine, these should not be used. // signed integer/long int rep. In other cases, when flushing to zeros is fine,
// Also these doesn't check for nans and assume nans are handled separately as special edge case // these should not be used. Also these doesn't check for nans and assume nans
// by the caller which calls these functions // are handled separately as special edge case by the caller which calls these
// return 0 if both are equal, 1 if x > y and -1 if x < y. // functions return 0 if both are equal, 1 if x > y and -1 if x < y.
inline inline int compareFloats(float x, float y)
int compareFloats(float x, float y)
{ {
int32f_t a, b; int32f_t a, b;
a.f = x; a.f = x;
b.f = y; b.f = y;
if( a.i & 0x80000000 ) if (a.i & 0x80000000) a.i = 0x80000000 - a.i;
a.i = 0x80000000 - a.i; if (b.i & 0x80000000) b.i = 0x80000000 - b.i;
if( b.i & 0x80000000 )
b.i = 0x80000000 - b.i;
if( a.i == b.i ) if (a.i == b.i) return 0;
return 0;
return a.i < b.i ? -1 : 1; return a.i < b.i ? -1 : 1;
} }
inline inline int compareDoubles(double x, double y)
int compareDoubles(double x, double y)
{ {
int64d_t a, b; int64d_t a, b;
a.d = x; a.d = x;
b.d = y; b.d = y;
if( a.l & 0x8000000000000000LL ) if (a.l & 0x8000000000000000LL) a.l = 0x8000000000000000LL - a.l;
a.l = 0x8000000000000000LL - a.l; if (b.l & 0x8000000000000000LL) b.l = 0x8000000000000000LL - b.l;
if( b.l & 0x8000000000000000LL )
b.l = 0x8000000000000000LL - b.l;
if( a.l == b.l ) if (a.l == b.l) return 0;
return 0;
return a.l < b.l ? -1 : 1; return a.l < b.l ? -1 : 1;
} }
void logFunctionInfo(const char *fname, unsigned int float_size, unsigned int isFastRelaxed) void logFunctionInfo(const char *fname, unsigned int float_size,
unsigned int isFastRelaxed)
{ {
char const *fpSizeStr = NULL; char const *fpSizeStr = NULL;
char const *fpFastRelaxedStr = ""; char const *fpFastRelaxedStr = "";
switch (float_size) { switch (float_size)
case sizeof(cl_double): {
fpSizeStr = "fp64"; case sizeof(cl_double): fpSizeStr = "fp64"; break;
break; case sizeof(cl_float): fpSizeStr = "fp32"; break;
case sizeof(cl_float): case sizeof(cl_half): fpSizeStr = "fp16"; break;
fpSizeStr = "fp32";
break;
case sizeof(cl_half):
fpSizeStr = "fp16";
break;
} }
if (isFastRelaxed) { if (isFastRelaxed)
{
fpFastRelaxedStr = "rlx"; fpFastRelaxedStr = "rlx";
} }
vlog("%15s %4s %4s", fname, fpSizeStr, fpFastRelaxedStr); vlog("%15s %4s %4s", fname, fpSizeStr, fpFastRelaxedStr);

View File

@@ -110,13 +110,17 @@ int MakeKernels(const char **c, cl_uint count, const char *name,
static inline double DoubleFromUInt32(uint32_t bits); static inline double DoubleFromUInt32(uint32_t bits);
static inline double DoubleFromUInt32(uint32_t bits) static inline double DoubleFromUInt32(uint32_t bits)
{ {
union{ uint64_t u; double d;} u; union {
uint64_t u;
double d;
} u;
// split 0x89abcdef to 0x89abc00000000def // split 0x89abcdef to 0x89abc00000000def
u.u = bits & 0xfffU; u.u = bits & 0xfffU;
u.u |= (uint64_t)(bits & ~0xfffU) << 32; u.u |= (uint64_t)(bits & ~0xfffU) << 32;
// sign extend the leading bit of def segment as sign bit so that the middle region consists of either all 1s or 0s // sign extend the leading bit of def segment as sign bit so that the middle
// region consists of either all 1s or 0s
u.u -= (bits & 0x800U) << 1; u.u -= (bits & 0x800U) << 1;
// return result // return result
@@ -128,8 +132,10 @@ void _LogBuildError( cl_program p, int line, const char *file );
#define PERF_LOOP_COUNT 100 #define PERF_LOOP_COUNT 100
//The spec is fairly clear that we may enforce a hard cutoff to prevent premature flushing to zero. // The spec is fairly clear that we may enforce a hard cutoff to prevent
// However, to avoid conflict for 1.0, we are letting results at TYPE_MIN + ulp_limit to be flushed to zero. // premature flushing to zero.
// However, to avoid conflict for 1.0, we are letting results at TYPE_MIN +
// ulp_limit to be flushed to zero.
static inline int IsFloatResultSubnormal(double x, float ulps) static inline int IsFloatResultSubnormal(double x, float ulps)
{ {
x = fabs(x) - MAKE_HEX_DOUBLE(0x1.0p-149, 0x1, -149) * (double)ulps; x = fabs(x) - MAKE_HEX_DOUBLE(0x1.0p-149, 0x1, -149) * (double)ulps;
@@ -150,21 +156,30 @@ static inline int IsDoubleResultSubnormal( long double x, float ulps )
static inline int IsFloatInfinity(double x) static inline int IsFloatInfinity(double x)
{ {
union { cl_float d; cl_uint u; } u; union {
cl_float d;
cl_uint u;
} u;
u.d = (cl_float)x; u.d = (cl_float)x;
return ((u.u & 0x7fffffffU) == 0x7F800000U); return ((u.u & 0x7fffffffU) == 0x7F800000U);
} }
static inline int IsFloatMaxFloat(double x) static inline int IsFloatMaxFloat(double x)
{ {
union { cl_float d; cl_uint u; } u; union {
cl_float d;
cl_uint u;
} u;
u.d = (cl_float)x; u.d = (cl_float)x;
return ((u.u & 0x7fffffffU) == 0x7F7FFFFFU); return ((u.u & 0x7fffffffU) == 0x7F7FFFFFU);
} }
static inline int IsFloatNaN(double x) static inline int IsFloatNaN(double x)
{ {
union { cl_float d; cl_uint u; } u; union {
cl_float d;
cl_uint u;
} u;
u.d = (cl_float)x; u.d = (cl_float)x;
return ((u.u & 0x7fffffffU) > 0x7F800000U); return ((u.u & 0x7fffffffU) > 0x7F800000U);
} }
@@ -190,42 +205,46 @@ static inline void Force64BitFPUPrecision(void)
new_cw = orig_cw | 0x0300; // set precision to 64-bit new_cw = orig_cw | 0x0300; // set precision to 64-bit
__asm__ __volatile__("fldcw %0" ::"m"(new_cw)); __asm__ __volatile__("fldcw %0" ::"m"(new_cw));
#elif defined(_WIN32) && defined(__INTEL_COMPILER) #elif defined(_WIN32) && defined(__INTEL_COMPILER)
// Unfortunately, usual method (`_controlfp( _PC_64, _MCW_PC );') does *not* work on win.x64: // Unfortunately, usual method (`_controlfp( _PC_64, _MCW_PC );') does *not*
// > On the x64 architecture, changing the floating point precision is not supported. // work on win.x64: > On the x64 architecture, changing the floating point
// (Taken from http://msdn.microsoft.com/en-us/library/e9b52ceh%28v=vs.100%29.aspx) // precision is not supported. (Taken from
// http://msdn.microsoft.com/en-us/library/e9b52ceh%28v=vs.100%29.aspx)
int cw; int cw;
__asm { fnstcw cw }; // Get current value of FPU control word. __asm { fnstcw cw }
cw = cw & 0xfffffcff | ( 3 << 8 ); // Set Precision Control to Double Extended Precision. ; // Get current value of FPU control word.
__asm { fldcw cw }; // Set new value of FPU control word. cw = cw & 0xfffffcff
| (3 << 8); // Set Precision Control to Double Extended Precision.
__asm { fldcw cw }
; // Set new value of FPU control word.
#else #else
/* Implement for other platforms if needed */ /* Implement for other platforms if needed */
#endif #endif
} }
extern extern void memset_pattern4(void *dest, const void *src_pattern, size_t bytes);
void memset_pattern4(void *dest, const void *src_pattern, size_t bytes );
typedef union typedef union {
{
int32_t i; int32_t i;
float f; float f;
} int32f_t; } int32f_t;
typedef union typedef union {
{
int64_t l; int64_t l;
double d; double d;
} int64d_t; } int64d_t;
void MulD(double *rhi, double *rlo, double u, double v); void MulD(double *rhi, double *rlo, double u, double v);
void AddD(double *rhi, double *rlo, double a, double b); void AddD(double *rhi, double *rlo, double a, double b);
void MulDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl); void MulDD(double *rhi, double *rlo, double xh, double xl, double yh,
void AddDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl); double yl);
void AddDD(double *rhi, double *rlo, double xh, double xl, double yh,
double yl);
void DivideDD(double *chi, double *clo, double a, double b); void DivideDD(double *chi, double *clo, double a, double b);
int compareFloats(float x, float y); int compareFloats(float x, float y);
int compareDoubles(double x, double y); int compareDoubles(double x, double y);
void logFunctionInfo(const char *fname, unsigned int float_size, unsigned int isFastRelaxed); void logFunctionInfo(const char *fname, unsigned int float_size,
unsigned int isFastRelaxed);
float getAllowedUlpError(const Func *f, const bool relaxed); float getAllowedUlpError(const Func *f, const bool relaxed);

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -33,24 +33,38 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
cl_program *p, bool relaxedMode) cl_program *p, bool relaxedMode)
{ {
const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in)\n" const char *c[] = { "__kernel void math_kernel",
sizeNames[vectorSize],
"( __global int",
sizeNames[vectorSize],
"* out, __global float",
sizeNames[vectorSize],
"* in)\n"
"{\n" "{\n"
" int i = get_global_id(0);\n" " int i = get_global_id(0);\n"
" out[i] = ", name, "( in[i] );\n" " out[i] = ",
"}\n" name,
}; "( in[i] );\n"
const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global float* in)\n" "}\n" };
const char *c3[] = {
"__kernel void math_kernel",
sizeNames[vectorSize],
"( __global int* out, __global float* in)\n"
"{\n" "{\n"
" size_t i = get_global_id(0);\n" " size_t i = get_global_id(0);\n"
" if( i + 1 < get_global_size(0) )\n" " if( i + 1 < get_global_size(0) )\n"
" {\n" " {\n"
" float3 f0 = vload3( 0, in + 3 * i );\n" " float3 f0 = vload3( 0, in + 3 * i );\n"
" int3 i0 = ", name, "( f0 );\n" " int3 i0 = ",
name,
"( f0 );\n"
" vstore3( i0, 0, out + 3*i );\n" " vstore3( i0, 0, out + 3*i );\n"
" }\n" " }\n"
" else\n" " else\n"
" {\n" " {\n"
" size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" " size_t parity = i & 1; // Figure out how many elements are "
"left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
"buffer size \n"
" float3 f0;\n" " float3 f0;\n"
" switch( parity )\n" " switch( parity )\n"
" {\n" " {\n"
@@ -61,7 +75,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
" f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
" break;\n" " break;\n"
" }\n" " }\n"
" int3 i0 = ", name, "( f0 );\n" " int3 i0 = ",
name,
"( f0 );\n"
" switch( parity )\n" " switch( parity )\n"
" {\n" " {\n"
" case 0:\n" " case 0:\n"
@@ -86,7 +102,8 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
} }
char testName[32]; char testName[32];
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
sizeNames[vectorSize]);
return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
} }
@@ -95,26 +112,40 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
cl_program *p, bool relaxedMode) cl_program *p, bool relaxedMode)
{ {
const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
"__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in)\n" "__kernel void math_kernel",
sizeNames[vectorSize],
"( __global int",
sizeNames[vectorSize],
"* out, __global double",
sizeNames[vectorSize],
"* in)\n"
"{\n" "{\n"
" int i = get_global_id(0);\n" " int i = get_global_id(0);\n"
" out[i] = ", name, "( in[i] );\n" " out[i] = ",
"}\n" name,
}; "( in[i] );\n"
"}\n" };
const char *c3[] = {"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", const char *c3[] = {
"__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global double* in)\n" "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
"__kernel void math_kernel",
sizeNames[vectorSize],
"( __global int* out, __global double* in)\n"
"{\n" "{\n"
" size_t i = get_global_id(0);\n" " size_t i = get_global_id(0);\n"
" if( i + 1 < get_global_size(0) )\n" " if( i + 1 < get_global_size(0) )\n"
" {\n" " {\n"
" double3 f0 = vload3( 0, in + 3 * i );\n" " double3 f0 = vload3( 0, in + 3 * i );\n"
" int3 i0 = ", name, "( f0 );\n" " int3 i0 = ",
name,
"( f0 );\n"
" vstore3( i0, 0, out + 3*i );\n" " vstore3( i0, 0, out + 3*i );\n"
" }\n" " }\n"
" else\n" " else\n"
" {\n" " {\n"
" size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" " size_t parity = i & 1; // Figure out how many elements are "
"left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
"buffer size \n"
" double3 f0;\n" " double3 f0;\n"
" switch( parity )\n" " switch( parity )\n"
" {\n" " {\n"
@@ -125,7 +156,9 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
" f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" " f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
" break;\n" " break;\n"
" }\n" " }\n"
" int3 i0 = ", name, "( f0 );\n" " int3 i0 = ",
name,
"( f0 );\n"
" switch( parity )\n" " switch( parity )\n"
" {\n" " {\n"
" case 0:\n" " case 0:\n"
@@ -150,7 +183,8 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
char testName[32]; char testName[32];
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
sizeNames[vectorSize]);
return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
} }
@@ -164,8 +198,10 @@ typedef struct BuildKernelInfo
bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
} BuildKernelInfo; } BuildKernelInfo;
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) void *p);
static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
void *p)
{ {
BuildKernelInfo *info = (BuildKernelInfo *)p; BuildKernelInfo *info = (BuildKernelInfo *)p;
cl_uint i = info->offset + job_id; cl_uint i = info->offset + job_id;
@@ -173,8 +209,10 @@ static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, voi
info->programs + i, info->relaxedMode); info->programs + i, info->relaxedMode);
} }
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) void *p);
static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
void *p)
{ {
BuildKernelInfo *info = (BuildKernelInfo *)p; BuildKernelInfo *info = (BuildKernelInfo *)p;
cl_uint i = info->offset + job_id; cl_uint i = info->offset + job_id;
@@ -206,12 +244,14 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
// Init the kernels // Init the kernels
BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
f->nameInCode, relaxedMode }; f->nameInCode, relaxedMode };
if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) if ((error = ThreadPool_Do(BuildKernel_FloatFn,
gMaxVectorSizeIndex - gMinVectorSizeIndex,
&build_info)))
return error; return error;
/* /*
for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) ) if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i,
return error; programs + i) ) ) return error;
*/ */
for (i = 0; i < (1ULL << 32); i += step) for (i = 0; i < (1ULL << 32); i += step)
@@ -228,7 +268,8 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
for (j = 0; j < bufferSize / sizeof(float); j++) for (j = 0; j < bufferSize / sizeof(float); j++)
p[j] = (uint32_t)i + j; p[j] = (uint32_t)i + j;
} }
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
bufferSize, gIn, 0, NULL, NULL)))
{ {
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
return error; return error;
@@ -239,9 +280,12 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
{ {
uint32_t pattern = 0xffffdead; uint32_t pattern = 0xffffdead;
memset_pattern4(gOut[j], &pattern, bufferSize); memset_pattern4(gOut[j], &pattern, bufferSize);
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) )) if ((error =
clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
bufferSize, gOut[j], 0, NULL, NULL)))
{ {
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
error, j);
goto exit; goto exit;
} }
} }
@@ -251,10 +295,22 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
{ {
size_t vectorSize = sizeValues[j] * sizeof(cl_float); size_t vectorSize = sizeValues[j] * sizeof(cl_float);
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } &gOutBuffer[j])))
{
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
&gInBuffer)))
{
LogBuildError(programs[j]);
goto exit;
}
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) if ((error =
clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
&localCount, NULL, 0, NULL, NULL)))
{ {
vlog_error("FAILED -- could not execute kernel\n"); vlog_error("FAILED -- could not execute kernel\n");
goto exit; goto exit;
@@ -262,8 +318,7 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
} }
// Get that moving // Get that moving
if( (error = clFlush(gQueue) )) if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
vlog( "clFlush failed\n" );
// Calculate the correctly rounded reference result // Calculate the correctly rounded reference result
int *r = (int *)gOut_Ref; int *r = (int *)gOut_Ref;
@@ -274,15 +329,16 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
// Read the data back // Read the data back
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
{ {
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) ) if ((error =
clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
bufferSize, gOut[j], 0, NULL, NULL)))
{ {
vlog_error("ReadArray failed %d\n", error); vlog_error("ReadArray failed %d\n", error);
goto exit; goto exit;
} }
} }
if( gSkipCorrectnessTesting ) if (gSkipCorrectnessTesting) break;
break;
// Verify data // Verify data
uint32_t *t = (uint32_t *)gOut_Ref; uint32_t *t = (uint32_t *)gOut_Ref;
@@ -298,14 +354,15 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
{ {
unsigned int correct0 = f->func.i_f(0.0); unsigned int correct0 = f->func.i_f(0.0);
unsigned int correct1 = f->func.i_f(-0.0); unsigned int correct1 = f->func.i_f(-0.0);
if( q[j] == correct0 || q[j] == correct1 ) if (q[j] == correct0 || q[j] == correct1) continue;
continue;
} }
uint32_t err = t[j] - q[j]; uint32_t err = t[j] - q[j];
if( q[j] > t[j] ) if (q[j] > t[j]) err = q[j] - t[j];
err = q[j] - t[j]; vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%8.8x): "
vlog_error( "\nERROR: %s%s: %d ulp error at %a (0x%8.8x): *%d vs. %d\n", f->name, sizeNames[k], err, ((float*) gIn)[j], ((cl_uint*) gIn)[j], t[j], q[j] ); "*%d vs. %d\n",
f->name, sizeNames[k], err, ((float *)gIn)[j],
((cl_uint *)gIn)[j], t[j], q[j]);
error = -1; error = -1;
goto exit; goto exit;
} }
@@ -316,8 +373,10 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
{ {
if (gVerboseBruteForce) if (gVerboseBruteForce)
{ {
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize); vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step,
} else bufferSize);
}
else
{ {
vlog("."); vlog(".");
} }
@@ -339,7 +398,8 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
uint32_t *p = (uint32_t *)gIn; uint32_t *p = (uint32_t *)gIn;
for (j = 0; j < bufferSize / sizeof(float); j++) for (j = 0; j < bufferSize / sizeof(float); j++)
p[j] = genrand_int32(d); p[j] = genrand_int32(d);
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
bufferSize, gIn, 0, NULL, NULL)))
{ {
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
return error; return error;
@@ -351,15 +411,27 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
{ {
size_t vectorSize = sizeValues[j] * sizeof(cl_float); size_t vectorSize = sizeValues[j] * sizeof(cl_float);
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } &gOutBuffer[j])))
{
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
&gInBuffer)))
{
LogBuildError(programs[j]);
goto exit;
}
double sum = 0.0; double sum = 0.0;
double bestTime = INFINITY; double bestTime = INFINITY;
for (k = 0; k < PERF_LOOP_COUNT; k++) for (k = 0; k < PERF_LOOP_COUNT; k++)
{ {
uint64_t startTime = GetTime(); uint64_t startTime = GetTime();
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
&localCount, NULL, 0, NULL,
NULL)))
{ {
vlog_error("FAILED -- could not execute kernel\n"); vlog_error("FAILED -- could not execute kernel\n");
goto exit; goto exit;
@@ -375,14 +447,15 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
uint64_t endTime = GetTime(); uint64_t endTime = GetTime();
double time = SubtractTime(endTime, startTime); double time = SubtractTime(endTime, startTime);
sum += time; sum += time;
if( time < bestTime ) if (time < bestTime) bestTime = time;
bestTime = time;
} }
if( gReportAverageTimes ) if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
bestTime = sum / PERF_LOOP_COUNT; double clocksPerOp = bestTime * (double)gDeviceFrequency
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) ); * gComputeDevices * gSimdSize * 1e6
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] ); / (bufferSize / sizeof(float));
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
f->name, sizeNames[j]);
} }
} }
@@ -431,8 +504,8 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
} }
/* /*
for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) ) if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels +
return error; i, programs + i) ) ) return error;
*/ */
for (i = 0; i < (1ULL << 32); i += step) for (i = 0; i < (1ULL << 32); i += step)
@@ -449,7 +522,8 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
for (j = 0; j < bufferSize / sizeof(cl_double); j++) for (j = 0; j < bufferSize / sizeof(cl_double); j++)
p[j] = DoubleFromUInt32((uint32_t)i + j); p[j] = DoubleFromUInt32((uint32_t)i + j);
} }
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
bufferSize, gIn, 0, NULL, NULL)))
{ {
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
return error; return error;
@@ -460,9 +534,12 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
{ {
uint32_t pattern = 0xffffdead; uint32_t pattern = 0xffffdead;
memset_pattern4(gOut[j], &pattern, bufferSize); memset_pattern4(gOut[j], &pattern, bufferSize);
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) )) if ((error =
clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
bufferSize, gOut[j], 0, NULL, NULL)))
{ {
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
error, j);
goto exit; goto exit;
} }
} }
@@ -472,10 +549,22 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
{ {
size_t vectorSize = sizeValues[j] * sizeof(cl_double); size_t vectorSize = sizeValues[j] * sizeof(cl_double);
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } &gOutBuffer[j])))
{
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
&gInBuffer)))
{
LogBuildError(programs[j]);
goto exit;
}
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) if ((error =
clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
&localCount, NULL, 0, NULL, NULL)))
{ {
vlog_error("FAILED -- could not execute kernel\n"); vlog_error("FAILED -- could not execute kernel\n");
goto exit; goto exit;
@@ -483,8 +572,7 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
} }
// Get that moving // Get that moving
if( (error = clFlush(gQueue) )) if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
vlog( "clFlush failed\n" );
// Calculate the correctly rounded reference result // Calculate the correctly rounded reference result
int *r = (int *)gOut_Ref; int *r = (int *)gOut_Ref;
@@ -495,15 +583,16 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
// Read the data back // Read the data back
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
{ {
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) ) if ((error =
clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
bufferSize, gOut[j], 0, NULL, NULL)))
{ {
vlog_error("ReadArray failed %d\n", error); vlog_error("ReadArray failed %d\n", error);
goto exit; goto exit;
} }
} }
if( gSkipCorrectnessTesting ) if (gSkipCorrectnessTesting) break;
break;
// Verify data // Verify data
uint32_t *t = (uint32_t *)gOut_Ref; uint32_t *t = (uint32_t *)gOut_Ref;
@@ -519,14 +608,15 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
{ {
unsigned int correct0 = f->dfunc.i_f(0.0); unsigned int correct0 = f->dfunc.i_f(0.0);
unsigned int correct1 = f->dfunc.i_f(-0.0); unsigned int correct1 = f->dfunc.i_f(-0.0);
if( q[j] == correct0 || q[j] == correct1 ) if (q[j] == correct0 || q[j] == correct1) continue;
continue;
} }
uint32_t err = t[j] - q[j]; uint32_t err = t[j] - q[j];
if( q[j] > t[j] ) if (q[j] > t[j]) err = q[j] - t[j];
err = q[j] - t[j]; vlog_error(
vlog_error( "\nERROR: %sD%s: %d ulp error at %.13la: *%d vs. %d\n", f->name, sizeNames[k], err, ((double*) gIn)[j], t[j], q[j] ); "\nERROR: %sD%s: %d ulp error at %.13la: *%d vs. %d\n",
f->name, sizeNames[k], err, ((double *)gIn)[j], t[j],
q[j]);
error = -1; error = -1;
goto exit; goto exit;
} }
@@ -537,13 +627,14 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
{ {
if (gVerboseBruteForce) if (gVerboseBruteForce)
{ {
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize); vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step,
} else bufferSize);
}
else
{ {
vlog("."); vlog(".");
} }
fflush(stdout); fflush(stdout);
} }
} }
@@ -561,7 +652,8 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
double *p = (double *)gIn; double *p = (double *)gIn;
for (j = 0; j < bufferSize / sizeof(cl_double); j++) for (j = 0; j < bufferSize / sizeof(cl_double); j++)
p[j] = DoubleFromUInt32(genrand_int32(d)); p[j] = DoubleFromUInt32(genrand_int32(d));
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
bufferSize, gIn, 0, NULL, NULL)))
{ {
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
return error; return error;
@@ -573,15 +665,27 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
{ {
size_t vectorSize = sizeValues[j] * sizeof(cl_double); size_t vectorSize = sizeValues[j] * sizeof(cl_double);
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } &gOutBuffer[j])))
{
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
&gInBuffer)))
{
LogBuildError(programs[j]);
goto exit;
}
double sum = 0.0; double sum = 0.0;
double bestTime = INFINITY; double bestTime = INFINITY;
for (k = 0; k < PERF_LOOP_COUNT; k++) for (k = 0; k < PERF_LOOP_COUNT; k++)
{ {
uint64_t startTime = GetTime(); uint64_t startTime = GetTime();
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
&localCount, NULL, 0, NULL,
NULL)))
{ {
vlog_error("FAILED -- could not execute kernel\n"); vlog_error("FAILED -- could not execute kernel\n");
goto exit; goto exit;
@@ -597,17 +701,17 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
uint64_t endTime = GetTime(); uint64_t endTime = GetTime();
double time = SubtractTime(endTime, startTime); double time = SubtractTime(endTime, startTime);
sum += time; sum += time;
if( time < bestTime ) if (time < bestTime) bestTime = time;
bestTime = time;
} }
if( gReportAverageTimes ) if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
bestTime = sum / PERF_LOOP_COUNT; double clocksPerOp = bestTime * (double)gDeviceFrequency
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) ); * gComputeDevices * gSimdSize * 1e6
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] ); / (bufferSize / sizeof(double));
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
f->name, sizeNames[j]);
} }
for( ; j < gMaxVectorSizeIndex; j++ ) for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- ");
vlog( "\t -- " );
} }
vlog("\n"); vlog("\n");
@@ -624,4 +728,3 @@ exit:
return error; return error;
} }

File diff suppressed because it is too large Load Diff

View File

@@ -33,24 +33,38 @@ static int BuildKernelDouble(const char *name, int vectorSize,
static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
cl_kernel *k, cl_program *p, bool relaxedMode) cl_kernel *k, cl_program *p, bool relaxedMode)
{ {
const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in)\n" const char *c[] = { "__kernel void math_kernel",
sizeNames[vectorSize],
"( __global int",
sizeNames[vectorSize],
"* out, __global float",
sizeNames[vectorSize],
"* in)\n"
"{\n" "{\n"
" int i = get_global_id(0);\n" " int i = get_global_id(0);\n"
" out[i] = ", name, "( in[i] );\n" " out[i] = ",
"}\n" name,
}; "( in[i] );\n"
const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global float* in)\n" "}\n" };
const char *c3[] = {
"__kernel void math_kernel",
sizeNames[vectorSize],
"( __global int* out, __global float* in)\n"
"{\n" "{\n"
" size_t i = get_global_id(0);\n" " size_t i = get_global_id(0);\n"
" if( i + 1 < get_global_size(0) )\n" " if( i + 1 < get_global_size(0) )\n"
" {\n" " {\n"
" float3 f0 = vload3( 0, in + 3 * i );\n" " float3 f0 = vload3( 0, in + 3 * i );\n"
" int3 i0 = ", name, "( f0 );\n" " int3 i0 = ",
name,
"( f0 );\n"
" vstore3( i0, 0, out + 3*i );\n" " vstore3( i0, 0, out + 3*i );\n"
" }\n" " }\n"
" else\n" " else\n"
" {\n" " {\n"
" size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" " size_t parity = i & 1; // Figure out how many elements are "
"left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
"buffer size \n"
" int3 i0;\n" " int3 i0;\n"
" float3 f0;\n" " float3 f0;\n"
" switch( parity )\n" " switch( parity )\n"
@@ -62,7 +76,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
" f0 = (float3)( in[3*i], in[3*i+1], 0xdead ); \n" " f0 = (float3)( in[3*i], in[3*i+1], 0xdead ); \n"
" break;\n" " break;\n"
" }\n" " }\n"
" i0 = ", name, "( f0 );\n" " i0 = ",
name,
"( f0 );\n"
" switch( parity )\n" " switch( parity )\n"
" {\n" " {\n"
" case 0:\n" " case 0:\n"
@@ -86,7 +102,8 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
} }
char testName[32]; char testName[32];
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
sizeNames[vectorSize]);
return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
relaxedMode); relaxedMode);
@@ -97,26 +114,40 @@ static int BuildKernelDouble(const char *name, int vectorSize,
bool relaxedMode) bool relaxedMode)
{ {
const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
"__kernel void math_kernel", sizeNames[vectorSize], "( __global long", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in)\n" "__kernel void math_kernel",
sizeNames[vectorSize],
"( __global long",
sizeNames[vectorSize],
"* out, __global double",
sizeNames[vectorSize],
"* in)\n"
"{\n" "{\n"
" int i = get_global_id(0);\n" " int i = get_global_id(0);\n"
" out[i] = ", name, "( in[i] );\n" " out[i] = ",
"}\n" name,
}; "( in[i] );\n"
"}\n" };
const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", const char *c3[] = {
"__kernel void math_kernel", sizeNames[vectorSize], "( __global long* out, __global double* in)\n" "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
"__kernel void math_kernel",
sizeNames[vectorSize],
"( __global long* out, __global double* in)\n"
"{\n" "{\n"
" size_t i = get_global_id(0);\n" " size_t i = get_global_id(0);\n"
" if( i + 1 < get_global_size(0) )\n" " if( i + 1 < get_global_size(0) )\n"
" {\n" " {\n"
" double3 d0 = vload3( 0, in + 3 * i );\n" " double3 d0 = vload3( 0, in + 3 * i );\n"
" long3 l0 = ", name, "( d0 );\n" " long3 l0 = ",
name,
"( d0 );\n"
" vstore3( l0, 0, out + 3*i );\n" " vstore3( l0, 0, out + 3*i );\n"
" }\n" " }\n"
" else\n" " else\n"
" {\n" " {\n"
" size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" " size_t parity = i & 1; // Figure out how many elements are "
"left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
"buffer size \n"
" double3 d0;\n" " double3 d0;\n"
" switch( parity )\n" " switch( parity )\n"
" {\n" " {\n"
@@ -127,7 +158,9 @@ static int BuildKernelDouble(const char *name, int vectorSize,
" d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
" break;\n" " break;\n"
" }\n" " }\n"
" long3 l0 = ", name, "( d0 );\n" " long3 l0 = ",
name,
"( d0 );\n"
" switch( parity )\n" " switch( parity )\n"
" {\n" " {\n"
" case 0:\n" " case 0:\n"
@@ -152,7 +185,8 @@ static int BuildKernelDouble(const char *name, int vectorSize,
char testName[32]; char testName[32];
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
sizeNames[vectorSize]);
return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
relaxedMode); relaxedMode);
@@ -168,8 +202,10 @@ typedef struct BuildKernelInfo
bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
} BuildKernelInfo; } BuildKernelInfo;
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) void *p);
static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
void *p)
{ {
BuildKernelInfo *info = (BuildKernelInfo *)p; BuildKernelInfo *info = (BuildKernelInfo *)p;
cl_uint i = info->offset + job_id; cl_uint i = info->offset + job_id;
@@ -177,8 +213,10 @@ static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, voi
info->kernels[i], info->programs + i, info->relaxedMode); info->kernels[i], info->programs + i, info->relaxedMode);
} }
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) void *p);
static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
void *p)
{ {
BuildKernelInfo *info = (BuildKernelInfo *)p; BuildKernelInfo *info = (BuildKernelInfo *)p;
cl_uint i = info->offset + job_id; cl_uint i = info->offset + job_id;
@@ -200,8 +238,11 @@ typedef struct TestInfo
size_t subBufferSize; // Size of the sub-buffer in elements size_t subBufferSize; // Size of the sub-buffer in elements
const Func *f; // A pointer to the function info const Func *f; // A pointer to the function info
cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
cl_kernel *k[VECTOR_SIZE_COUNT ]; // arrays of thread-specific kernels for each worker thread: k[vector_size][thread_id] cl_kernel
ThreadInfo *tinfo; // An array of thread specific information for each worker thread *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
// worker thread: k[vector_size][thread_id]
ThreadInfo *
tinfo; // An array of thread specific information for each worker thread
cl_uint threadCount; // Number of worker threads cl_uint threadCount; // Number of worker threads
cl_uint jobCount; // Number of jobs cl_uint jobCount; // Number of jobs
cl_uint step; // step between each chunk and the next. cl_uint step; // step between each chunk and the next.
@@ -223,11 +264,14 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
// Init test_info // Init test_info
memset(&test_info, 0, sizeof(test_info)); memset(&test_info, 0, sizeof(test_info));
test_info.threadCount = GetThreadCount(); test_info.threadCount = GetThreadCount();
test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); test_info.subBufferSize = BUFFER_SIZE
/ (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
test_info.scale = getTestScale(sizeof(cl_float)); test_info.scale = getTestScale(sizeof(cl_float));
if (gWimpyMode) if (gWimpyMode)
{ {
test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); test_info.subBufferSize = gWimpyBufferSize
/ (sizeof(cl_float)
* RoundUpToNextPowerOfTwo(test_info.threadCount));
} }
test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
@@ -242,8 +286,10 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
} }
test_info.f = f; test_info.f = f;
test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); test_info.ftz =
// cl_kernels aren't thread safe, so we make one for each vector size for every thread f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
// cl_kernels aren't thread safe, so we make one for each vector size for
// every thread
for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
{ {
size_t array_size = test_info.threadCount * sizeof(cl_kernel); size_t array_size = test_info.threadCount * sizeof(cl_kernel);
@@ -256,34 +302,49 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
} }
memset(test_info.k[i], 0, array_size); memset(test_info.k[i], 0, array_size);
} }
test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) ); test_info.tinfo =
(ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
if (NULL == test_info.tinfo) if (NULL == test_info.tinfo)
{ {
vlog_error( "Error: Unable to allocate storage for thread specific data.\n" ); vlog_error(
"Error: Unable to allocate storage for thread specific data.\n");
error = CL_OUT_OF_HOST_MEMORY; error = CL_OUT_OF_HOST_MEMORY;
goto exit; goto exit;
} }
memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) ); memset(test_info.tinfo, 0,
test_info.threadCount * sizeof(*test_info.tinfo));
for (i = 0; i < test_info.threadCount; i++) for (i = 0; i < test_info.threadCount; i++)
{ {
cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) }; cl_buffer_region region = {
test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error); i * test_info.subBufferSize * sizeof(cl_float),
test_info.subBufferSize * sizeof(cl_float)
};
test_info.tinfo[i].inBuf =
clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
if (error || NULL == test_info.tinfo[i].inBuf) if (error || NULL == test_info.tinfo[i].inBuf)
{ {
vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
"region {%zd, %zd}\n",
region.origin, region.size);
goto exit; goto exit;
} }
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
{ {
test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error); test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
&region, &error);
if (error || NULL == test_info.tinfo[i].outBuf[j]) if (error || NULL == test_info.tinfo[i].outBuf[j])
{ {
vlog_error( "Error: Unable to create sub-buffer of gOutBuffer for region {%zd, %zd}\n", region.origin, region.size ); vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
"for region {%zd, %zd}\n",
region.origin, region.size);
goto exit; goto exit;
} }
} }
test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error); test_info.tinfo[i].tQueue =
clCreateCommandQueue(gContext, gDevice, 0, &error);
if (NULL == test_info.tinfo[i].tQueue || error) if (NULL == test_info.tinfo[i].tQueue || error)
{ {
vlog_error("clCreateCommandQueue failed. (%d)\n", error); vlog_error("clCreateCommandQueue failed. (%d)\n", error);
@@ -297,7 +358,9 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
gMinVectorSizeIndex, test_info.threadCount, test_info.k, gMinVectorSizeIndex, test_info.threadCount, test_info.k,
test_info.programs, f->nameInCode, relaxedMode test_info.programs, f->nameInCode, relaxedMode
}; };
if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) if ((error = ThreadPool_Do(BuildKernel_FloatFn,
gMaxVectorSizeIndex - gMinVectorSizeIndex,
&build_info)))
goto exit; goto exit;
} }
@@ -305,8 +368,7 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
{ {
error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info); error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
if( error ) if (error) goto exit;
goto exit;
if (gWimpyMode) if (gWimpyMode)
vlog("Wimp pass"); vlog("Wimp pass");
@@ -320,7 +382,8 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
cl_uint *p = (cl_uint *)gIn; cl_uint *p = (cl_uint *)gIn;
for (j = 0; j < BUFFER_SIZE / sizeof(float); j++) for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
p[j] = genrand_int32(d); p[j] = genrand_int32(d);
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) )) if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
BUFFER_SIZE, gIn, 0, NULL, NULL)))
{ {
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
return error; return error;
@@ -331,16 +394,29 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
{ {
size_t vectorSize = sizeof(cl_float) * sizeValues[j]; size_t vectorSize = sizeof(cl_float) * sizeValues[j];
size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; // BUFFER_SIZE / vectorSize rounded up size_t localCount = (BUFFER_SIZE + vectorSize - 1)
if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; } / vectorSize; // BUFFER_SIZE / vectorSize rounded up
if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; } if ((error = clSetKernelArg(test_info.k[j][0], 0,
sizeof(gOutBuffer[j]), &gOutBuffer[j])))
{
LogBuildError(test_info.programs[j]);
goto exit;
}
if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
&gInBuffer)))
{
LogBuildError(test_info.programs[j]);
goto exit;
}
double sum = 0.0; double sum = 0.0;
double bestTime = INFINITY; double bestTime = INFINITY;
for (i = 0; i < PERF_LOOP_COUNT; i++) for (i = 0; i < PERF_LOOP_COUNT; i++)
{ {
uint64_t startTime = GetTime(); uint64_t startTime = GetTime();
if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
1, NULL, &localCount, NULL,
0, NULL, NULL)))
{ {
vlog_error("FAILED -- could not execute kernel\n"); vlog_error("FAILED -- could not execute kernel\n");
goto exit; goto exit;
@@ -356,14 +432,15 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
uint64_t endTime = GetTime(); uint64_t endTime = GetTime();
double time = SubtractTime(endTime, startTime); double time = SubtractTime(endTime, startTime);
sum += time; sum += time;
if( time < bestTime ) if (time < bestTime) bestTime = time;
bestTime = time;
} }
if( gReportAverageTimes ) if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
bestTime = sum / PERF_LOOP_COUNT; double clocksPerOp = bestTime * (double)gDeviceFrequency
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) ); * gComputeDevices * gSimdSize * 1e6
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] ); / (BUFFER_SIZE / sizeof(float));
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
f->name, sizeNames[j]);
} }
} }
@@ -413,8 +490,7 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
const char *name = job->f->name; const char *name = job->f->name;
int signbit_test = 0; int signbit_test = 0;
if(!strcmp(name, "signbit")) if (!strcmp(name, "signbit")) signbit_test = 1;
signbit_test = 1;
#define ref_func(s) (signbit_test ? func.i_f_f(s) : func.i_f(s)) #define ref_func(s) (signbit_test ? func.i_f_f(s) : func.i_f(s))
@@ -423,25 +499,27 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
cl_int *out[VECTOR_SIZE_COUNT]; cl_int *out[VECTOR_SIZE_COUNT];
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
{ {
out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error); out[j] = (cl_int *)clEnqueueMapBuffer(
tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
buffer_size, 0, NULL, e + j, &error);
if (error || NULL == out[j]) if (error || NULL == out[j])
{ {
vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
error);
return error; return error;
} }
} }
// Get that moving // Get that moving
if( (error = clFlush(tinfo->tQueue) )) if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
vlog( "clFlush failed\n" );
// Write the new values to the input array // Write the new values to the input array
cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements; cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
for( j = 0; j < buffer_elements; j++ ) for (j = 0; j < buffer_elements; j++) p[j] = base + j * scale;
p[j] = base + j * scale;
if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) )) if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
buffer_size, p, 0, NULL, NULL)))
{ {
vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
return error; return error;
@@ -461,24 +539,39 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
return error; return error;
} }
// Fill the result buffer with garbage, so that old results don't carry over // Fill the result buffer with garbage, so that old results don't carry
// over
uint32_t pattern = 0xffffdead; uint32_t pattern = 0xffffdead;
memset_pattern4(out[j], &pattern, buffer_size); memset_pattern4(out[j], &pattern, buffer_size);
if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) )) if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
out[j], 0, NULL, NULL)))
{ {
vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
return error; return error;
} }
// run the kernel // run the kernel
size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; size_t vectorCount =
cl_kernel kernel = job->k[j][thread_id]; //each worker thread has its own copy of the cl_kernel (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
// own copy of the cl_kernel
cl_program program = job->programs[j]; cl_program program = job->programs[j];
if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; } if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; } &tinfo->outBuf[j])))
{
LogBuildError(program);
return error;
}
if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
&tinfo->inBuf)))
{
LogBuildError(program);
return error;
}
if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL))) if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
&vectorCount, NULL, 0, NULL, NULL)))
{ {
vlog_error("FAILED -- could not execute kernel\n"); vlog_error("FAILED -- could not execute kernel\n");
return error; return error;
@@ -487,30 +580,33 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
// Get that moving // Get that moving
if( (error = clFlush(tinfo->tQueue) )) if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
vlog( "clFlush 2 failed\n" );
if( gSkipCorrectnessTesting ) if (gSkipCorrectnessTesting) return CL_SUCCESS;
return CL_SUCCESS;
// Calculate the correctly rounded reference result // Calculate the correctly rounded reference result
cl_int *r = (cl_int *)gOut_Ref + thread_id * buffer_elements; cl_int *r = (cl_int *)gOut_Ref + thread_id * buffer_elements;
float *s = (float *)p; float *s = (float *)p;
for( j = 0; j < buffer_elements; j++ ) for (j = 0; j < buffer_elements; j++) r[j] = ref_func(s[j]);
r[j] = ref_func( s[j] );
// Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue. // Read the data back -- no need to wait for the first N-1 buffers. This is
// an in order queue.
for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
{ {
out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); out[j] = (cl_int *)clEnqueueMapBuffer(
tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
buffer_size, 0, NULL, NULL, &error);
if (error || NULL == out[j]) if (error || NULL == out[j])
{ {
vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
error);
return error; return error;
} }
} }
// Wait for the last buffer // Wait for the last buffer
out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); out[j] = (cl_int *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
CL_TRUE, CL_MAP_READ, 0, buffer_size,
0, NULL, NULL, &error);
if (error || NULL == out[j]) if (error || NULL == out[j])
{ {
vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
@@ -535,15 +631,14 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
{ {
int correct = ref_func(+0.0f); int correct = ref_func(+0.0f);
int correct2 = ref_func(-0.0f); int correct2 = ref_func(-0.0f);
if( correct == q[j] || correct2 == q[j] ) if (correct == q[j] || correct2 == q[j]) continue;
continue;
} }
} }
uint32_t err = t[j] - q[j]; uint32_t err = t[j] - q[j];
if( q[j] > t[j] ) if (q[j] > t[j]) err = q[j] - t[j];
err = q[j] - t[j]; vlog_error("\nERROR: %s: %d ulp error at %a: *%d vs. %d\n",
vlog_error( "\nERROR: %s: %d ulp error at %a: *%d vs. %d\n", name, err, ((float*) s)[j], t[j], q[j] ); name, err, ((float *)s)[j], t[j], q[j]);
error = -1; error = -1;
goto exit; goto exit;
} }
@@ -561,15 +656,15 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
{ {
int correct = -ref_func(+0.0f); int correct = -ref_func(+0.0f);
int correct2 = -ref_func(-0.0f); int correct2 = -ref_func(-0.0f);
if( correct == q[j] || correct2 == q[j] ) if (correct == q[j] || correct2 == q[j]) continue;
continue;
} }
} }
uint32_t err = -t[j] - q[j]; uint32_t err = -t[j] - q[j];
if( q[j] > -t[j] ) if (q[j] > -t[j]) err = q[j] + t[j];
err = q[j] + t[j]; vlog_error(
vlog_error( "\nERROR: %s%s: %d ulp error at %a: *%d vs. %d\n", name, sizeNames[k], err, ((float*) s)[j], -t[j], q[j] ); "\nERROR: %s%s: %d ulp error at %a: *%d vs. %d\n", name,
sizeNames[k], err, ((float *)s)[j], -t[j], q[j]);
error = -1; error = -1;
goto exit; goto exit;
} }
@@ -581,9 +676,11 @@ exit:
ret = error; ret = error;
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
{ {
if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) ) if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
out[j], 0, NULL, NULL)))
{ {
vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error ); vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
j, error);
return error; return error;
} }
} }
@@ -599,8 +696,12 @@ exit:
{ {
if (gVerboseBruteForce) if (gVerboseBruteForce)
{ {
vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->threadCount); vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
} else "ThreadCount:%2u\n",
base, job->step, job->scale, buffer_elements,
job->threadCount);
}
else
{ {
vlog("."); vlog(".");
} }
@@ -622,11 +723,14 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
// Init test_info // Init test_info
memset(&test_info, 0, sizeof(test_info)); memset(&test_info, 0, sizeof(test_info));
test_info.threadCount = GetThreadCount(); test_info.threadCount = GetThreadCount();
test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); test_info.subBufferSize = BUFFER_SIZE
/ (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
test_info.scale = getTestScale(sizeof(cl_double)); test_info.scale = getTestScale(sizeof(cl_double));
if (gWimpyMode) if (gWimpyMode)
{ {
test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); test_info.subBufferSize = gWimpyBufferSize
/ (sizeof(cl_double)
* RoundUpToNextPowerOfTwo(test_info.threadCount));
} }
test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
@@ -643,7 +747,8 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
test_info.f = f; test_info.f = f;
test_info.ftz = f->ftz || gForceFTZ; test_info.ftz = f->ftz || gForceFTZ;
// cl_kernels aren't thread safe, so we make one for each vector size for every thread // cl_kernels aren't thread safe, so we make one for each vector size for
// every thread
for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
{ {
size_t array_size = test_info.threadCount * sizeof(cl_kernel); size_t array_size = test_info.threadCount * sizeof(cl_kernel);
@@ -656,36 +761,52 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
} }
memset(test_info.k[i], 0, array_size); memset(test_info.k[i], 0, array_size);
} }
test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) ); test_info.tinfo =
(ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
if (NULL == test_info.tinfo) if (NULL == test_info.tinfo)
{ {
vlog_error( "Error: Unable to allocate storage for thread specific data.\n" ); vlog_error(
"Error: Unable to allocate storage for thread specific data.\n");
error = CL_OUT_OF_HOST_MEMORY; error = CL_OUT_OF_HOST_MEMORY;
goto exit; goto exit;
} }
memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) ); memset(test_info.tinfo, 0,
test_info.threadCount * sizeof(*test_info.tinfo));
for (i = 0; i < test_info.threadCount; i++) for (i = 0; i < test_info.threadCount; i++)
{ {
cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) }; cl_buffer_region region = {
test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error); i * test_info.subBufferSize * sizeof(cl_double),
test_info.subBufferSize * sizeof(cl_double)
};
test_info.tinfo[i].inBuf =
clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
if (error || NULL == test_info.tinfo[i].inBuf) if (error || NULL == test_info.tinfo[i].inBuf)
{ {
vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
"region {%zd, %zd}\n",
region.origin, region.size);
goto exit; goto exit;
} }
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
{ {
/* Qualcomm fix: 9461 read-write flags must be compatible with parent buffer */ /* Qualcomm fix: 9461 read-write flags must be compatible with
test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error); * parent buffer */
test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
&region, &error);
/* Qualcomm fix: end */ /* Qualcomm fix: end */
if (error || NULL == test_info.tinfo[i].outBuf[j]) if (error || NULL == test_info.tinfo[i].outBuf[j])
{ {
vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); vlog_error("Error: Unable to create sub-buffer of gInBuffer "
"for region {%zd, %zd}\n",
region.origin, region.size);
goto exit; goto exit;
} }
} }
test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error); test_info.tinfo[i].tQueue =
clCreateCommandQueue(gContext, gDevice, 0, &error);
if (NULL == test_info.tinfo[i].tQueue || error) if (NULL == test_info.tinfo[i].tQueue || error)
{ {
vlog_error("clCreateCommandQueue failed. (%d)\n", error); vlog_error("clCreateCommandQueue failed. (%d)\n", error);
@@ -699,7 +820,9 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
gMinVectorSizeIndex, test_info.threadCount, test_info.k, gMinVectorSizeIndex, test_info.threadCount, test_info.k,
test_info.programs, f->nameInCode, relaxedMode test_info.programs, f->nameInCode, relaxedMode
}; };
if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
gMaxVectorSizeIndex - gMinVectorSizeIndex,
&build_info)))
goto exit; goto exit;
} }
@@ -707,8 +830,7 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
{ {
error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info); error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
if( error ) if (error) goto exit;
goto exit;
if (gWimpyMode) if (gWimpyMode)
vlog("Wimp pass"); vlog("Wimp pass");
@@ -722,7 +844,8 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
cl_ulong *p = (cl_ulong *)gIn; cl_ulong *p = (cl_ulong *)gIn;
for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++) for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
p[j] = DoubleFromUInt32(genrand_int32(d)); p[j] = DoubleFromUInt32(genrand_int32(d));
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) )) if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
BUFFER_SIZE, gIn, 0, NULL, NULL)))
{ {
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
return error; return error;
@@ -734,15 +857,27 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
{ {
size_t vectorSize = sizeValues[j] * sizeof(cl_double); size_t vectorSize = sizeValues[j] * sizeof(cl_double);
size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; } if ((error = clSetKernelArg(test_info.k[j][0], 0,
if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; } sizeof(gOutBuffer[j]), &gOutBuffer[j])))
{
LogBuildError(test_info.programs[j]);
goto exit;
}
if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
&gInBuffer)))
{
LogBuildError(test_info.programs[j]);
goto exit;
}
double sum = 0.0; double sum = 0.0;
double bestTime = INFINITY; double bestTime = INFINITY;
for (i = 0; i < PERF_LOOP_COUNT; i++) for (i = 0; i < PERF_LOOP_COUNT; i++)
{ {
uint64_t startTime = GetTime(); uint64_t startTime = GetTime();
if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
1, NULL, &localCount, NULL,
0, NULL, NULL)))
{ {
vlog_error("FAILED -- could not execute kernel\n"); vlog_error("FAILED -- could not execute kernel\n");
goto exit; goto exit;
@@ -758,17 +893,17 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
uint64_t endTime = GetTime(); uint64_t endTime = GetTime();
double time = SubtractTime(endTime, startTime); double time = SubtractTime(endTime, startTime);
sum += time; sum += time;
if( time < bestTime ) if (time < bestTime) bestTime = time;
bestTime = time;
} }
if( gReportAverageTimes ) if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
bestTime = sum / PERF_LOOP_COUNT; double clocksPerOp = bestTime * (double)gDeviceFrequency
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) ); * gComputeDevices * gSimdSize * 1e6
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] ); / (BUFFER_SIZE / sizeof(double));
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
f->name, sizeNames[j]);
} }
for( ; j < gMaxVectorSizeIndex; j++ ) for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- ");
vlog( "\t -- " );
} }
vlog("\n"); vlog("\n");
@@ -822,24 +957,27 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
cl_long *out[VECTOR_SIZE_COUNT]; cl_long *out[VECTOR_SIZE_COUNT];
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
{ {
out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error); out[j] = (cl_long *)clEnqueueMapBuffer(
tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
buffer_size, 0, NULL, e + j, &error);
if (error || NULL == out[j]) if (error || NULL == out[j])
{ {
vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
error);
return error; return error;
} }
} }
// Get that moving // Get that moving
if( (error = clFlush(tinfo->tQueue) )) if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
vlog( "clFlush failed\n" );
// Write the new values to the input array // Write the new values to the input array
cl_double *p = (cl_double *)gIn + thread_id * buffer_elements; cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
for (j = 0; j < buffer_elements; j++) for (j = 0; j < buffer_elements; j++)
p[j] = DoubleFromUInt32(base + j * scale); p[j] = DoubleFromUInt32(base + j * scale);
if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) )) if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
buffer_size, p, 0, NULL, NULL)))
{ {
vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
return error; return error;
@@ -859,24 +997,39 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
return error; return error;
} }
// Fill the result buffer with garbage, so that old results don't carry over // Fill the result buffer with garbage, so that old results don't carry
// over
uint32_t pattern = 0xffffdead; uint32_t pattern = 0xffffdead;
memset_pattern4(out[j], &pattern, buffer_size); memset_pattern4(out[j], &pattern, buffer_size);
if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) )) if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
out[j], 0, NULL, NULL)))
{ {
vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
return error; return error;
} }
// run the kernel // run the kernel
size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; size_t vectorCount =
cl_kernel kernel = job->k[j][thread_id]; //each worker thread has its own copy of the cl_kernel (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
// own copy of the cl_kernel
cl_program program = job->programs[j]; cl_program program = job->programs[j];
if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; } if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; } &tinfo->outBuf[j])))
{
LogBuildError(program);
return error;
}
if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
&tinfo->inBuf)))
{
LogBuildError(program);
return error;
}
if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL))) if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
&vectorCount, NULL, 0, NULL, NULL)))
{ {
vlog_error("FAILED -- could not execute kernel\n"); vlog_error("FAILED -- could not execute kernel\n");
return error; return error;
@@ -885,30 +1038,33 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
// Get that moving // Get that moving
if( (error = clFlush(tinfo->tQueue) )) if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
vlog( "clFlush 2 failed\n" );
if( gSkipCorrectnessTesting ) if (gSkipCorrectnessTesting) return CL_SUCCESS;
return CL_SUCCESS;
// Calculate the correctly rounded reference result // Calculate the correctly rounded reference result
cl_long *r = (cl_long *)gOut_Ref + thread_id * buffer_elements; cl_long *r = (cl_long *)gOut_Ref + thread_id * buffer_elements;
cl_double *s = (cl_double *)p; cl_double *s = (cl_double *)p;
for( j = 0; j < buffer_elements; j++ ) for (j = 0; j < buffer_elements; j++) r[j] = dfunc.i_f(s[j]);
r[j] = dfunc.i_f( s[j] );
// Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue. // Read the data back -- no need to wait for the first N-1 buffers. This is
// an in order queue.
for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
{ {
out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); out[j] = (cl_long *)clEnqueueMapBuffer(
tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
buffer_size, 0, NULL, NULL, &error);
if (error || NULL == out[j]) if (error || NULL == out[j])
{ {
vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
error);
return error; return error;
} }
} }
// Wait for the last buffer // Wait for the last buffer
out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); out[j] = (cl_long *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
CL_TRUE, CL_MAP_READ, 0, buffer_size,
0, NULL, NULL, &error);
if (error || NULL == out[j]) if (error || NULL == out[j])
{ {
vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
@@ -933,15 +1089,14 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
{ {
cl_long correct = dfunc.i_f(+0.0f); cl_long correct = dfunc.i_f(+0.0f);
cl_long correct2 = dfunc.i_f(-0.0f); cl_long correct2 = dfunc.i_f(-0.0f);
if( correct == q[j] || correct2 == q[j] ) if (correct == q[j] || correct2 == q[j]) continue;
continue;
} }
} }
cl_ulong err = t[j] - q[j]; cl_ulong err = t[j] - q[j];
if( q[j] > t[j] ) if (q[j] > t[j]) err = q[j] - t[j];
err = q[j] - t[j]; vlog_error("\nERROR: %sD: %zd ulp error at %.13la: *%zd vs. %zd\n",
vlog_error( "\nERROR: %sD: %zd ulp error at %.13la: *%zd vs. %zd\n", name, err, ((double*) gIn)[j], t[j], q[j] ); name, err, ((double *)gIn)[j], t[j], q[j]);
return -1; return -1;
} }
@@ -958,40 +1113,44 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
{ {
int64_t correct = -dfunc.i_f(+0.0f); int64_t correct = -dfunc.i_f(+0.0f);
int64_t correct2 = -dfunc.i_f(-0.0f); int64_t correct2 = -dfunc.i_f(-0.0f);
if( correct == q[j] || correct2 == q[j] ) if (correct == q[j] || correct2 == q[j]) continue;
continue;
} }
} }
cl_ulong err = -t[j] - q[j]; cl_ulong err = -t[j] - q[j];
if( q[j] > -t[j] ) if (q[j] > -t[j]) err = q[j] + t[j];
err = q[j] + t[j]; vlog_error(
vlog_error( "\nERROR: %sD%s: %zd ulp error at %.13la: *%zd vs. %zd\n", name, sizeNames[k], err, ((double*) gIn)[j], -t[j], q[j] ); "\nERROR: %sD%s: %zd ulp error at %.13la: *%zd vs. %zd\n",
name, sizeNames[k], err, ((double *)gIn)[j], -t[j], q[j]);
return -1; return -1;
} }
} }
} }
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
{ {
if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) ) if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
out[j], 0, NULL, NULL)))
{ {
vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error ); vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
j, error);
return error; return error;
} }
} }
if( (error = clFlush(tinfo->tQueue) )) if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
vlog( "clFlush 3 failed\n" );
if (0 == (base & 0x0fffffff)) if (0 == (base & 0x0fffffff))
{ {
if (gVerboseBruteForce) if (gVerboseBruteForce)
{ {
vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->threadCount); vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
} else "ThreadCount:%2u\n",
base, job->step, job->scale, buffer_elements,
job->threadCount);
}
else
{ {
vlog("."); vlog(".");
} }
@@ -1000,7 +1159,3 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
return CL_SUCCESS; return CL_SUCCESS;
} }

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -234,5 +234,3 @@ long double reference_assignmentl( long double x );
int reference_notl(long double x); int reference_notl(long double x);
#endif #endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -32,27 +32,43 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
cl_program *p, bool relaxedMode) cl_program *p, bool relaxedMode)
{ {
const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* out2, __global float", sizeNames[vectorSize], "* in)\n" const char *c[] = { "__kernel void math_kernel",
sizeNames[vectorSize],
"( __global float",
sizeNames[vectorSize],
"* out, __global float",
sizeNames[vectorSize],
"* out2, __global float",
sizeNames[vectorSize],
"* in)\n"
"{\n" "{\n"
" int i = get_global_id(0);\n" " int i = get_global_id(0);\n"
" out[i] = ", name, "( in[i], out2 + i );\n" " out[i] = ",
"}\n" name,
}; "( in[i], out2 + i );\n"
"}\n" };
const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* out2, __global float* in)\n" const char *c3[] = {
"__kernel void math_kernel",
sizeNames[vectorSize],
"( __global float* out, __global float* out2, __global float* in)\n"
"{\n" "{\n"
" size_t i = get_global_id(0);\n" " size_t i = get_global_id(0);\n"
" if( i + 1 < get_global_size(0) )\n" " if( i + 1 < get_global_size(0) )\n"
" {\n" " {\n"
" float3 f0 = vload3( 0, in + 3 * i );\n" " float3 f0 = vload3( 0, in + 3 * i );\n"
" float3 iout = NAN;\n" " float3 iout = NAN;\n"
" f0 = ", name, "( f0, &iout );\n" " f0 = ",
name,
"( f0, &iout );\n"
" vstore3( f0, 0, out + 3*i );\n" " vstore3( f0, 0, out + 3*i );\n"
" vstore3( iout, 0, out2 + 3*i );\n" " vstore3( iout, 0, out2 + 3*i );\n"
" }\n" " }\n"
" else\n" " else\n"
" {\n" " {\n"
" size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" " size_t parity = i & 1; // Figure out how many elements are "
"left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
"buffer size \n"
" float3 iout = NAN;\n" " float3 iout = NAN;\n"
" float3 f0;\n" " float3 f0;\n"
" switch( parity )\n" " switch( parity )\n"
@@ -64,7 +80,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
" f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
" break;\n" " break;\n"
" }\n" " }\n"
" f0 = ", name, "( f0, &iout );\n" " f0 = ",
name,
"( f0, &iout );\n"
" switch( parity )\n" " switch( parity )\n"
" {\n" " {\n"
" case 0:\n" " case 0:\n"
@@ -89,7 +107,8 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
} }
char testName[32]; char testName[32];
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
sizeNames[vectorSize]);
return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
} }
@@ -98,28 +117,44 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
cl_program *p, bool relaxedMode) cl_program *p, bool relaxedMode)
{ {
const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
"__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* out2, __global double", sizeNames[vectorSize], "* in)\n" "__kernel void math_kernel",
sizeNames[vectorSize],
"( __global double",
sizeNames[vectorSize],
"* out, __global double",
sizeNames[vectorSize],
"* out2, __global double",
sizeNames[vectorSize],
"* in)\n"
"{\n" "{\n"
" int i = get_global_id(0);\n" " int i = get_global_id(0);\n"
" out[i] = ", name, "( in[i], out2 + i );\n" " out[i] = ",
"}\n" name,
}; "( in[i], out2 + i );\n"
"}\n" };
const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", const char *c3[] = {
"__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global double* out2, __global double* in)\n" "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
"__kernel void math_kernel",
sizeNames[vectorSize],
"( __global double* out, __global double* out2, __global double* in)\n"
"{\n" "{\n"
" size_t i = get_global_id(0);\n" " size_t i = get_global_id(0);\n"
" if( i + 1 < get_global_size(0) )\n" " if( i + 1 < get_global_size(0) )\n"
" {\n" " {\n"
" double3 f0 = vload3( 0, in + 3 * i );\n" " double3 f0 = vload3( 0, in + 3 * i );\n"
" double3 iout = NAN;\n" " double3 iout = NAN;\n"
" f0 = ", name, "( f0, &iout );\n" " f0 = ",
name,
"( f0, &iout );\n"
" vstore3( f0, 0, out + 3*i );\n" " vstore3( f0, 0, out + 3*i );\n"
" vstore3( iout, 0, out2 + 3*i );\n" " vstore3( iout, 0, out2 + 3*i );\n"
" }\n" " }\n"
" else\n" " else\n"
" {\n" " {\n"
" size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" " size_t parity = i & 1; // Figure out how many elements are "
"left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
"buffer size \n"
" double3 iout = NAN;\n" " double3 iout = NAN;\n"
" double3 f0;\n" " double3 f0;\n"
" switch( parity )\n" " switch( parity )\n"
@@ -131,7 +166,9 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
" f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" " f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
" break;\n" " break;\n"
" }\n" " }\n"
" f0 = ", name, "( f0, &iout );\n" " f0 = ",
name,
"( f0, &iout );\n"
" switch( parity )\n" " switch( parity )\n"
" {\n" " {\n"
" case 0:\n" " case 0:\n"
@@ -156,7 +193,8 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
} }
char testName[32]; char testName[32];
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
sizeNames[vectorSize]);
return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
} }
@@ -170,8 +208,10 @@ typedef struct BuildKernelInfo
bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
} BuildKernelInfo; } BuildKernelInfo;
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) void *p);
static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
void *p)
{ {
BuildKernelInfo *info = (BuildKernelInfo *)p; BuildKernelInfo *info = (BuildKernelInfo *)p;
cl_uint i = info->offset + job_id; cl_uint i = info->offset + job_id;
@@ -179,8 +219,10 @@ static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, voi
info->programs + i, info->relaxedMode); info->programs + i, info->relaxedMode);
} }
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) void *p);
static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
void *p)
{ {
BuildKernelInfo *info = (BuildKernelInfo *)p; BuildKernelInfo *info = (BuildKernelInfo *)p;
cl_uint i = info->offset + job_id; cl_uint i = info->offset + job_id;
@@ -215,12 +257,14 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
// Init the kernels // Init the kernels
BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
f->nameInCode, relaxedMode }; f->nameInCode, relaxedMode };
if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) if ((error = ThreadPool_Do(BuildKernel_FloatFn,
gMaxVectorSizeIndex - gMinVectorSizeIndex,
&build_info)))
return error; return error;
/* /*
for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) ) if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i,
return error; programs + i) ) ) return error;
*/ */
for (i = 0; i < (1ULL << 32); i += step) for (i = 0; i < (1ULL << 32); i += step)
@@ -251,7 +295,8 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
} }
} }
} }
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
bufferSize, gIn, 0, NULL, NULL)))
{ {
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
return error; return error;
@@ -262,16 +307,22 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
{ {
uint32_t pattern = 0xffffdead; uint32_t pattern = 0xffffdead;
memset_pattern4(gOut[j], &pattern, bufferSize); memset_pattern4(gOut[j], &pattern, bufferSize);
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) )) if ((error =
clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
bufferSize, gOut[j], 0, NULL, NULL)))
{ {
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
error, j);
goto exit; goto exit;
} }
memset_pattern4(gOut2[j], &pattern, bufferSize); memset_pattern4(gOut2[j], &pattern, bufferSize);
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL))) if ((error =
clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
bufferSize, gOut2[j], 0, NULL, NULL)))
{ {
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j ); vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
error, j);
goto exit; goto exit;
} }
} }
@@ -281,11 +332,28 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
{ {
size_t vectorSize = sizeValues[j] * sizeof(cl_float); size_t vectorSize = sizeValues[j] * sizeof(cl_float);
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; } &gOutBuffer[j])))
if( ( error = clSetKernelArg(kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } {
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
&gOutBuffer2[j])))
{
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
&gInBuffer)))
{
LogBuildError(programs[j]);
goto exit;
}
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) if ((error =
clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
&localCount, NULL, 0, NULL, NULL)))
{ {
vlog_error("FAILED -- could not execute kernel\n"); vlog_error("FAILED -- could not execute kernel\n");
goto exit; goto exit;
@@ -293,8 +361,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
} }
// Get that moving // Get that moving
if( (error = clFlush(gQueue) )) if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
vlog( "clFlush failed\n" );
FPU_mode_type oldMode; FPU_mode_type oldMode;
RoundingMode oldRoundMode = kRoundToNearestEven; RoundingMode oldRoundMode = kRoundToNearestEven;
@@ -302,8 +369,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
{ {
// Calculate the correctly rounded reference result // Calculate the correctly rounded reference result
memset(&oldMode, 0, sizeof(oldMode)); memset(&oldMode, 0, sizeof(oldMode));
if( ftz ) if (ftz) ForceFTZ(&oldMode);
ForceFTZ( &oldMode );
// Set the rounding mode to match the device // Set the rounding mode to match the device
if (gIsInRTZMode) if (gIsInRTZMode)
@@ -328,7 +394,8 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
r[j] = (float)f->func.f_fpf(s[j], &dd); r[j] = (float)f->func.f_fpf(s[j], &dd);
r2[j] = (float)dd; r2[j] = (float)dd;
overflow[j] = FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)); overflow[j] =
FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
} }
} }
else else
@@ -345,18 +412,21 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
} }
} }
if( isFract && ftz ) if (isFract && ftz) RestoreFPState(&oldMode);
RestoreFPState( &oldMode );
// Read the data back // Read the data back
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
{ {
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) ) if ((error =
clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
bufferSize, gOut[j], 0, NULL, NULL)))
{ {
vlog_error("ReadArray failed %d\n", error); vlog_error("ReadArray failed %d\n", error);
goto exit; goto exit;
} }
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) ) if ((error =
clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
bufferSize, gOut2[j], 0, NULL, NULL)))
{ {
vlog_error("ReadArray2 failed %d\n", error); vlog_error("ReadArray2 failed %d\n", error);
goto exit; goto exit;
@@ -365,8 +435,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
if (gSkipCorrectnessTesting) if (gSkipCorrectnessTesting)
{ {
if (isFract && gIsInRTZMode) if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
(void)set_round(oldRoundMode, kfloat);
break; break;
} }
@@ -393,26 +462,31 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
else else
correct = f->func.f_fpf(s[j], &correct2); correct = f->func.f_fpf(s[j], &correct2);
// Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow // Per section 10 paragraph 6, accept any result if an input
// or output is a infinity or NaN or overflow
if (relaxedMode || skipNanInf) if (relaxedMode || skipNanInf)
{ {
if (skipNanInf && overflow[j]) if (skipNanInf && overflow[j]) continue;
continue;
// Note: no double rounding here. Reference functions calculate in single precision. // Note: no double rounding here. Reference functions
if( IsFloatInfinity(correct) || IsFloatNaN(correct) || // calculate in single precision.
IsFloatInfinity(correct2)|| IsFloatNaN(correct2) || if (IsFloatInfinity(correct) || IsFloatNaN(correct)
IsFloatInfinity(s[j]) || IsFloatNaN(s[j]) ) || IsFloatInfinity(correct2) || IsFloatNaN(correct2)
|| IsFloatInfinity(s[j]) || IsFloatNaN(s[j]))
continue; continue;
} }
typedef int (*CheckForSubnormal) (double,float); // If we are in fast relaxed math, we have a different calculation for the subnormal threshold. typedef int (*CheckForSubnormal)(
double, float); // If we are in fast relaxed math, we
// have a different calculation for the
// subnormal threshold.
CheckForSubnormal isFloatResultSubnormalPtr; CheckForSubnormal isFloatResultSubnormalPtr;
if (relaxedMode) if (relaxedMode)
{ {
err = Abs_Error(test, correct); err = Abs_Error(test, correct);
err2 = Abs_Error(test2, correct2); err2 = Abs_Error(test2, correct2);
isFloatResultSubnormalPtr = &IsFloatResultSubnormalAbsError; isFloatResultSubnormalPtr =
&IsFloatResultSubnormalAbsError;
} }
else else
{ {
@@ -420,14 +494,16 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
err2 = Ulp_Error(test2, correct2); err2 = Ulp_Error(test2, correct2);
isFloatResultSubnormalPtr = &IsFloatResultSubnormal; isFloatResultSubnormalPtr = &IsFloatResultSubnormal;
} }
int fail = ! (fabsf(err) <= float_ulps && fabsf(err2) <= float_ulps); int fail = !(fabsf(err) <= float_ulps
&& fabsf(err2) <= float_ulps);
if (ftz) if (ftz)
{ {
// retry per section 6.5.3.2 // retry per section 6.5.3.2
if ((*isFloatResultSubnormalPtr)(correct, float_ulps)) if ((*isFloatResultSubnormalPtr)(correct, float_ulps))
{ {
if( (*isFloatResultSubnormalPtr) (correct2, float_ulps )) if ((*isFloatResultSubnormalPtr)(correct2,
float_ulps))
{ {
fail = fail && !(test == 0.0f && test2 == 0.0f); fail = fail && !(test == 0.0f && test2 == 0.0f);
if (!fail) if (!fail)
@@ -438,16 +514,18 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
} }
else else
{ {
fail = fail && ! ( test == 0.0f && fabsf(err2) <= float_ulps); fail = fail
if( ! fail ) && !(test == 0.0f
err = 0.0f; && fabsf(err2) <= float_ulps);
if (!fail) err = 0.0f;
} }
} }
else if( (*isFloatResultSubnormalPtr)(correct2, float_ulps ) ) else if ((*isFloatResultSubnormalPtr)(correct2,
float_ulps))
{ {
fail = fail && ! ( test2 == 0.0f && fabsf(err) <= float_ulps); fail = fail
if( ! fail ) && !(test2 == 0.0f && fabsf(err) <= float_ulps);
err2 = 0.0f; if (!fail) err2 = 0.0f;
} }
@@ -458,8 +536,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
double correct2p, correct2n; double correct2p, correct2n;
float errp, err2p, errn, err2n; float errp, err2p, errn, err2n;
if( skipNanInf ) if (skipNanInf) feclearexcept(FE_OVERFLOW);
feclearexcept(FE_OVERFLOW);
if (relaxedMode) if (relaxedMode)
{ {
correctp = f->rfunc.f_fpf(0.0, &correct2p); correctp = f->rfunc.f_fpf(0.0, &correct2p);
@@ -471,17 +548,23 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
correctn = f->func.f_fpf(-0.0, &correct2n); correctn = f->func.f_fpf(-0.0, &correct2n);
} }
// Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow // Per section 10 paragraph 6, accept any result if
// an input or output is a infinity or NaN or
// overflow
if (skipNanInf) if (skipNanInf)
{ {
if( fetestexcept(FE_OVERFLOW) ) if (fetestexcept(FE_OVERFLOW)) continue;
continue;
// Note: no double rounding here. Reference functions calculate in single precision. // Note: no double rounding here. Reference
if( IsFloatInfinity(correctp) || IsFloatNaN(correctp) || // functions calculate in single precision.
IsFloatInfinity(correctn) || IsFloatNaN(correctn) || if (IsFloatInfinity(correctp)
IsFloatInfinity(correct2p) || IsFloatNaN(correct2p) || || IsFloatNaN(correctp)
IsFloatInfinity(correct2n) || IsFloatNaN(correct2n) ) || IsFloatInfinity(correctn)
|| IsFloatNaN(correctn)
|| IsFloatInfinity(correct2p)
|| IsFloatNaN(correct2p)
|| IsFloatInfinity(correct2n)
|| IsFloatNaN(correct2n))
continue; continue;
} }
@@ -500,38 +583,48 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
err2n = Ulp_Error(test, correct2n); err2n = Ulp_Error(test, correct2n);
} }
fail = fail && ((!(fabsf(errp) <= float_ulps)) && (!(fabsf(err2p) <= float_ulps)) && fail = fail
((!(fabsf(errn) <= float_ulps)) && (!(fabsf(err2n) <= float_ulps))) ); && ((!(fabsf(errp) <= float_ulps))
if( fabsf( errp ) < fabsf(err ) ) && (!(fabsf(err2p) <= float_ulps))
err = errp; && ((!(fabsf(errn) <= float_ulps))
if( fabsf( errn ) < fabsf(err ) ) && (!(fabsf(err2n) <= float_ulps))));
err = errn; if (fabsf(errp) < fabsf(err)) err = errp;
if( fabsf( err2p ) < fabsf(err2 ) ) if (fabsf(errn) < fabsf(err)) err = errn;
err2 = err2p; if (fabsf(err2p) < fabsf(err2)) err2 = err2p;
if( fabsf( err2n ) < fabsf(err2 ) ) if (fabsf(err2n) < fabsf(err2)) err2 = err2n;
err2 = err2n;
// retry per section 6.5.3.4 // retry per section 6.5.3.4
if( (*isFloatResultSubnormalPtr)( correctp, float_ulps ) || (*isFloatResultSubnormalPtr)( correctn, float_ulps ) ) if ((*isFloatResultSubnormalPtr)(correctp,
float_ulps)
|| (*isFloatResultSubnormalPtr)(correctn,
float_ulps))
{ {
if( (*isFloatResultSubnormalPtr)( correct2p, float_ulps ) || (*isFloatResultSubnormalPtr)( correct2n, float_ulps ) ) if ((*isFloatResultSubnormalPtr)(correct2p,
float_ulps)
|| (*isFloatResultSubnormalPtr)(correct2n,
float_ulps))
{ {
fail = fail && !( test == 0.0f && test2 == 0.0f); fail = fail
if( ! fail ) && !(test == 0.0f && test2 == 0.0f);
err = err2 = 0.0f; if (!fail) err = err2 = 0.0f;
} }
else else
{ {
fail = fail && ! (test == 0.0f && fabsf(err2) <= float_ulps); fail = fail
if( ! fail ) && !(test == 0.0f
err = 0.0f; && fabsf(err2) <= float_ulps);
if (!fail) err = 0.0f;
} }
} }
else if( (*isFloatResultSubnormalPtr)( correct2p, float_ulps ) || (*isFloatResultSubnormalPtr)( correct2n, float_ulps ) ) else if ((*isFloatResultSubnormalPtr)(correct2p,
float_ulps)
|| (*isFloatResultSubnormalPtr)(
correct2n, float_ulps))
{ {
fail = fail && ! (test2 == 0.0f && (fabsf(err) <= float_ulps)); fail = fail
if( ! fail ) && !(test2 == 0.0f
err2 = 0.0f; && (fabsf(err) <= float_ulps));
if (!fail) err2 = 0.0f;
} }
} }
} }
@@ -547,7 +640,11 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
} }
if (fail) if (fail)
{ {
vlog_error( "\nERROR: %s%s: {%f, %f} ulp error at %a: *{%a, %a} vs. {%a, %a}\n", f->name, sizeNames[k], err, err2, ((float*) gIn)[j], ((float*) gOut_Ref)[j], ((float*) gOut_Ref2)[j], test, test2 ); vlog_error("\nERROR: %s%s: {%f, %f} ulp error at %a: "
"*{%a, %a} vs. {%a, %a}\n",
f->name, sizeNames[k], err, err2,
((float *)gIn)[j], ((float *)gOut_Ref)[j],
((float *)gOut_Ref2)[j], test, test2);
error = -1; error = -1;
goto exit; goto exit;
} }
@@ -555,15 +652,16 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
} }
} }
if (isFract && gIsInRTZMode) if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
(void)set_round(oldRoundMode, kfloat);
if (0 == (i & 0x0fffffff)) if (0 == (i & 0x0fffffff))
{ {
if (gVerboseBruteForce) if (gVerboseBruteForce)
{ {
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize); vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step,
} else bufferSize);
}
else
{ {
vlog("."); vlog(".");
} }
@@ -585,7 +683,8 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
uint32_t *p = (uint32_t *)gIn; uint32_t *p = (uint32_t *)gIn;
for (j = 0; j < bufferSize / sizeof(float); j++) for (j = 0; j < bufferSize / sizeof(float); j++)
p[j] = genrand_int32(d); p[j] = genrand_int32(d);
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
bufferSize, gIn, 0, NULL, NULL)))
{ {
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
return error; return error;
@@ -597,9 +696,24 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
{ {
size_t vectorSize = sizeValues[j] * sizeof(cl_float); size_t vectorSize = sizeValues[j] * sizeof(cl_float);
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j]) )) { LogBuildError(programs[j]); goto exit; } &gOutBuffer[j])))
if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } {
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
&gOutBuffer2[j])))
{
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
&gInBuffer)))
{
LogBuildError(programs[j]);
goto exit;
}
double sum = 0.0; double sum = 0.0;
double bestTime = INFINITY; double bestTime = INFINITY;
@@ -607,7 +721,9 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
{ {
uint64_t startTime = GetTime(); uint64_t startTime = GetTime();
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
&localCount, NULL, 0, NULL,
NULL)))
{ {
vlog_error("FAILED -- could not execute kernel\n"); vlog_error("FAILED -- could not execute kernel\n");
goto exit; goto exit;
@@ -623,19 +739,21 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
uint64_t endTime = GetTime(); uint64_t endTime = GetTime();
double time = SubtractTime(endTime, startTime); double time = SubtractTime(endTime, startTime);
sum += time; sum += time;
if( time < bestTime ) if (time < bestTime) bestTime = time;
bestTime = time;
} }
if( gReportAverageTimes ) if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
bestTime = sum / PERF_LOOP_COUNT; double clocksPerOp = bestTime * (double)gDeviceFrequency
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) ); * gComputeDevices * gSimdSize * 1e6
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] ); / (bufferSize / sizeof(float));
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
f->name, sizeNames[j]);
} }
} }
if (!gSkipCorrectnessTesting) if (!gSkipCorrectnessTesting)
vlog( "\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0, maxErrorVal1 ); vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0,
maxErrorVal1);
vlog("\n"); vlog("\n");
exit: exit:
@@ -680,8 +798,8 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
} }
/* /*
for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) ) if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels +
return error; i, programs + i) ) ) return error;
*/ */
for (i = 0; i < (1ULL << 32); i += step) for (i = 0; i < (1ULL << 32); i += step)
@@ -698,7 +816,8 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
for (j = 0; j < bufferSize / sizeof(cl_double); j++) for (j = 0; j < bufferSize / sizeof(cl_double); j++)
p[j] = DoubleFromUInt32((uint32_t)i + j); p[j] = DoubleFromUInt32((uint32_t)i + j);
} }
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
bufferSize, gIn, 0, NULL, NULL)))
{ {
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
return error; return error;
@@ -709,16 +828,22 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
{ {
uint32_t pattern = 0xffffdead; uint32_t pattern = 0xffffdead;
memset_pattern4(gOut[j], &pattern, bufferSize); memset_pattern4(gOut[j], &pattern, bufferSize);
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) )) if ((error =
clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
bufferSize, gOut[j], 0, NULL, NULL)))
{ {
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
error, j);
goto exit; goto exit;
} }
memset_pattern4(gOut2[j], &pattern, bufferSize); memset_pattern4(gOut2[j], &pattern, bufferSize);
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL))) if ((error =
clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
bufferSize, gOut2[j], 0, NULL, NULL)))
{ {
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j ); vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
error, j);
goto exit; goto exit;
} }
} }
@@ -728,11 +853,28 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
{ {
size_t vectorSize = sizeValues[j] * sizeof(cl_double); size_t vectorSize = sizeValues[j] * sizeof(cl_double);
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; } &gOutBuffer[j])))
if( ( error = clSetKernelArg(kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } {
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
&gOutBuffer2[j])))
{
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
&gInBuffer)))
{
LogBuildError(programs[j]);
goto exit;
}
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) if ((error =
clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
&localCount, NULL, 0, NULL, NULL)))
{ {
vlog_error("FAILED -- could not execute kernel\n"); vlog_error("FAILED -- could not execute kernel\n");
goto exit; goto exit;
@@ -740,8 +882,7 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
} }
// Get that moving // Get that moving
if( (error = clFlush(gQueue) )) if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
vlog( "clFlush failed\n" );
// Calculate the correctly rounded reference result // Calculate the correctly rounded reference result
double *r = (double *)gOut_Ref; double *r = (double *)gOut_Ref;
@@ -757,20 +898,23 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
// Read the data back // Read the data back
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
{ {
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) ) if ((error =
clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
bufferSize, gOut[j], 0, NULL, NULL)))
{ {
vlog_error("ReadArray failed %d\n", error); vlog_error("ReadArray failed %d\n", error);
goto exit; goto exit;
} }
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) ) if ((error =
clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
bufferSize, gOut2[j], 0, NULL, NULL)))
{ {
vlog_error("ReadArray2 failed %d\n", error); vlog_error("ReadArray2 failed %d\n", error);
goto exit; goto exit;
} }
} }
if( gSkipCorrectnessTesting ) if (gSkipCorrectnessTesting) break;
break;
// Verify data // Verify data
uint64_t *t = (uint64_t *)gOut_Ref; uint64_t *t = (uint64_t *)gOut_Ref;
@@ -791,13 +935,15 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
long double correct = f->dfunc.f_fpf(s[j], &correct2); long double correct = f->dfunc.f_fpf(s[j], &correct2);
float err = Bruteforce_Ulp_Error_Double(test, correct); float err = Bruteforce_Ulp_Error_Double(test, correct);
float err2 = Bruteforce_Ulp_Error_Double(test2, correct2); float err2 = Bruteforce_Ulp_Error_Double(test2, correct2);
int fail = ! (fabsf(err) <= f->double_ulps && fabsf(err2) <= f->double_ulps); int fail = !(fabsf(err) <= f->double_ulps
&& fabsf(err2) <= f->double_ulps);
if (ftz) if (ftz)
{ {
// retry per section 6.5.3.2 // retry per section 6.5.3.2
if (IsDoubleResultSubnormal(correct, f->double_ulps)) if (IsDoubleResultSubnormal(correct, f->double_ulps))
{ {
if( IsDoubleResultSubnormal( correct2, f->double_ulps ) ) if (IsDoubleResultSubnormal(correct2,
f->double_ulps))
{ {
fail = fail && !(test == 0.0f && test2 == 0.0f); fail = fail && !(test == 0.0f && test2 == 0.0f);
if (!fail) if (!fail)
@@ -808,60 +954,80 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
} }
else else
{ {
fail = fail && ! ( test == 0.0f && fabsf(err2) <= f->double_ulps); fail = fail
if( ! fail ) && !(test == 0.0f
err = 0.0f; && fabsf(err2) <= f->double_ulps);
if (!fail) err = 0.0f;
} }
} }
else if( IsDoubleResultSubnormal( correct2, f->double_ulps ) ) else if (IsDoubleResultSubnormal(correct2,
f->double_ulps))
{ {
fail = fail && ! ( test2 == 0.0f && fabsf(err) <= f->double_ulps); fail = fail
if( ! fail ) && !(test2 == 0.0f
err2 = 0.0f; && fabsf(err) <= f->double_ulps);
if (!fail) err2 = 0.0f;
} }
// retry per section 6.5.3.3 // retry per section 6.5.3.3
if (IsDoubleSubnormal(s[j])) if (IsDoubleSubnormal(s[j]))
{ {
long double correct2p, correct2n; long double correct2p, correct2n;
long double correctp = f->dfunc.f_fpf( 0.0, &correct2p ); long double correctp =
long double correctn = f->dfunc.f_fpf( -0.0, &correct2n ); f->dfunc.f_fpf(0.0, &correct2p);
float errp = Bruteforce_Ulp_Error_Double( test, correctp ); long double correctn =
float err2p = Bruteforce_Ulp_Error_Double( test, correct2p ); f->dfunc.f_fpf(-0.0, &correct2n);
float errn = Bruteforce_Ulp_Error_Double( test, correctn ); float errp =
float err2n = Bruteforce_Ulp_Error_Double( test, correct2n ); Bruteforce_Ulp_Error_Double(test, correctp);
fail = fail && ((!(fabsf(errp) <= f->double_ulps)) && (!(fabsf(err2p) <= f->double_ulps)) && float err2p =
((!(fabsf(errn) <= f->double_ulps)) && (!(fabsf(err2n) <= f->double_ulps))) ); Bruteforce_Ulp_Error_Double(test, correct2p);
if( fabsf( errp ) < fabsf(err ) ) float errn =
err = errp; Bruteforce_Ulp_Error_Double(test, correctn);
if( fabsf( errn ) < fabsf(err ) ) float err2n =
err = errn; Bruteforce_Ulp_Error_Double(test, correct2n);
if( fabsf( err2p ) < fabsf(err2 ) ) fail = fail
err2 = err2p; && ((!(fabsf(errp) <= f->double_ulps))
if( fabsf( err2n ) < fabsf(err2 ) ) && (!(fabsf(err2p) <= f->double_ulps))
err2 = err2n; && ((!(fabsf(errn) <= f->double_ulps))
&& (!(fabsf(err2n)
<= f->double_ulps))));
if (fabsf(errp) < fabsf(err)) err = errp;
if (fabsf(errn) < fabsf(err)) err = errn;
if (fabsf(err2p) < fabsf(err2)) err2 = err2p;
if (fabsf(err2n) < fabsf(err2)) err2 = err2n;
// retry per section 6.5.3.4 // retry per section 6.5.3.4
if( IsDoubleResultSubnormal( correctp, f->double_ulps ) || IsDoubleResultSubnormal( correctn, f->double_ulps ) ) if (IsDoubleResultSubnormal(correctp,
f->double_ulps)
|| IsDoubleResultSubnormal(correctn,
f->double_ulps))
{ {
if( IsDoubleResultSubnormal( correct2p, f->double_ulps ) || IsDoubleResultSubnormal( correct2n, f->double_ulps ) ) if (IsDoubleResultSubnormal(correct2p,
f->double_ulps)
|| IsDoubleResultSubnormal(correct2n,
f->double_ulps))
{ {
fail = fail && !( test == 0.0f && test2 == 0.0f); fail = fail
if( ! fail ) && !(test == 0.0f && test2 == 0.0f);
err = err2 = 0.0f; if (!fail) err = err2 = 0.0f;
} }
else else
{ {
fail = fail && ! (test == 0.0f && fabsf(err2) <= f->double_ulps); fail = fail
if( ! fail ) && !(test == 0.0f
err = 0.0f; && fabsf(err2) <= f->double_ulps);
if (!fail) err = 0.0f;
} }
} }
else if( IsDoubleResultSubnormal( correct2p, f->double_ulps ) || IsDoubleResultSubnormal( correct2n, f->double_ulps ) ) else if (IsDoubleResultSubnormal(correct2p,
f->double_ulps)
|| IsDoubleResultSubnormal(correct2n,
f->double_ulps))
{ {
fail = fail && ! (test2 == 0.0f && (fabsf(err) <= f->double_ulps)); fail = fail
if( ! fail ) && !(test2 == 0.0f
err2 = 0.0f; && (fabsf(err) <= f->double_ulps));
if (!fail) err2 = 0.0f;
} }
} }
} }
@@ -877,7 +1043,12 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
} }
if (fail) if (fail)
{ {
vlog_error( "\nERROR: %sD%s: {%f, %f} ulp error at %.13la: *{%.13la, %.13la} vs. {%.13la, %.13la}\n", f->name, sizeNames[k], err, err2, ((double*) gIn)[j], ((double*) gOut_Ref)[j], ((double*) gOut_Ref2)[j], test, test2 ); vlog_error(
"\nERROR: %sD%s: {%f, %f} ulp error at %.13la: "
"*{%.13la, %.13la} vs. {%.13la, %.13la}\n",
f->name, sizeNames[k], err, err2,
((double *)gIn)[j], ((double *)gOut_Ref)[j],
((double *)gOut_Ref2)[j], test, test2);
error = -1; error = -1;
goto exit; goto exit;
} }
@@ -889,8 +1060,10 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
{ {
if (gVerboseBruteForce) if (gVerboseBruteForce)
{ {
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize); vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step,
} else bufferSize);
}
else
{ {
vlog("."); vlog(".");
} }
@@ -912,7 +1085,8 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
double *p = (double *)gIn; double *p = (double *)gIn;
for (j = 0; j < bufferSize / sizeof(double); j++) for (j = 0; j < bufferSize / sizeof(double); j++)
p[j] = DoubleFromUInt32(genrand_int32(d)); p[j] = DoubleFromUInt32(genrand_int32(d));
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
bufferSize, gIn, 0, NULL, NULL)))
{ {
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
return error; return error;
@@ -924,9 +1098,24 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
{ {
size_t vectorSize = sizeValues[j] * sizeof(cl_double); size_t vectorSize = sizeValues[j] * sizeof(cl_double);
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j]) )) { LogBuildError(programs[j]); goto exit; } &gOutBuffer[j])))
if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } {
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
&gOutBuffer2[j])))
{
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
&gInBuffer)))
{
LogBuildError(programs[j]);
goto exit;
}
double sum = 0.0; double sum = 0.0;
double bestTime = INFINITY; double bestTime = INFINITY;
@@ -934,7 +1123,9 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
{ {
uint64_t startTime = GetTime(); uint64_t startTime = GetTime();
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
&localCount, NULL, 0, NULL,
NULL)))
{ {
vlog_error("FAILED -- could not execute kernel\n"); vlog_error("FAILED -- could not execute kernel\n");
goto exit; goto exit;
@@ -950,21 +1141,22 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
uint64_t endTime = GetTime(); uint64_t endTime = GetTime();
double time = SubtractTime(endTime, startTime); double time = SubtractTime(endTime, startTime);
sum += time; sum += time;
if( time < bestTime ) if (time < bestTime) bestTime = time;
bestTime = time;
} }
if( gReportAverageTimes ) if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
bestTime = sum / PERF_LOOP_COUNT; double clocksPerOp = bestTime * (double)gDeviceFrequency
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) ); * gComputeDevices * gSimdSize * 1e6
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] ); / (bufferSize / sizeof(double));
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
f->name, sizeNames[j]);
} }
for( ; j < gMaxVectorSizeIndex; j++ ) for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- ");
vlog( "\t -- " );
} }
if (!gSkipCorrectnessTesting) if (!gSkipCorrectnessTesting)
vlog( "\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0, maxErrorVal1 ); vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0,
maxErrorVal1);
vlog("\n"); vlog("\n");
exit: exit:
@@ -977,6 +1169,3 @@ exit:
return error; return error;
} }

View File

@@ -34,26 +34,42 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
cl_program *p, bool relaxedMode) cl_program *p, bool relaxedMode)
{ {
const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global int", sizeNames[vectorSize], "* out2, __global float", sizeNames[vectorSize], "* in)\n" const char *c[] = { "__kernel void math_kernel",
sizeNames[vectorSize],
"( __global float",
sizeNames[vectorSize],
"* out, __global int",
sizeNames[vectorSize],
"* out2, __global float",
sizeNames[vectorSize],
"* in)\n"
"{\n" "{\n"
" int i = get_global_id(0);\n" " int i = get_global_id(0);\n"
" out[i] = ", name, "( in[i], out2 + i );\n" " out[i] = ",
"}\n" name,
}; "( in[i], out2 + i );\n"
const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global int* out2, __global float* in)\n" "}\n" };
const char *c3[] = {
"__kernel void math_kernel",
sizeNames[vectorSize],
"( __global float* out, __global int* out2, __global float* in)\n"
"{\n" "{\n"
" size_t i = get_global_id(0);\n" " size_t i = get_global_id(0);\n"
" if( i + 1 < get_global_size(0) )\n" " if( i + 1 < get_global_size(0) )\n"
" {\n" " {\n"
" float3 f0 = vload3( 0, in + 3 * i );\n" " float3 f0 = vload3( 0, in + 3 * i );\n"
" int3 iout = INT_MIN;\n" " int3 iout = INT_MIN;\n"
" f0 = ", name, "( f0, &iout );\n" " f0 = ",
name,
"( f0, &iout );\n"
" vstore3( f0, 0, out + 3*i );\n" " vstore3( f0, 0, out + 3*i );\n"
" vstore3( iout, 0, out2 + 3*i );\n" " vstore3( iout, 0, out2 + 3*i );\n"
" }\n" " }\n"
" else\n" " else\n"
" {\n" " {\n"
" size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" " size_t parity = i & 1; // Figure out how many elements are "
"left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
"buffer size \n"
" int3 iout = INT_MIN;\n" " int3 iout = INT_MIN;\n"
" float3 f0;\n" " float3 f0;\n"
" switch( parity )\n" " switch( parity )\n"
@@ -65,7 +81,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
" f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
" break;\n" " break;\n"
" }\n" " }\n"
" f0 = ", name, "( f0, &iout );\n" " f0 = ",
name,
"( f0, &iout );\n"
" switch( parity )\n" " switch( parity )\n"
" {\n" " {\n"
" case 0:\n" " case 0:\n"
@@ -90,7 +108,8 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
} }
char testName[32]; char testName[32];
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
sizeNames[vectorSize]);
return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
} }
@@ -99,27 +118,43 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
cl_program *p, bool relaxedMode) cl_program *p, bool relaxedMode)
{ {
const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
"__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global int", sizeNames[vectorSize], "* out2, __global double", sizeNames[vectorSize], "* in)\n" "__kernel void math_kernel",
sizeNames[vectorSize],
"( __global double",
sizeNames[vectorSize],
"* out, __global int",
sizeNames[vectorSize],
"* out2, __global double",
sizeNames[vectorSize],
"* in)\n"
"{\n" "{\n"
" int i = get_global_id(0);\n" " int i = get_global_id(0);\n"
" out[i] = ", name, "( in[i], out2 + i );\n" " out[i] = ",
"}\n" name,
}; "( in[i], out2 + i );\n"
const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", "}\n" };
"__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global int* out2, __global double* in)\n" const char *c3[] = {
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
"__kernel void math_kernel",
sizeNames[vectorSize],
"( __global double* out, __global int* out2, __global double* in)\n"
"{\n" "{\n"
" size_t i = get_global_id(0);\n" " size_t i = get_global_id(0);\n"
" if( i + 1 < get_global_size(0) )\n" " if( i + 1 < get_global_size(0) )\n"
" {\n" " {\n"
" double3 f0 = vload3( 0, in + 3 * i );\n" " double3 f0 = vload3( 0, in + 3 * i );\n"
" int3 iout = INT_MIN;\n" " int3 iout = INT_MIN;\n"
" f0 = ", name, "( f0, &iout );\n" " f0 = ",
name,
"( f0, &iout );\n"
" vstore3( f0, 0, out + 3*i );\n" " vstore3( f0, 0, out + 3*i );\n"
" vstore3( iout, 0, out2 + 3*i );\n" " vstore3( iout, 0, out2 + 3*i );\n"
" }\n" " }\n"
" else\n" " else\n"
" {\n" " {\n"
" size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" " size_t parity = i & 1; // Figure out how many elements are "
"left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
"buffer size \n"
" int3 iout = INT_MIN;\n" " int3 iout = INT_MIN;\n"
" double3 f0;\n" " double3 f0;\n"
" switch( parity )\n" " switch( parity )\n"
@@ -131,7 +166,9 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
" f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" " f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
" break;\n" " break;\n"
" }\n" " }\n"
" f0 = ", name, "( f0, &iout );\n" " f0 = ",
name,
"( f0, &iout );\n"
" switch( parity )\n" " switch( parity )\n"
" {\n" " {\n"
" case 0:\n" " case 0:\n"
@@ -156,7 +193,8 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
} }
char testName[32]; char testName[32];
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
sizeNames[vectorSize]);
return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
} }
@@ -170,8 +208,10 @@ typedef struct BuildKernelInfo
bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
} BuildKernelInfo; } BuildKernelInfo;
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) void *p);
static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
void *p)
{ {
BuildKernelInfo *info = (BuildKernelInfo *)p; BuildKernelInfo *info = (BuildKernelInfo *)p;
cl_uint i = info->offset + job_id; cl_uint i = info->offset + job_id;
@@ -179,8 +219,10 @@ static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, voi
info->programs + i, info->relaxedMode); info->programs + i, info->relaxedMode);
} }
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) void *p);
static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
void *p)
{ {
BuildKernelInfo *info = (BuildKernelInfo *)p; BuildKernelInfo *info = (BuildKernelInfo *)p;
cl_uint i = info->offset + job_id; cl_uint i = info->offset + job_id;
@@ -225,12 +267,14 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
// Init the kernels // Init the kernels
BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
f->nameInCode, relaxedMode }; f->nameInCode, relaxedMode };
if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) if ((error = ThreadPool_Do(BuildKernel_FloatFn,
gMaxVectorSizeIndex - gMinVectorSizeIndex,
&build_info)))
return error; return error;
/* /*
for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) ) if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i,
return error; programs + i) ) ) return error;
*/ */
for (i = 0; i < (1ULL << 32); i += step) for (i = 0; i < (1ULL << 32); i += step)
@@ -247,7 +291,8 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
for (j = 0; j < bufferSize / sizeof(float); j++) for (j = 0; j < bufferSize / sizeof(float); j++)
p[j] = (uint32_t)i + j; p[j] = (uint32_t)i + j;
} }
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
bufferSize, gIn, 0, NULL, NULL)))
{ {
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
return error; return error;
@@ -258,16 +303,22 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
{ {
uint32_t pattern = 0xffffdead; uint32_t pattern = 0xffffdead;
memset_pattern4(gOut[j], &pattern, bufferSize); memset_pattern4(gOut[j], &pattern, bufferSize);
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) )) if ((error =
clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
bufferSize, gOut[j], 0, NULL, NULL)))
{ {
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
error, j);
goto exit; goto exit;
} }
memset_pattern4(gOut2[j], &pattern, bufferSize); memset_pattern4(gOut2[j], &pattern, bufferSize);
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL) )) if ((error =
clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
bufferSize, gOut2[j], 0, NULL, NULL)))
{ {
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j ); vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
error, j);
goto exit; goto exit;
} }
} }
@@ -277,11 +328,28 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
{ {
size_t vectorSize = sizeValues[j] * sizeof(cl_float); size_t vectorSize = sizeValues[j] * sizeof(cl_float);
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; } &gOutBuffer[j])))
if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } {
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
&gOutBuffer2[j])))
{
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
&gInBuffer)))
{
LogBuildError(programs[j]);
goto exit;
}
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) if ((error =
clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
&localCount, NULL, 0, NULL, NULL)))
{ {
vlog_error("FAILED -- could not execute kernel\n"); vlog_error("FAILED -- could not execute kernel\n");
goto exit; goto exit;
@@ -289,8 +357,7 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
} }
// Get that moving // Get that moving
if( (error = clFlush(gQueue) )) if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
vlog( "clFlush failed\n" );
// Calculate the correctly rounded reference result // Calculate the correctly rounded reference result
float *r = (float *)gOut_Ref; float *r = (float *)gOut_Ref;
@@ -302,20 +369,23 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
// Read the data back // Read the data back
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
{ {
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) ) if ((error =
clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
bufferSize, gOut[j], 0, NULL, NULL)))
{ {
vlog_error("ReadArray failed %d\n", error); vlog_error("ReadArray failed %d\n", error);
goto exit; goto exit;
} }
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) ) if ((error =
clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
bufferSize, gOut2[j], 0, NULL, NULL)))
{ {
vlog_error("ReadArray2 failed %d\n", error); vlog_error("ReadArray2 failed %d\n", error);
goto exit; goto exit;
} }
} }
if( gSkipCorrectnessTesting ) if (gSkipCorrectnessTesting) break;
break;
// Verify data // Verify data
uint32_t *t = (uint32_t *)gOut_Ref; uint32_t *t = (uint32_t *)gOut_Ref;
@@ -335,15 +405,15 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
double correct = f->func.f_fpI(s[j], &correct2); double correct = f->func.f_fpI(s[j], &correct2);
float err = Ulp_Error(test, correct); float err = Ulp_Error(test, correct);
cl_long iErr = (int64_t)q2[j] - (int64_t)correct2; cl_long iErr = (int64_t)q2[j] - (int64_t)correct2;
int fail = ! (fabsf(err) <= float_ulps && abs_cl_long( iErr ) <= maxiError ); int fail = !(fabsf(err) <= float_ulps
&& abs_cl_long(iErr) <= maxiError);
if (ftz) if (ftz)
{ {
// retry per section 6.5.3.2 // retry per section 6.5.3.2
if (IsFloatResultSubnormal(correct, float_ulps)) if (IsFloatResultSubnormal(correct, float_ulps))
{ {
fail = fail && !(test == 0.0f && iErr == 0); fail = fail && !(test == 0.0f && iErr == 0);
if( ! fail ) if (!fail) err = 0.0f;
err = 0.0f;
} }
// retry per section 6.5.3.3 // retry per section 6.5.3.3
@@ -354,18 +424,22 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
double correct4 = f->func.f_fpI(-0.0, &correct6); double correct4 = f->func.f_fpI(-0.0, &correct6);
float err2 = Ulp_Error(test, correct3); float err2 = Ulp_Error(test, correct3);
float err3 = Ulp_Error(test, correct4); float err3 = Ulp_Error(test, correct4);
cl_long iErr2 = (long long) q2[j] - (long long) correct5; cl_long iErr2 =
cl_long iErr3 = (long long) q2[j] - (long long) correct6; (long long)q2[j] - (long long)correct5;
cl_long iErr3 =
(long long)q2[j] - (long long)correct6;
// Did +0 work? // Did +0 work?
if( fabsf(err2) <= float_ulps && abs_cl_long( iErr2 ) <= maxiError ) if (fabsf(err2) <= float_ulps
&& abs_cl_long(iErr2) <= maxiError)
{ {
err = err2; err = err2;
iErr = iErr2; iErr = iErr2;
fail = 0; fail = 0;
} }
// Did -0 work? // Did -0 work?
else if(fabsf(err3) <= float_ulps && abs_cl_long( iErr3 ) <= maxiError) else if (fabsf(err3) <= float_ulps
&& abs_cl_long(iErr3) <= maxiError)
{ {
err = err3; err = err3;
iErr = iErr3; iErr = iErr3;
@@ -373,9 +447,16 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
} }
// retry per section 6.5.3.4 // retry per section 6.5.3.4
if( fail && (IsFloatResultSubnormal(correct2, float_ulps ) || IsFloatResultSubnormal(correct3, float_ulps )) ) if (fail
&& (IsFloatResultSubnormal(correct2, float_ulps)
|| IsFloatResultSubnormal(correct3,
float_ulps)))
{ {
fail = fail && ! ( test == 0.0f && (abs_cl_long( iErr2 ) <= maxiError || abs_cl_long( iErr3 ) <= maxiError) ); fail = fail
&& !(test == 0.0f
&& (abs_cl_long(iErr2) <= maxiError
|| abs_cl_long(iErr3)
<= maxiError));
if (!fail) if (!fail)
{ {
err = 0.0f; err = 0.0f;
@@ -397,7 +478,11 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
if (fail) if (fail)
{ {
vlog_error( "\nERROR: %s%s: {%f, %d} ulp error at %a: *{%a, %d} vs. {%a, %d}\n", f->name, sizeNames[k], err, (int) iErr, ((float*) gIn)[j], ((float*) gOut_Ref)[j], ((int*) gOut_Ref2)[j], test, q2[j] ); vlog_error("\nERROR: %s%s: {%f, %d} ulp error at %a: "
"*{%a, %d} vs. {%a, %d}\n",
f->name, sizeNames[k], err, (int)iErr,
((float *)gIn)[j], ((float *)gOut_Ref)[j],
((int *)gOut_Ref2)[j], test, q2[j]);
error = -1; error = -1;
goto exit; goto exit;
} }
@@ -409,8 +494,10 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
{ {
if (gVerboseBruteForce) if (gVerboseBruteForce)
{ {
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize); vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step,
} else bufferSize);
}
else
{ {
vlog("."); vlog(".");
} }
@@ -432,7 +519,8 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
uint32_t *p = (uint32_t *)gIn; uint32_t *p = (uint32_t *)gIn;
for (j = 0; j < bufferSize / sizeof(float); j++) for (j = 0; j < bufferSize / sizeof(float); j++)
p[j] = genrand_int32(d); p[j] = genrand_int32(d);
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
bufferSize, gIn, 0, NULL, NULL)))
{ {
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
return error; return error;
@@ -444,16 +532,33 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
{ {
size_t vectorSize = sizeValues[j] * sizeof(cl_float); size_t vectorSize = sizeValues[j] * sizeof(cl_float);
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; } &gOutBuffer[j])))
if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } {
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
&gOutBuffer2[j])))
{
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
&gInBuffer)))
{
LogBuildError(programs[j]);
goto exit;
}
double sum = 0.0; double sum = 0.0;
double bestTime = INFINITY; double bestTime = INFINITY;
for (k = 0; k < PERF_LOOP_COUNT; k++) for (k = 0; k < PERF_LOOP_COUNT; k++)
{ {
uint64_t startTime = GetTime(); uint64_t startTime = GetTime();
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
&localCount, NULL, 0, NULL,
NULL)))
{ {
vlog_error("FAILED -- could not execute kernel\n"); vlog_error("FAILED -- could not execute kernel\n");
goto exit; goto exit;
@@ -469,14 +574,15 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
uint64_t endTime = GetTime(); uint64_t endTime = GetTime();
double time = SubtractTime(endTime, startTime); double time = SubtractTime(endTime, startTime);
sum += time; sum += time;
if( time < bestTime ) if (time < bestTime) bestTime = time;
bestTime = time;
} }
if( gReportAverageTimes ) if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
bestTime = sum / PERF_LOOP_COUNT; double clocksPerOp = bestTime * (double)gDeviceFrequency
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) ); * gComputeDevices * gSimdSize * 1e6
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] ); / (bufferSize / sizeof(float));
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
f->name, sizeNames[j]);
} }
} }
@@ -528,8 +634,8 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
} }
/* /*
for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) ) if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels +
return error; i, programs + i) ) ) return error;
*/ */
for (i = 0; i < (1ULL << 32); i += step) for (i = 0; i < (1ULL << 32); i += step)
@@ -546,7 +652,8 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
for (j = 0; j < bufferSize / sizeof(double); j++) for (j = 0; j < bufferSize / sizeof(double); j++)
p[j] = DoubleFromUInt32((uint32_t)i + j); p[j] = DoubleFromUInt32((uint32_t)i + j);
} }
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
bufferSize, gIn, 0, NULL, NULL)))
{ {
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
return error; return error;
@@ -557,16 +664,22 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
{ {
uint32_t pattern = 0xffffdead; uint32_t pattern = 0xffffdead;
memset_pattern4(gOut[j], &pattern, bufferSize); memset_pattern4(gOut[j], &pattern, bufferSize);
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) )) if ((error =
clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
bufferSize, gOut[j], 0, NULL, NULL)))
{ {
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
error, j);
goto exit; goto exit;
} }
memset_pattern4(gOut2[j], &pattern, bufferSize); memset_pattern4(gOut2[j], &pattern, bufferSize);
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL) )) if ((error =
clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
bufferSize, gOut2[j], 0, NULL, NULL)))
{ {
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j ); vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
error, j);
goto exit; goto exit;
} }
} }
@@ -576,11 +689,28 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
{ {
size_t vectorSize = sizeValues[j] * sizeof(cl_double); size_t vectorSize = sizeValues[j] * sizeof(cl_double);
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; } &gOutBuffer[j])))
if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } {
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
&gOutBuffer2[j])))
{
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
&gInBuffer)))
{
LogBuildError(programs[j]);
goto exit;
}
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) if ((error =
clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
&localCount, NULL, 0, NULL, NULL)))
{ {
vlog_error("FAILED -- could not execute kernel\n"); vlog_error("FAILED -- could not execute kernel\n");
goto exit; goto exit;
@@ -588,8 +718,7 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
} }
// Get that moving // Get that moving
if( (error = clFlush(gQueue) )) if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
vlog( "clFlush failed\n" );
// Calculate the correctly rounded reference result // Calculate the correctly rounded reference result
double *r = (double *)gOut_Ref; double *r = (double *)gOut_Ref;
@@ -601,20 +730,23 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
// Read the data back // Read the data back
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
{ {
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) ) if ((error =
clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
bufferSize, gOut[j], 0, NULL, NULL)))
{ {
vlog_error("ReadArray failed %d\n", error); vlog_error("ReadArray failed %d\n", error);
goto exit; goto exit;
} }
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) ) if ((error =
clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
bufferSize, gOut2[j], 0, NULL, NULL)))
{ {
vlog_error("ReadArray2 failed %d\n", error); vlog_error("ReadArray2 failed %d\n", error);
goto exit; goto exit;
} }
} }
if( gSkipCorrectnessTesting ) if (gSkipCorrectnessTesting) break;
break;
// Verify data // Verify data
uint64_t *t = (uint64_t *)gOut_Ref; uint64_t *t = (uint64_t *)gOut_Ref;
@@ -634,37 +766,45 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
long double correct = f->dfunc.f_fpI(s[j], &correct2); long double correct = f->dfunc.f_fpI(s[j], &correct2);
float err = Bruteforce_Ulp_Error_Double(test, correct); float err = Bruteforce_Ulp_Error_Double(test, correct);
cl_long iErr = (long long)q2[j] - (long long)correct2; cl_long iErr = (long long)q2[j] - (long long)correct2;
int fail = ! (fabsf(err) <= f->double_ulps && abs_cl_long( iErr ) <= maxiError ); int fail = !(fabsf(err) <= f->double_ulps
&& abs_cl_long(iErr) <= maxiError);
if (ftz) if (ftz)
{ {
// retry per section 6.5.3.2 // retry per section 6.5.3.2
if (IsDoubleResultSubnormal(correct, f->double_ulps)) if (IsDoubleResultSubnormal(correct, f->double_ulps))
{ {
fail = fail && !(test == 0.0f && iErr == 0); fail = fail && !(test == 0.0f && iErr == 0);
if( ! fail ) if (!fail) err = 0.0f;
err = 0.0f;
} }
// retry per section 6.5.3.3 // retry per section 6.5.3.3
if (IsDoubleSubnormal(s[j])) if (IsDoubleSubnormal(s[j]))
{ {
int correct5, correct6; int correct5, correct6;
long double correct3 = f->dfunc.f_fpI( 0.0, &correct5 ); long double correct3 =
long double correct4 = f->dfunc.f_fpI( -0.0, &correct6 ); f->dfunc.f_fpI(0.0, &correct5);
float err2 = Bruteforce_Ulp_Error_Double( test, correct3 ); long double correct4 =
float err3 = Bruteforce_Ulp_Error_Double( test, correct4 ); f->dfunc.f_fpI(-0.0, &correct6);
cl_long iErr2 = (long long) q2[j] - (long long) correct5; float err2 =
cl_long iErr3 = (long long) q2[j] - (long long) correct6; Bruteforce_Ulp_Error_Double(test, correct3);
float err3 =
Bruteforce_Ulp_Error_Double(test, correct4);
cl_long iErr2 =
(long long)q2[j] - (long long)correct5;
cl_long iErr3 =
(long long)q2[j] - (long long)correct6;
// Did +0 work? // Did +0 work?
if( fabsf(err2) <= f->double_ulps && abs_cl_long( iErr2 ) <= maxiError ) if (fabsf(err2) <= f->double_ulps
&& abs_cl_long(iErr2) <= maxiError)
{ {
err = err2; err = err2;
iErr = iErr2; iErr = iErr2;
fail = 0; fail = 0;
} }
// Did -0 work? // Did -0 work?
else if(fabsf(err3) <= f->double_ulps && abs_cl_long( iErr3 ) <= maxiError) else if (fabsf(err3) <= f->double_ulps
&& abs_cl_long(iErr3) <= maxiError)
{ {
err = err3; err = err3;
iErr = iErr3; iErr = iErr3;
@@ -672,9 +812,17 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
} }
// retry per section 6.5.3.4 // retry per section 6.5.3.4
if( fail && (IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )) ) if (fail
&& (IsDoubleResultSubnormal(correct2,
f->double_ulps)
|| IsDoubleResultSubnormal(correct3,
f->double_ulps)))
{ {
fail = fail && ! ( test == 0.0f && (abs_cl_long( iErr2 ) <= maxiError || abs_cl_long( iErr3 ) <= maxiError) ); fail = fail
&& !(test == 0.0f
&& (abs_cl_long(iErr2) <= maxiError
|| abs_cl_long(iErr3)
<= maxiError));
if (!fail) if (!fail)
{ {
err = 0.0f; err = 0.0f;
@@ -696,7 +844,11 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
if (fail) if (fail)
{ {
vlog_error( "\nERROR: %sD%s: {%f, %d} ulp error at %.13la: *{%.13la, %d} vs. {%.13la, %d}\n", f->name, sizeNames[k], err, (int) iErr, ((double*) gIn)[j], ((double*) gOut_Ref)[j], ((int*) gOut_Ref2)[j], test, q2[j] ); vlog_error("\nERROR: %sD%s: {%f, %d} ulp error at "
"%.13la: *{%.13la, %d} vs. {%.13la, %d}\n",
f->name, sizeNames[k], err, (int)iErr,
((double *)gIn)[j], ((double *)gOut_Ref)[j],
((int *)gOut_Ref2)[j], test, q2[j]);
error = -1; error = -1;
goto exit; goto exit;
} }
@@ -708,8 +860,10 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
{ {
if (gVerboseBruteForce) if (gVerboseBruteForce)
{ {
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize); vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step,
} else bufferSize);
}
else
{ {
vlog("."); vlog(".");
} }
@@ -732,7 +886,8 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
for (j = 0; j < bufferSize / sizeof(double); j++) for (j = 0; j < bufferSize / sizeof(double); j++)
p[j] = DoubleFromUInt32(genrand_int32(d)); p[j] = DoubleFromUInt32(genrand_int32(d));
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
bufferSize, gIn, 0, NULL, NULL)))
{ {
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
return error; return error;
@@ -744,16 +899,33 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
{ {
size_t vectorSize = sizeValues[j] * sizeof(cl_double); size_t vectorSize = sizeValues[j] * sizeof(cl_double);
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; } &gOutBuffer[j])))
if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } {
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
&gOutBuffer2[j])))
{
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
&gInBuffer)))
{
LogBuildError(programs[j]);
goto exit;
}
double sum = 0.0; double sum = 0.0;
double bestTime = INFINITY; double bestTime = INFINITY;
for (k = 0; k < PERF_LOOP_COUNT; k++) for (k = 0; k < PERF_LOOP_COUNT; k++)
{ {
uint64_t startTime = GetTime(); uint64_t startTime = GetTime();
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
&localCount, NULL, 0, NULL,
NULL)))
{ {
vlog_error("FAILED -- could not execute kernel\n"); vlog_error("FAILED -- could not execute kernel\n");
goto exit; goto exit;
@@ -769,17 +941,17 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
uint64_t endTime = GetTime(); uint64_t endTime = GetTime();
double time = SubtractTime(endTime, startTime); double time = SubtractTime(endTime, startTime);
sum += time; sum += time;
if( time < bestTime ) if (time < bestTime) bestTime = time;
bestTime = time;
} }
if( gReportAverageTimes ) if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
bestTime = sum / PERF_LOOP_COUNT; double clocksPerOp = bestTime * (double)gDeviceFrequency
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) ); * gComputeDevices * gSimdSize * 1e6
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sd%s", f->name, sizeNames[j] ); / (bufferSize / sizeof(double));
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sd%s",
f->name, sizeNames[j]);
} }
for( ; j < gMaxVectorSizeIndex; j++ ) for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- ");
vlog( "\t -- " );
} }
if (!gSkipCorrectnessTesting) if (!gSkipCorrectnessTesting)
@@ -796,6 +968,3 @@ exit:
return error; return error;
} }

View File

@@ -33,25 +33,38 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
cl_program *p, bool relaxedMode) cl_program *p, bool relaxedMode)
{ {
const char *c[] = { const char *c[] = { "__kernel void math_kernel",
"__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global uint", sizeNames[vectorSize], "* in)\n" sizeNames[vectorSize],
"( __global float",
sizeNames[vectorSize],
"* out, __global uint",
sizeNames[vectorSize],
"* in)\n"
"{\n" "{\n"
" int i = get_global_id(0);\n" " int i = get_global_id(0);\n"
" out[i] = ", name, "( in[i] );\n" " out[i] = ",
"}\n" name,
}; "( in[i] );\n"
const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global uint* in)\n" "}\n" };
const char *c3[] = {
"__kernel void math_kernel",
sizeNames[vectorSize],
"( __global float* out, __global uint* in)\n"
"{\n" "{\n"
" size_t i = get_global_id(0);\n" " size_t i = get_global_id(0);\n"
" if( i + 1 < get_global_size(0) )\n" " if( i + 1 < get_global_size(0) )\n"
" {\n" " {\n"
" uint3 u0 = vload3( 0, in + 3 * i );\n" " uint3 u0 = vload3( 0, in + 3 * i );\n"
" float3 f0 = ", name, "( u0 );\n" " float3 f0 = ",
name,
"( u0 );\n"
" vstore3( f0, 0, out + 3*i );\n" " vstore3( f0, 0, out + 3*i );\n"
" }\n" " }\n"
" else\n" " else\n"
" {\n" " {\n"
" size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" " size_t parity = i & 1; // Figure out how many elements are "
"left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
"buffer size \n"
" uint3 u0;\n" " uint3 u0;\n"
" float3 f0;\n" " float3 f0;\n"
" switch( parity )\n" " switch( parity )\n"
@@ -63,7 +76,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
" u0 = (uint3)( in[3*i], in[3*i+1], 0xdead ); \n" " u0 = (uint3)( in[3*i], in[3*i+1], 0xdead ); \n"
" break;\n" " break;\n"
" }\n" " }\n"
" f0 = ", name, "( u0 );\n" " f0 = ",
name,
"( u0 );\n"
" switch( parity )\n" " switch( parity )\n"
" {\n" " {\n"
" case 0:\n" " case 0:\n"
@@ -87,7 +102,8 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
} }
char testName[32]; char testName[32];
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
sizeNames[vectorSize]);
return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
} }
@@ -95,39 +111,55 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
cl_program *p, bool relaxedMode) cl_program *p, bool relaxedMode)
{ {
const char *c[] = { const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", "__kernel void math_kernel",
"__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global ulong", sizeNames[vectorSize], "* in)\n" sizeNames[vectorSize],
"( __global double",
sizeNames[vectorSize],
"* out, __global ulong",
sizeNames[vectorSize],
"* in)\n"
"{\n" "{\n"
" int i = get_global_id(0);\n" " int i = get_global_id(0);\n"
" out[i] = ", name, "( in[i] );\n" " out[i] = ",
"}\n" name,
}; "( in[i] );\n"
"}\n" };
const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
"__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global ulong* in)\n" "__kernel void math_kernel",
sizeNames[vectorSize],
"( __global double* out, __global ulong* in)\n"
"{\n" "{\n"
" size_t i = get_global_id(0);\n" " size_t i = get_global_id(0);\n"
" if( i + 1 < get_global_size(0) )\n" " if( i + 1 < get_global_size(0) )\n"
" {\n" " {\n"
" ulong3 u0 = vload3( 0, in + 3 * i );\n" " ulong3 u0 = vload3( 0, in + 3 * i );\n"
" double3 f0 = ", name, "( u0 );\n" " double3 f0 = ",
name,
"( u0 );\n"
" vstore3( f0, 0, out + 3*i );\n" " vstore3( f0, 0, out + 3*i );\n"
" }\n" " }\n"
" else\n" " else\n"
" {\n" " {\n"
" size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" " size_t parity = i & 1; // Figure out how "
"many elements are left over after BUFFER_SIZE % "
"(3*sizeof(float)). Assume power of two buffer size \n"
" ulong3 u0;\n" " ulong3 u0;\n"
" switch( parity )\n" " switch( parity )\n"
" {\n" " {\n"
" case 1:\n" " case 1:\n"
" u0 = (ulong3)( in[3*i], 0xdeaddeaddeaddeadUL, 0xdeaddeaddeaddeadUL ); \n" " u0 = (ulong3)( in[3*i], "
"0xdeaddeaddeaddeadUL, 0xdeaddeaddeaddeadUL ); \n"
" break;\n" " break;\n"
" case 0:\n" " case 0:\n"
" u0 = (ulong3)( in[3*i], in[3*i+1], 0xdeaddeaddeaddeadUL ); \n" " u0 = (ulong3)( in[3*i], in[3*i+1], "
"0xdeaddeaddeaddeadUL ); \n"
" break;\n" " break;\n"
" }\n" " }\n"
" double3 f0 = ", name, "( u0 );\n" " double3 f0 = ",
name,
"( u0 );\n"
" switch( parity )\n" " switch( parity )\n"
" {\n" " {\n"
" case 0:\n" " case 0:\n"
@@ -138,8 +170,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
" break;\n" " break;\n"
" }\n" " }\n"
" }\n" " }\n"
"}\n" "}\n" };
};
const char **kern = c; const char **kern = c;
size_t kernSize = sizeof(c) / sizeof(c[0]); size_t kernSize = sizeof(c) / sizeof(c[0]);
@@ -152,7 +183,8 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
char testName[32]; char testName[32];
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
sizeNames[vectorSize]);
return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
} }
@@ -166,8 +198,10 @@ typedef struct BuildKernelInfo
bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
} BuildKernelInfo; } BuildKernelInfo;
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) void *p);
static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
void *p)
{ {
BuildKernelInfo *info = (BuildKernelInfo *)p; BuildKernelInfo *info = (BuildKernelInfo *)p;
cl_uint i = info->offset + job_id; cl_uint i = info->offset + job_id;
@@ -175,8 +209,10 @@ static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, voi
info->programs + i, info->relaxedMode); info->programs + i, info->relaxedMode);
} }
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) void *p);
static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
void *p)
{ {
BuildKernelInfo *info = (BuildKernelInfo *)p; BuildKernelInfo *info = (BuildKernelInfo *)p;
cl_uint i = info->offset + job_id; cl_uint i = info->offset + job_id;
@@ -212,23 +248,29 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
// Init the kernels // Init the kernels
BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
f->nameInCode, relaxedMode }; f->nameInCode, relaxedMode };
if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) if ((error = ThreadPool_Do(BuildKernel_FloatFn,
gMaxVectorSizeIndex - gMinVectorSizeIndex,
&build_info)))
return error; return error;
/* /*
for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) ) if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i,
return error; programs + i) ) ) return error;
*/ */
if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos")) if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
{ {
isRangeLimited = 1; isRangeLimited = 1;
half_sin_cos_tan_limit = 1.0f + float_ulps * (FLT_EPSILON/2.0f); // out of range results from finite inputs must be in [-1,1] half_sin_cos_tan_limit = 1.0f
+ float_ulps
* (FLT_EPSILON / 2.0f); // out of range results from finite
// inputs must be in [-1,1]
} }
else if (0 == strcmp(f->name, "half_tan")) else if (0 == strcmp(f->name, "half_tan"))
{ {
isRangeLimited = 1; isRangeLimited = 1;
half_sin_cos_tan_limit = INFINITY; // out of range resut from finite inputs must be numeric half_sin_cos_tan_limit =
INFINITY; // out of range resut from finite inputs must be numeric
} }
@@ -246,7 +288,8 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
for (j = 0; j < bufferSize / sizeof(float); j++) for (j = 0; j < bufferSize / sizeof(float); j++)
p[j] = (uint32_t)i + j; p[j] = (uint32_t)i + j;
} }
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL))) if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
bufferSize, gIn, 0, NULL, NULL)))
{ {
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
return error; return error;
@@ -257,9 +300,12 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
{ {
uint32_t pattern = 0xffffdead; uint32_t pattern = 0xffffdead;
memset_pattern4(gOut[j], &pattern, bufferSize); memset_pattern4(gOut[j], &pattern, bufferSize);
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL))) if ((error =
clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
bufferSize, gOut[j], 0, NULL, NULL)))
{ {
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
error, j);
goto exit; goto exit;
} }
} }
@@ -269,10 +315,22 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
{ {
size_t vectorSize = sizeValues[j] * sizeof(cl_float); size_t vectorSize = sizeValues[j] * sizeof(cl_float);
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ))){ LogBuildError(programs[j]); goto exit; } if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } &gOutBuffer[j])))
{
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
&gInBuffer)))
{
LogBuildError(programs[j]);
goto exit;
}
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL))) if ((error =
clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
&localCount, NULL, 0, NULL, NULL)))
{ {
vlog_error("FAILURE -- could not execute kernel\n"); vlog_error("FAILURE -- could not execute kernel\n");
goto exit; goto exit;
@@ -280,8 +338,7 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
} }
// Get that moving // Get that moving
if( (error = clFlush(gQueue) )) if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
vlog( "clFlush failed\n" );
// Calculate the correctly rounded reference result // Calculate the correctly rounded reference result
float *r = (float *)gOut_Ref; float *r = (float *)gOut_Ref;
@@ -292,15 +349,16 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
// Read the data back // Read the data back
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
{ {
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL))) if ((error =
clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
bufferSize, gOut[j], 0, NULL, NULL)))
{ {
vlog_error("ReadArray failed %d\n", error); vlog_error("ReadArray failed %d\n", error);
goto exit; goto exit;
} }
} }
if( gSkipCorrectnessTesting ) if (gSkipCorrectnessTesting) break;
break;
// Verify data // Verify data
@@ -320,7 +378,9 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
int fail = !(fabsf(err) <= float_ulps); int fail = !(fabsf(err) <= float_ulps);
// half_sin/cos/tan are only valid between +-2**16, Inf, NaN // half_sin/cos/tan are only valid between +-2**16, Inf, NaN
if( isRangeLimited && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) && fabsf(s[j]) < INFINITY ) if (isRangeLimited
&& fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16)
&& fabsf(s[j]) < INFINITY)
{ {
if (fabsf(test) <= half_sin_cos_tan_limit) if (fabsf(test) <= half_sin_cos_tan_limit)
{ {
@@ -337,8 +397,7 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
if (IsFloatResultSubnormal(correct, float_ulps)) if (IsFloatResultSubnormal(correct, float_ulps))
{ {
fail = fail && (test != 0.0f); fail = fail && (test != 0.0f);
if( ! fail ) if (!fail) err = 0.0f;
err = 0.0f;
} }
} }
} }
@@ -349,7 +408,10 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
} }
if (fail) if (fail)
{ {
vlog_error( "\n%s%s: %f ulp error at 0x%8.8x: *%a vs. %a\n", f->name, sizeNames[k], err, ((uint32_t*) gIn)[j], ((float*) gOut_Ref)[j], test ); vlog_error(
"\n%s%s: %f ulp error at 0x%8.8x: *%a vs. %a\n",
f->name, sizeNames[k], err, ((uint32_t *)gIn)[j],
((float *)gOut_Ref)[j], test);
error = -1; error = -1;
goto exit; goto exit;
} }
@@ -361,8 +423,10 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
{ {
if (gVerboseBruteForce) if (gVerboseBruteForce)
{ {
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize); vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step,
} else bufferSize);
}
else
{ {
vlog("."); vlog(".");
} }
@@ -383,7 +447,8 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
{ {
// Init input array // Init input array
uint32_t *p = (uint32_t *)gIn; uint32_t *p = (uint32_t *)gIn;
if( strstr( f->name, "exp" ) || strstr( f->name, "sin" ) || strstr( f->name, "cos" ) || strstr( f->name, "tan" ) ) if (strstr(f->name, "exp") || strstr(f->name, "sin")
|| strstr(f->name, "cos") || strstr(f->name, "tan"))
for (j = 0; j < bufferSize / sizeof(float); j++) for (j = 0; j < bufferSize / sizeof(float); j++)
((float *)p)[j] = (float)genrand_real1(d); ((float *)p)[j] = (float)genrand_real1(d);
else if (strstr(f->name, "log")) else if (strstr(f->name, "log"))
@@ -392,7 +457,8 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
else else
for (j = 0; j < bufferSize / sizeof(float); j++) for (j = 0; j < bufferSize / sizeof(float); j++)
p[j] = genrand_int32(d); p[j] = genrand_int32(d);
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
bufferSize, gIn, 0, NULL, NULL)))
{ {
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
return error; return error;
@@ -404,15 +470,27 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
{ {
size_t vectorSize = sizeValues[j] * sizeof(cl_float); size_t vectorSize = sizeValues[j] * sizeof(cl_float);
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } &gOutBuffer[j])))
{
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
&gInBuffer)))
{
LogBuildError(programs[j]);
goto exit;
}
double sum = 0.0; double sum = 0.0;
double bestTime = INFINITY; double bestTime = INFINITY;
for (k = 0; k < PERF_LOOP_COUNT; k++) for (k = 0; k < PERF_LOOP_COUNT; k++)
{ {
uint64_t startTime = GetTime(); uint64_t startTime = GetTime();
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
&localCount, NULL, 0, NULL,
NULL)))
{ {
vlog_error("FAILURE -- could not execute kernel\n"); vlog_error("FAILURE -- could not execute kernel\n");
goto exit; goto exit;
@@ -428,19 +506,19 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
uint64_t endTime = GetTime(); uint64_t endTime = GetTime();
double time = SubtractTime(endTime, startTime); double time = SubtractTime(endTime, startTime);
sum += time; sum += time;
if( time < bestTime ) if (time < bestTime) bestTime = time;
bestTime = time;
} }
if( gReportAverageTimes ) if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
bestTime = sum / PERF_LOOP_COUNT; double clocksPerOp = bestTime * (double)gDeviceFrequency
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) ); * gComputeDevices * gSimdSize * 1e6
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] ); / (bufferSize / sizeof(float));
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
f->name, sizeNames[j]);
} }
} }
if( ! gSkipCorrectnessTesting ) if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
vlog( "\t%8.2f @ %a", maxError, maxErrorVal );
vlog("\n"); vlog("\n");
exit: exit:
@@ -487,18 +565,18 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
} }
/* /*
for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) ) if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels +
return error; i, programs + i) ) ) return error;
*/ */
for (i = 0; i < (1ULL << 32); i += step) for (i = 0; i < (1ULL << 32); i += step)
{ {
// Init input array // Init input array
cl_ulong *p = (cl_ulong *)gIn; cl_ulong *p = (cl_ulong *)gIn;
for( j = 0; j < bufferSize / sizeof( cl_ulong ); j++ ) for (j = 0; j < bufferSize / sizeof(cl_ulong); j++) p[j] = random64(d);
p[j] = random64(d);
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL))) if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
bufferSize, gIn, 0, NULL, NULL)))
{ {
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
return error; return error;
@@ -509,9 +587,12 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
{ {
uint32_t pattern = 0xffffdead; uint32_t pattern = 0xffffdead;
memset_pattern4(gOut[j], &pattern, bufferSize); memset_pattern4(gOut[j], &pattern, bufferSize);
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL))) if ((error =
clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
bufferSize, gOut[j], 0, NULL, NULL)))
{ {
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
error, j);
goto exit; goto exit;
} }
} }
@@ -521,10 +602,22 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
{ {
size_t vectorSize = sizeValues[j] * sizeof(cl_double); size_t vectorSize = sizeValues[j] * sizeof(cl_double);
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ))){ LogBuildError(programs[j]); goto exit; } if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } &gOutBuffer[j])))
{
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
&gInBuffer)))
{
LogBuildError(programs[j]);
goto exit;
}
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL))) if ((error =
clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
&localCount, NULL, 0, NULL, NULL)))
{ {
vlog_error("FAILURE -- could not execute kernel\n"); vlog_error("FAILURE -- could not execute kernel\n");
goto exit; goto exit;
@@ -532,8 +625,7 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
} }
// Get that moving // Get that moving
if( (error = clFlush(gQueue) )) if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
vlog( "clFlush failed\n" );
// Calculate the correctly rounded reference result // Calculate the correctly rounded reference result
double *r = (double *)gOut_Ref; double *r = (double *)gOut_Ref;
@@ -544,15 +636,16 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
// Read the data back // Read the data back
for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
{ {
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL))) if ((error =
clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
bufferSize, gOut[j], 0, NULL, NULL)))
{ {
vlog_error("ReadArray failed %d\n", error); vlog_error("ReadArray failed %d\n", error);
goto exit; goto exit;
} }
} }
if( gSkipCorrectnessTesting ) if (gSkipCorrectnessTesting) break;
break;
// Verify data // Verify data
@@ -577,11 +670,11 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
if (ftz) if (ftz)
{ {
// retry per section 6.5.3.2 // retry per section 6.5.3.2
if( IsDoubleResultSubnormal(correct, f->double_ulps) ) if (IsDoubleResultSubnormal(correct,
f->double_ulps))
{ {
fail = fail && (test != 0.0); fail = fail && (test != 0.0);
if( ! fail ) if (!fail) err = 0.0f;
err = 0.0f;
} }
} }
} }
@@ -592,7 +685,11 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
} }
if (fail) if (fail)
{ {
vlog_error( "\n%s%sD: %f ulp error at 0x%16.16llx: *%.13la vs. %.13la\n", f->name, sizeNames[k], err, ((uint64_t*) gIn)[j], ((double*) gOut_Ref)[j], test ); vlog_error("\n%s%sD: %f ulp error at 0x%16.16llx: "
"*%.13la vs. %.13la\n",
f->name, sizeNames[k], err,
((uint64_t *)gIn)[j],
((double *)gOut_Ref)[j], test);
error = -1; error = -1;
goto exit; goto exit;
} }
@@ -604,8 +701,10 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
{ {
if (gVerboseBruteForce) if (gVerboseBruteForce)
{ {
vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize); vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step,
} else bufferSize);
}
else
{ {
vlog("."); vlog(".");
} }
@@ -626,9 +725,9 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
// Init input array // Init input array
double *p = (double *)gIn; double *p = (double *)gIn;
for( j = 0; j < bufferSize / sizeof( double ); j++ ) for (j = 0; j < bufferSize / sizeof(double); j++) p[j] = random64(d);
p[j] = random64(d); if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) bufferSize, gIn, 0, NULL, NULL)))
{ {
vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
return error; return error;
@@ -640,15 +739,27 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
{ {
size_t vectorSize = sizeValues[j] * sizeof(cl_double); size_t vectorSize = sizeValues[j] * sizeof(cl_double);
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } &gOutBuffer[j])))
{
LogBuildError(programs[j]);
goto exit;
}
if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
&gInBuffer)))
{
LogBuildError(programs[j]);
goto exit;
}
double sum = 0.0; double sum = 0.0;
double bestTime = INFINITY; double bestTime = INFINITY;
for (k = 0; k < PERF_LOOP_COUNT; k++) for (k = 0; k < PERF_LOOP_COUNT; k++)
{ {
uint64_t startTime = GetTime(); uint64_t startTime = GetTime();
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
&localCount, NULL, 0, NULL,
NULL)))
{ {
vlog_error("FAILURE -- could not execute kernel\n"); vlog_error("FAILURE -- could not execute kernel\n");
goto exit; goto exit;
@@ -664,21 +775,20 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
uint64_t endTime = GetTime(); uint64_t endTime = GetTime();
double time = SubtractTime(endTime, startTime); double time = SubtractTime(endTime, startTime);
sum += time; sum += time;
if( time < bestTime ) if (time < bestTime) bestTime = time;
bestTime = time;
} }
if( gReportAverageTimes ) if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
bestTime = sum / PERF_LOOP_COUNT; double clocksPerOp = bestTime * (double)gDeviceFrequency
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) ); * gComputeDevices * gSimdSize * 1e6
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] ); / (bufferSize / sizeof(double));
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
f->name, sizeNames[j]);
} }
for( ; j < gMaxVectorSizeIndex; j++ ) for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- ");
vlog( "\t -- " );
} }
if( ! gSkipCorrectnessTesting ) if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
vlog( "\t%8.2f @ %a", maxError, maxErrorVal );
vlog("\n"); vlog("\n");
exit: exit:
@@ -691,4 +801,3 @@ exit:
return error; return error;
} }