From e5f89249fa2ac24dd8cc57b5d1f022025c9d2819 Mon Sep 17 00:00:00 2001 From: Marco Antognini Date: Thu, 14 Jan 2021 13:27:18 +0000 Subject: [PATCH] Apply clang-format on math_brute_force (#1104) Signed-off-by: Marco Antognini --- .../math_brute_force/FunctionList.cpp | 79 +- .../math_brute_force/FunctionList.h | 103 +- test_conformance/math_brute_force/Sleep.cpp | 139 +- test_conformance/math_brute_force/Sleep.h | 8 +- test_conformance/math_brute_force/Utility.cpp | 96 +- test_conformance/math_brute_force/Utility.h | 217 +- test_conformance/math_brute_force/binary.cpp | 1889 +++--- .../math_brute_force/binaryOperator.cpp | 1809 ++++-- .../math_brute_force/binary_i.cpp | 1555 +++-- .../math_brute_force/binary_two_results_i.cpp | 1395 ++-- test_conformance/math_brute_force/i_unary.cpp | 665 +- .../math_brute_force/macro_binary.cpp | 1412 +++-- .../math_brute_force/macro_unary.cpp | 1063 ++-- test_conformance/math_brute_force/mad.cpp | 1646 ++--- test_conformance/math_brute_force/main.cpp | 1748 ++--- .../math_brute_force/reference_math.cpp | 5625 +++++++++-------- .../math_brute_force/reference_math.h | 378 +- test_conformance/math_brute_force/ternary.cpp | 1703 +++-- test_conformance/math_brute_force/unary.cpp | 1260 ++-- .../math_brute_force/unary_two_results.cpp | 1115 ++-- .../math_brute_force/unary_two_results_i.cpp | 865 ++- test_conformance/math_brute_force/unary_u.cpp | 745 ++- 22 files changed, 14745 insertions(+), 10770 deletions(-) diff --git a/test_conformance/math_brute_force/FunctionList.cpp b/test_conformance/math_brute_force/FunctionList.cpp index a07fa069..c5185c6f 100644 --- a/test_conformance/math_brute_force/FunctionList.cpp +++ b/test_conformance/math_brute_force/FunctionList.cpp @@ -16,13 +16,13 @@ #include "FunctionList.h" #include "reference_math.h" -#define FTZ_ON 1 +#define FTZ_ON 1 #define FTZ_OFF 0 -#define EXACT 0.0f +#define EXACT 0.0f #define RELAXED_ON 1 #define RELAXED_OFF 0 -#define STRINGIFY( _s) #_s +#define STRINGIFY(_s) #_s // 
Only use ulps information in spir test #ifdef FUNCTION_LIST_ULPS_ONLY @@ -51,25 +51,25 @@ STRINGIFY(_name), _operator, { NULL }, { NULL }, { NULL }, _ulp, _ulp, \ _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type \ } -#define unaryF NULL -#define i_unaryF NULL -#define unaryF_u NULL -#define macro_unaryF NULL -#define binaryF NULL -#define binaryF_nextafter NULL -#define binaryOperatorF NULL -#define binaryF_i NULL -#define macro_binaryF NULL -#define ternaryF NULL -#define unaryF_two_results NULL -#define unaryF_two_results_i NULL +#define unaryF NULL +#define i_unaryF NULL +#define unaryF_u NULL +#define macro_unaryF NULL +#define binaryF NULL +#define binaryF_nextafter NULL +#define binaryOperatorF NULL +#define binaryF_i NULL +#define macro_binaryF NULL +#define ternaryF NULL +#define unaryF_two_results NULL +#define unaryF_two_results_i NULL #define binaryF_two_results_i NULL -#define mad_function NULL +#define mad_function NULL -#define reference_sqrt NULL -#define reference_sqrtl NULL -#define reference_divide NULL -#define reference_dividel NULL +#define reference_sqrt NULL +#define reference_sqrtl NULL +#define reference_divide NULL +#define reference_dividel NULL #define reference_relaxed_divide NULL #else // FUNCTION_LIST_ULPS_ONLY @@ -102,24 +102,27 @@ _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type \ } -extern const vtbl _unary; // float foo( float ) -extern const vtbl _unary_u; // float foo( uint ), double foo( ulong ) -extern const vtbl _i_unary; // int foo( float ) -extern const vtbl _macro_unary; // int foo( float ), returns {0,1} for scalar, { 0, -1 } for vector -extern const vtbl _binary; // float foo( float, float ) -extern const vtbl _binary_nextafter; // float foo( float, float ), special handling for nextafter -extern const vtbl _binary_operator; // float .op. 
float -extern const vtbl _macro_binary; // int foo( float, float ), returns {0,1} for scalar, { 0, -1 } for vector -extern const vtbl _binary_i; // float foo( float, int ) -extern const vtbl _ternary; // float foo( float, float, float ) -extern const vtbl _unary_two_results; // float foo( float, float * ) +extern const vtbl _unary; // float foo( float ) +extern const vtbl _unary_u; // float foo( uint ), double foo( ulong ) +extern const vtbl _i_unary; // int foo( float ) +extern const vtbl _macro_unary; // int foo( float ), returns {0,1} for scalar, + // { 0, -1 } for vector +extern const vtbl _binary; // float foo( float, float ) +extern const vtbl _binary_nextafter; // float foo( float, float ), special + // handling for nextafter +extern const vtbl _binary_operator; // float .op. float +extern const vtbl _macro_binary; // int foo( float, float ), returns {0,1} for + // scalar, { 0, -1 } for vector +extern const vtbl _binary_i; // float foo( float, int ) +extern const vtbl _ternary; // float foo( float, float, float ) +extern const vtbl _unary_two_results; // float foo( float, float * ) extern const vtbl _unary_two_results_i; // float foo( float, int * ) extern const vtbl _binary_two_results_i; // float foo( float, float, int * ) -extern const vtbl _mad_tbl; // float mad( float, float, float ) +extern const vtbl _mad_tbl; // float mad( float, float, float ) #define unaryF &_unary #define i_unaryF &_i_unary -#define unaryF_u &_unary_u +#define unaryF_u &_unary_u #define macro_unaryF &_macro_unary #define binaryF &_binary #define binaryF_nextafter &_binary_nextafter @@ -127,10 +130,10 @@ extern const vtbl _mad_tbl; // float mad( float, float, float ) #define binaryF_i &_binary_i #define macro_binaryF &_macro_binary #define ternaryF &_ternary -#define unaryF_two_results &_unary_two_results -#define unaryF_two_results_i &_unary_two_results_i -#define binaryF_two_results_i &_binary_two_results_i -#define mad_function &_mad_tbl +#define unaryF_two_results 
&_unary_two_results +#define unaryF_two_results_i &_unary_two_results_i +#define binaryF_two_results_i &_binary_two_results_i +#define mad_function &_mad_tbl #endif // FUNCTION_LIST_ULPS_ONLY @@ -325,4 +328,4 @@ const Func functionList[] = { OPERATOR_ENTRY(not, "!", 0.0f, 0.0f, FTZ_OFF, macro_unaryF), }; -const size_t functionListCount = sizeof( functionList ) / sizeof( functionList[0] ); +const size_t functionListCount = sizeof(functionList) / sizeof(functionList[0]); diff --git a/test_conformance/math_brute_force/FunctionList.h b/test_conformance/math_brute_force/FunctionList.h index c22bceeb..e47eb729 100644 --- a/test_conformance/math_brute_force/FunctionList.h +++ b/test_conformance/math_brute_force/FunctionList.h @@ -22,80 +22,77 @@ #include #endif -#if defined( __APPLE__ ) - #include +#if defined(__APPLE__) +#include #else - #include +#include #endif #include "harness/mt19937.h" -typedef union fptr -{ - void *p; - double (*f_f)(double); - double (*f_u)(cl_uint); - int (*i_f)(double); - int (*i_f_f)(float); - float (*f_ff_f)(float, float); - double (*f_ff)(double, double); - int (*i_ff)(double, double); - double (*f_fi)(double, int); - double (*f_fpf)(double, double*); - double (*f_fpI)(double, int*); - double (*f_ffpI)(double, double, int*); - double (*f_fff)(double, double, double ); - float (*f_fma)(float, float, float, int); -}fptr; +typedef union fptr { + void *p; + double (*f_f)(double); + double (*f_u)(cl_uint); + int (*i_f)(double); + int (*i_f_f)(float); + float (*f_ff_f)(float, float); + double (*f_ff)(double, double); + int (*i_ff)(double, double); + double (*f_fi)(double, int); + double (*f_fpf)(double, double *); + double (*f_fpI)(double, int *); + double (*f_ffpI)(double, double, int *); + double (*f_fff)(double, double, double); + float (*f_fma)(float, float, float, int); +} fptr; -typedef union dptr -{ - void *p; - long double (*f_f)(long double); - long double (*f_u)(cl_ulong); - int (*i_f)(long double); - long double (*f_ff)(long double, 
long double); - int (*i_ff)(long double, long double); - long double (*f_fi)(long double, int); - long double (*f_fpf)(long double, long double*); - long double (*f_fpI)(long double, int*); - long double (*f_ffpI)(long double, long double, int*); - long double (*f_fff)(long double, long double, long double); -}dptr; +typedef union dptr { + void *p; + long double (*f_f)(long double); + long double (*f_u)(cl_ulong); + int (*i_f)(long double); + long double (*f_ff)(long double, long double); + int (*i_ff)(long double, long double); + long double (*f_fi)(long double, int); + long double (*f_fpf)(long double, long double *); + long double (*f_fpI)(long double, int *); + long double (*f_ffpI)(long double, long double, int *); + long double (*f_fff)(long double, long double, long double); +} dptr; struct Func; typedef struct vtbl { - const char *type_name; + const char *type_name; int (*TestFunc)(const struct Func *, MTdata, bool); int (*DoubleTestFunc)( const struct Func *, MTdata, bool); // may be NULL if function is single precision only -}vtbl; +} vtbl; typedef struct Func { - const char *name; // common name, to be used as an argument in the shell - const char *nameInCode; // name as it appears in the __kernel, usually the same as name, but different for multiplication - fptr func; - dptr dfunc; - fptr rfunc; - float float_ulps; - float double_ulps; - float float_embedded_ulps; - float relaxed_error; - float relaxed_embedded_error; - int ftz; - int relaxed; - const vtbl *vtbl_ptr; -}Func; + const char *name; // common name, to be used as an argument in the shell + const char *nameInCode; // name as it appears in the __kernel, usually the + // same as name, but different for multiplication + fptr func; + dptr dfunc; + fptr rfunc; + float float_ulps; + float double_ulps; + float float_embedded_ulps; + float relaxed_error; + float relaxed_embedded_error; + int ftz; + int relaxed; + const vtbl *vtbl_ptr; +} Func; -extern const Func functionList[]; +extern const Func 
functionList[]; extern const size_t functionListCount; #endif - - diff --git a/test_conformance/math_brute_force/Sleep.cpp b/test_conformance/math_brute_force/Sleep.cpp index 4d3b2c64..7103779e 100644 --- a/test_conformance/math_brute_force/Sleep.cpp +++ b/test_conformance/math_brute_force/Sleep.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -16,103 +16,94 @@ #include "Sleep.h" #include "Utility.h" -#if defined( __APPLE__ ) - #include - #include +#if defined(__APPLE__) +#include +#include - struct - { - io_connect_t connection; - IONotificationPortRef port; - io_object_t iterator; - }sleepInfo; +struct +{ + io_connect_t connection; + IONotificationPortRef port; + io_object_t iterator; +} sleepInfo; - void sleepCallback( void * refcon, - io_service_t service, - natural_t messageType, - void * messageArgument ); +void sleepCallback(void* refcon, io_service_t service, natural_t messageType, + void* messageArgument); - void sleepCallback( void * refcon UNUSED, - io_service_t service UNUSED, - natural_t messageType, - void * messageArgument ) - { +void sleepCallback(void* refcon UNUSED, io_service_t service UNUSED, + natural_t messageType, void* messageArgument) +{ - IOReturn result; + IOReturn result; /* service -- The IOService whose state has changed. - messageType -- A messageType enum, defined by IOKit/IOMessage.h or by the IOService's family. - messageArgument -- An argument for the message, dependent on the messageType. + messageType -- A messageType enum, defined by IOKit/IOMessage.h or by the + IOService's family. messageArgument -- An argument for the message, + dependent on the messageType. 
*/ - switch ( messageType ) - { - case kIOMessageSystemWillSleep: - // Handle demand sleep (such as sleep caused by running out of - // batteries, closing the lid of a laptop, or selecting - // sleep from the Apple menu. - IOAllowPowerChange(sleepInfo.connection,(long)messageArgument); - vlog( "Hard sleep occurred.\n" ); - break; - case kIOMessageCanSystemSleep: - // In this case, the computer has been idle for several minutes - // and will sleep soon so you must either allow or cancel - // this notification. Important: if you don’t respond, there will - // be a 30-second timeout before the computer sleeps. - // IOCancelPowerChange(root_port,(long)messageArgument); - result = IOCancelPowerChange(sleepInfo.connection,(long)messageArgument); - if( kIOReturnSuccess != result ) - vlog( "sleep prevention failed. (%d)\n", result); + switch (messageType) + { + case kIOMessageSystemWillSleep: + // Handle demand sleep (such as sleep caused by running out of + // batteries, closing the lid of a laptop, or selecting + // sleep from the Apple menu. + IOAllowPowerChange(sleepInfo.connection, (long)messageArgument); + vlog("Hard sleep occurred.\n"); + break; + case kIOMessageCanSystemSleep: + // In this case, the computer has been idle for several minutes + // and will sleep soon so you must either allow or cancel + // this notification. Important: if you don’t respond, there will + // be a 30-second timeout before the computer sleeps. + // IOCancelPowerChange(root_port,(long)messageArgument); + result = IOCancelPowerChange(sleepInfo.connection, + (long)messageArgument); + if (kIOReturnSuccess != result) + vlog("sleep prevention failed. (%d)\n", result); + break; + case kIOMessageSystemHasPoweredOn: + // Handle wakeup. break; - case kIOMessageSystemHasPoweredOn: - // Handle wakeup. - break; - } } +} #endif - - - -void PreventSleep( void ) +void PreventSleep(void) { -#if defined( __APPLE__ ) - vlog( "Disabling sleep... 
" ); - sleepInfo.iterator = (io_object_t) 0; +#if defined(__APPLE__) + vlog("Disabling sleep... "); + sleepInfo.iterator = (io_object_t)0; sleepInfo.port = NULL; - sleepInfo.connection = IORegisterForSystemPower - ( - &sleepInfo, //void * refcon, - &sleepInfo.port, //IONotificationPortRef * thePortRef, - sleepCallback, //IOServiceInterestCallback callback, - &sleepInfo.iterator //io_object_t * notifier - ); + sleepInfo.connection = IORegisterForSystemPower( + &sleepInfo, // void * refcon, + &sleepInfo.port, // IONotificationPortRef * thePortRef, + sleepCallback, // IOServiceInterestCallback callback, + &sleepInfo.iterator // io_object_t * notifier + ); - if( (io_connect_t) 0 == sleepInfo.connection ) - vlog( "failed.\n" ); + if ((io_connect_t)0 == sleepInfo.connection) + vlog("failed.\n"); else - vlog( "done.\n" ); + vlog("done.\n"); CFRunLoopAddSource(CFRunLoopGetCurrent(), - IONotificationPortGetRunLoopSource(sleepInfo.port), - kCFRunLoopDefaultMode); + IONotificationPortGetRunLoopSource(sleepInfo.port), + kCFRunLoopDefaultMode); #else - vlog( "*** PreventSleep() is not implemented on this platform.\n" ); + vlog("*** PreventSleep() is not implemented on this platform.\n"); #endif } -void ResumeSleep( void ) +void ResumeSleep(void) { -#if defined( __APPLE__ ) - IOReturn result = IODeregisterForSystemPower ( &sleepInfo.iterator ); - if( 0 != result ) - vlog( "Got error %d restoring sleep \n", result ); +#if defined(__APPLE__) + IOReturn result = IODeregisterForSystemPower(&sleepInfo.iterator); + if (0 != result) + vlog("Got error %d restoring sleep \n", result); else - vlog( "Sleep restored.\n" ); + vlog("Sleep restored.\n"); #else - vlog( "*** ResumeSleep() is not implemented on this platform.\n" ); + vlog("*** ResumeSleep() is not implemented on this platform.\n"); #endif } - - - diff --git a/test_conformance/math_brute_force/Sleep.h b/test_conformance/math_brute_force/Sleep.h index f983a32f..ca643954 100644 --- a/test_conformance/math_brute_force/Sleep.h +++ 
b/test_conformance/math_brute_force/Sleep.h @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -16,9 +16,7 @@ #ifndef SLEEP_H #define SLEEP_H -void PreventSleep( void ); -void ResumeSleep( void ); +void PreventSleep(void); +void ResumeSleep(void); #endif /* SLEEP_H */ - - diff --git a/test_conformance/math_brute_force/Utility.cpp b/test_conformance/math_brute_force/Utility.cpp index 9ab7c7fa..3d8d9baa 100644 --- a/test_conformance/math_brute_force/Utility.cpp +++ b/test_conformance/math_brute_force/Utility.cpp @@ -17,9 +17,9 @@ #include "FunctionList.h" #if defined(__PPC__) -// Global varaiable used to hold the FPU control register state. The FPSCR register can not -// be used because not all Power implementations retain or observed the NI (non-IEEE -// mode) bit. +// Global varaiable used to hold the FPU control register state. The FPSCR +// register can not be used because not all Power implementations retain or +// observed the NI (non-IEEE mode) bit. 
__thread fpu_control_t fpu_control = 0; #endif @@ -28,16 +28,16 @@ void MulD(double *rhi, double *rlo, double u, double v) const double c = 134217729.0; // 1+2^27 double up, u1, u2, vp, v1, v2; - up = u*c; + up = u * c; u1 = (u - up) + up; u2 = u - u1; - vp = v*c; + vp = v * c; v1 = (v - vp) + vp; v2 = v - v1; - double rh = u*v; - double rl = (((u1*v1 - rh) + (u1*v2)) + (u2*v1)) + (u2*v2); + double rh = u * v; + double rl = (((u1 * v1 - rh) + (u1 * v2)) + (u2 * v1)) + (u2 * v2); *rhi = rh; *rlo = rl; @@ -47,11 +47,13 @@ void AddD(double *rhi, double *rlo, double a, double b) { double zhi, zlo; zhi = a + b; - if(fabs(a) > fabs(b)) { + if (fabs(a) > fabs(b)) + { zlo = zhi - a; zlo = b - zlo; } - else { + else + { zlo = zhi - b; zlo = a - zlo; } @@ -66,17 +68,17 @@ void MulDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl) double c = 134217729.0; double up, u1, u2, vp, v1, v2; - up = xh*c; + up = xh * c; u1 = (xh - up) + up; u2 = xh - u1; - vp = yh*c; + vp = yh * c; v1 = (yh - vp) + vp; v2 = yh - v1; - mh = xh*yh; - ml = (((u1*v1 - mh) + (u1*v2)) + (u2*v1)) + (u2*v2); - ml += xh*yl + xl*yh; + mh = xh * yh; + ml = (((u1 * v1 - mh) + (u1 * v2)) + (u2 * v1)) + (u2 * v2); + ml += xh * yl + xl * yh; *rhi = mh + ml; *rlo = (mh - (*rhi)) + ml; @@ -86,7 +88,8 @@ void AddDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl) { double r, s; r = xh + yh; - s = (fabs(xh) > fabs(yh)) ? (xh - r + yh + yl + xl) : (yh - r + xh + xl + yl); + s = (fabs(xh) > fabs(yh)) ? (xh - r + yh + yl + xl) + : (yh - r + xh + xl + yl); *rhi = r + s; *rlo = (r - (*rhi)) + s; } @@ -100,72 +103,61 @@ void DivideDD(double *chi, double *clo, double a, double b) *clo = rhi / b; } -// These functions comapre two floats/doubles. Since some platforms may choose to -// flush denormals to zeros before comparison, comparison like a < b may give wrong -// result in "certain cases" where we do need correct compasion result when operands -// are denormals .... 
these functions comapre floats/doubles using signed integer/long int -// rep. In other cases, when flushing to zeros is fine, these should not be used. -// Also these doesn't check for nans and assume nans are handled separately as special edge case -// by the caller which calls these functions -// return 0 if both are equal, 1 if x > y and -1 if x < y. +// These functions comapre two floats/doubles. Since some platforms may choose +// to flush denormals to zeros before comparison, comparison like a < b may give +// wrong result in "certain cases" where we do need correct compasion result +// when operands are denormals .... these functions comapre floats/doubles using +// signed integer/long int rep. In other cases, when flushing to zeros is fine, +// these should not be used. Also these doesn't check for nans and assume nans +// are handled separately as special edge case by the caller which calls these +// functions return 0 if both are equal, 1 if x > y and -1 if x < y. -inline -int compareFloats(float x, float y) +inline int compareFloats(float x, float y) { int32f_t a, b; a.f = x; b.f = y; - if( a.i & 0x80000000 ) - a.i = 0x80000000 - a.i; - if( b.i & 0x80000000 ) - b.i = 0x80000000 - b.i; + if (a.i & 0x80000000) a.i = 0x80000000 - a.i; + if (b.i & 0x80000000) b.i = 0x80000000 - b.i; - if( a.i == b.i ) - return 0; + if (a.i == b.i) return 0; return a.i < b.i ? -1 : 1; } -inline -int compareDoubles(double x, double y) +inline int compareDoubles(double x, double y) { int64d_t a, b; a.d = x; b.d = y; - if( a.l & 0x8000000000000000LL ) - a.l = 0x8000000000000000LL - a.l; - if( b.l & 0x8000000000000000LL ) - b.l = 0x8000000000000000LL - b.l; + if (a.l & 0x8000000000000000LL) a.l = 0x8000000000000000LL - a.l; + if (b.l & 0x8000000000000000LL) b.l = 0x8000000000000000LL - b.l; - if( a.l == b.l ) - return 0; + if (a.l == b.l) return 0; return a.l < b.l ? 
-1 : 1; } -void logFunctionInfo(const char *fname, unsigned int float_size, unsigned int isFastRelaxed) +void logFunctionInfo(const char *fname, unsigned int float_size, + unsigned int isFastRelaxed) { char const *fpSizeStr = NULL; char const *fpFastRelaxedStr = ""; - switch (float_size) { - case sizeof(cl_double): - fpSizeStr = "fp64"; - break; - case sizeof(cl_float): - fpSizeStr = "fp32"; - break; - case sizeof(cl_half): - fpSizeStr = "fp16"; - break; + switch (float_size) + { + case sizeof(cl_double): fpSizeStr = "fp64"; break; + case sizeof(cl_float): fpSizeStr = "fp32"; break; + case sizeof(cl_half): fpSizeStr = "fp16"; break; } - if (isFastRelaxed) { + if (isFastRelaxed) + { fpFastRelaxedStr = "rlx"; } - vlog("%15s %4s %4s",fname, fpSizeStr, fpFastRelaxedStr); + vlog("%15s %4s %4s", fname, fpSizeStr, fpFastRelaxedStr); } float getAllowedUlpError(const Func *f, const bool relaxed) diff --git a/test_conformance/math_brute_force/Utility.h b/test_conformance/math_brute_force/Utility.h index 92f8f3dc..dd3c5e56 100644 --- a/test_conformance/math_brute_force/Utility.h +++ b/test_conformance/math_brute_force/Utility.h @@ -30,13 +30,13 @@ #include "harness/ThreadPool.h" #include "harness/conversions.h" -#define BUFFER_SIZE (1024*1024*2) +#define BUFFER_SIZE (1024 * 1024 * 2) #define EMBEDDED_REDUCTION_FACTOR (64) -#if defined( __GNUC__ ) - #define UNUSED __attribute__ ((unused)) +#if defined(__GNUC__) +#define UNUSED __attribute__((unused)) #else - #define UNUSED +#define UNUSED #endif struct Func; @@ -44,62 +44,62 @@ struct Func; extern int gWimpyBufferSize; extern int gWimpyReductionFactor; -#define VECTOR_SIZE_COUNT 6 +#define VECTOR_SIZE_COUNT 6 extern const char *sizeNames[VECTOR_SIZE_COUNT]; -extern const int sizeValues[VECTOR_SIZE_COUNT]; +extern const int sizeValues[VECTOR_SIZE_COUNT]; -extern cl_device_id gDevice; -extern cl_context gContext; +extern cl_device_id gDevice; +extern cl_context gContext; extern cl_command_queue gQueue; -extern void *gIn; -extern 
void *gIn2; -extern void *gIn3; -extern void *gOut_Ref; -extern void *gOut_Ref2; -extern void *gOut[VECTOR_SIZE_COUNT]; -extern void *gOut2[VECTOR_SIZE_COUNT]; -extern cl_mem gInBuffer; -extern cl_mem gInBuffer2; -extern cl_mem gInBuffer3; -extern cl_mem gOutBuffer[VECTOR_SIZE_COUNT]; -extern cl_mem gOutBuffer2[VECTOR_SIZE_COUNT]; -extern uint32_t gComputeDevices; -extern uint32_t gSimdSize; -extern int gSkipCorrectnessTesting; -extern int gMeasureTimes; -extern int gReportAverageTimes; -extern int gForceFTZ; -extern int gFastRelaxedDerived; -extern int gWimpyMode; -extern int gHasDouble; -extern int gIsInRTZMode; -extern int gInfNanSupport; -extern int gIsEmbedded; -extern int gVerboseBruteForce; -extern uint32_t gMaxVectorSizeIndex; -extern uint32_t gMinVectorSizeIndex; -extern uint32_t gDeviceFrequency; +extern void *gIn; +extern void *gIn2; +extern void *gIn3; +extern void *gOut_Ref; +extern void *gOut_Ref2; +extern void *gOut[VECTOR_SIZE_COUNT]; +extern void *gOut2[VECTOR_SIZE_COUNT]; +extern cl_mem gInBuffer; +extern cl_mem gInBuffer2; +extern cl_mem gInBuffer3; +extern cl_mem gOutBuffer[VECTOR_SIZE_COUNT]; +extern cl_mem gOutBuffer2[VECTOR_SIZE_COUNT]; +extern uint32_t gComputeDevices; +extern uint32_t gSimdSize; +extern int gSkipCorrectnessTesting; +extern int gMeasureTimes; +extern int gReportAverageTimes; +extern int gForceFTZ; +extern int gFastRelaxedDerived; +extern int gWimpyMode; +extern int gHasDouble; +extern int gIsInRTZMode; +extern int gInfNanSupport; +extern int gIsEmbedded; +extern int gVerboseBruteForce; +extern uint32_t gMaxVectorSizeIndex; +extern uint32_t gMinVectorSizeIndex; +extern uint32_t gDeviceFrequency; extern cl_device_fp_config gFloatCapabilities; extern cl_device_fp_config gDoubleCapabilities; -#define LOWER_IS_BETTER 0 -#define HIGHER_IS_BETTER 1 +#define LOWER_IS_BETTER 0 +#define HIGHER_IS_BETTER 1 #include "harness/errorHelpers.h" -#if defined (_MSC_VER ) - //Deal with missing scalbn on windows - #define scalbnf( _a, _i ) 
ldexpf( _a, _i ) - #define scalbn( _a, _i ) ldexp( _a, _i ) - #define scalbnl( _a, _i ) ldexpl( _a, _i ) +#if defined(_MSC_VER) +// Deal with missing scalbn on windows +#define scalbnf(_a, _i) ldexpf(_a, _i) +#define scalbn(_a, _i) ldexp(_a, _i) +#define scalbnl(_a, _i) ldexpl(_a, _i) #endif -float Abs_Error( float test, double reference ); -float Ulp_Error( float test, double reference ); -float Bruteforce_Ulp_Error_Double( double test, long double reference ); +float Abs_Error(float test, double reference); +float Ulp_Error(float test, double reference); +float Bruteforce_Ulp_Error_Double(double test, long double reference); -uint64_t GetTime( void ); -double SubtractTime( uint64_t endTime, uint64_t startTime ); +uint64_t GetTime(void); +double SubtractTime(uint64_t endTime, uint64_t startTime); int MakeKernel(const char **c, cl_uint count, const char *name, cl_kernel *k, cl_program *p, bool relaxedMode); int MakeKernels(const char **c, cl_uint count, const char *name, @@ -107,69 +107,84 @@ int MakeKernels(const char **c, cl_uint count, const char *name, bool relaxedMode); // used to convert a bucket of bits into a search pattern through double -static inline double DoubleFromUInt32( uint32_t bits ); -static inline double DoubleFromUInt32( uint32_t bits ) +static inline double DoubleFromUInt32(uint32_t bits); +static inline double DoubleFromUInt32(uint32_t bits) { - union{ uint64_t u; double d;} u; + union { + uint64_t u; + double d; + } u; // split 0x89abcdef to 0x89abc00000000def u.u = bits & 0xfffU; - u.u |= (uint64_t) (bits & ~0xfffU) << 32; + u.u |= (uint64_t)(bits & ~0xfffU) << 32; - // sign extend the leading bit of def segment as sign bit so that the middle region consists of either all 1s or 0s + // sign extend the leading bit of def segment as sign bit so that the middle + // region consists of either all 1s or 0s u.u -= (bits & 0x800U) << 1; // return result return u.d; } -void _LogBuildError( cl_program p, int line, const char *file ); -#define 
LogBuildError( program ) _LogBuildError( program, __LINE__, __FILE__ ) +void _LogBuildError(cl_program p, int line, const char *file); +#define LogBuildError(program) _LogBuildError(program, __LINE__, __FILE__) #define PERF_LOOP_COUNT 100 -//The spec is fairly clear that we may enforce a hard cutoff to prevent premature flushing to zero. -// However, to avoid conflict for 1.0, we are letting results at TYPE_MIN + ulp_limit to be flushed to zero. -static inline int IsFloatResultSubnormal( double x, float ulps ) +// The spec is fairly clear that we may enforce a hard cutoff to prevent +// premature flushing to zero. +// However, to avoid conflict for 1.0, we are letting results at TYPE_MIN + +// ulp_limit to be flushed to zero. +static inline int IsFloatResultSubnormal(double x, float ulps) { - x = fabs(x) - MAKE_HEX_DOUBLE( 0x1.0p-149, 0x1, -149) * (double) ulps; - return x < MAKE_HEX_DOUBLE( 0x1.0p-126, 0x1, -126 ); + x = fabs(x) - MAKE_HEX_DOUBLE(0x1.0p-149, 0x1, -149) * (double)ulps; + return x < MAKE_HEX_DOUBLE(0x1.0p-126, 0x1, -126); } -static inline int IsFloatResultSubnormalAbsError( double x , float abs_err) +static inline int IsFloatResultSubnormalAbsError(double x, float abs_err) { - x = x - abs_err; - return x < MAKE_HEX_DOUBLE( 0x1.0p-126, 0x1, -126 ); + x = x - abs_err; + return x < MAKE_HEX_DOUBLE(0x1.0p-126, 0x1, -126); } -static inline int IsDoubleResultSubnormal( long double x, float ulps ) +static inline int IsDoubleResultSubnormal(long double x, float ulps) { - x = fabsl(x) - MAKE_HEX_LONG( 0x1.0p-1074, 0x1, -1074) * (long double) ulps; - return x < MAKE_HEX_LONG( 0x1.0p-1022, 0x1, -1022 ); + x = fabsl(x) - MAKE_HEX_LONG(0x1.0p-1074, 0x1, -1074) * (long double)ulps; + return x < MAKE_HEX_LONG(0x1.0p-1022, 0x1, -1022); } static inline int IsFloatInfinity(double x) { - union { cl_float d; cl_uint u; } u; - u.d = (cl_float) x; - return ((u.u & 0x7fffffffU) == 0x7F800000U); + union { + cl_float d; + cl_uint u; + } u; + u.d = (cl_float)x; + return 
((u.u & 0x7fffffffU) == 0x7F800000U); } static inline int IsFloatMaxFloat(double x) { - union { cl_float d; cl_uint u; } u; - u.d = (cl_float) x; - return ((u.u & 0x7fffffffU) == 0x7F7FFFFFU); + union { + cl_float d; + cl_uint u; + } u; + u.d = (cl_float)x; + return ((u.u & 0x7fffffffU) == 0x7F7FFFFFU); } static inline int IsFloatNaN(double x) { - union { cl_float d; cl_uint u; } u; - u.d = (cl_float) x; - return ((u.u & 0x7fffffffU) > 0x7F800000U); + union { + cl_float d; + cl_uint u; + } u; + u.d = (cl_float)x; + return ((u.u & 0x7fffffffU) > 0x7F800000U); } -extern cl_uint RoundUpToNextPowerOfTwo( cl_uint x ); +extern cl_uint RoundUpToNextPowerOfTwo(cl_uint x); // Windows (since long double got deprecated) sets the x87 to 53-bit precision // (that's x87 default state). This causes problems with the tests that @@ -186,46 +201,50 @@ static inline void Force64BitFPUPrecision(void) // divergent code just use inline assembly which works for both. unsigned short int orig_cw = 0; unsigned short int new_cw = 0; - __asm__ __volatile__ ("fstcw %0":"=m" (orig_cw)); - new_cw = orig_cw | 0x0300; // set precision to 64-bit - __asm__ __volatile__ ("fldcw %0"::"m" (new_cw)); -#elif defined( _WIN32 ) && defined( __INTEL_COMPILER ) - // Unfortunately, usual method (`_controlfp( _PC_64, _MCW_PC );') does *not* work on win.x64: - // > On the x64 architecture, changing the floating point precision is not supported. - // (Taken from http://msdn.microsoft.com/en-us/library/e9b52ceh%28v=vs.100%29.aspx) + __asm__ __volatile__("fstcw %0" : "=m"(orig_cw)); + new_cw = orig_cw | 0x0300; // set precision to 64-bit + __asm__ __volatile__("fldcw %0" ::"m"(new_cw)); +#elif defined(_WIN32) && defined(__INTEL_COMPILER) + // Unfortunately, usual method (`_controlfp( _PC_64, _MCW_PC );') does *not* + // work on win.x64: > On the x64 architecture, changing the floating point + // precision is not supported. 
(Taken from + // http://msdn.microsoft.com/en-us/library/e9b52ceh%28v=vs.100%29.aspx) int cw; - __asm { fnstcw cw }; // Get current value of FPU control word. - cw = cw & 0xfffffcff | ( 3 << 8 ); // Set Precision Control to Double Extended Precision. - __asm { fldcw cw }; // Set new value of FPU control word. + __asm { fnstcw cw } + ; // Get current value of FPU control word. + cw = cw & 0xfffffcff + | (3 << 8); // Set Precision Control to Double Extended Precision. + __asm { fldcw cw } + ; // Set new value of FPU control word. #else /* Implement for other platforms if needed */ #endif } -extern -void memset_pattern4(void *dest, const void *src_pattern, size_t bytes ); +extern void memset_pattern4(void *dest, const void *src_pattern, size_t bytes); -typedef union -{ +typedef union { int32_t i; - float f; -}int32f_t; + float f; +} int32f_t; -typedef union -{ +typedef union { int64_t l; - double d; -}int64d_t; + double d; +} int64d_t; void MulD(double *rhi, double *rlo, double u, double v); void AddD(double *rhi, double *rlo, double a, double b); -void MulDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl); -void AddDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl); +void MulDD(double *rhi, double *rlo, double xh, double xl, double yh, + double yl); +void AddDD(double *rhi, double *rlo, double xh, double xl, double yh, + double yl); void DivideDD(double *chi, double *clo, double a, double b); int compareFloats(float x, float y); int compareDoubles(double x, double y); -void logFunctionInfo(const char *fname, unsigned int float_size, unsigned int isFastRelaxed); +void logFunctionInfo(const char *fname, unsigned int float_size, + unsigned int isFastRelaxed); float getAllowedUlpError(const Func *f, const bool relaxed); diff --git a/test_conformance/math_brute_force/binary.cpp b/test_conformance/math_brute_force/binary.cpp index 0b8be27b..db961c8d 100644 --- a/test_conformance/math_brute_force/binary.cpp +++ 
b/test_conformance/math_brute_force/binary.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -46,63 +46,82 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p, bool relaxedMode) { - const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in1, __global float", sizeNames[vectorSize], "* in2 )\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in1[i], in2[i] );\n" - "}\n" - }; + const char *c[] = { "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float", + sizeNames[vectorSize], + "* out, __global float", + sizeNames[vectorSize], + "* in1, __global float", + sizeNames[vectorSize], + "* in2 )\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i] );\n" + "}\n" }; - const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* in, __global float* in2)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " float3 f0 = vload3( 0, in + 3 * i );\n" - " float3 f1 = vload3( 0, in2 + 3 * i );\n" - " f0 = ", name, "( f0, f1 );\n" - " vstore3( f0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two buffer size \n" - " float3 f0, f1;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (float3)( in[3*i], NAN, NAN ); \n" - " f1 = (float3)( in2[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" - " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " f0 = ", name, "( f0, f1 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + const char *c3[] = { + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float* out, __global float* in, __global float* in2)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " float3 f0 = vload3( 0, in + 3 * i );\n" + " float3 f1 = vload3( 0, in2 + 3 * i );\n" + " f0 = ", + name, + "( f0, f1 );\n" + " vstore3( f0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " float3 f0, f1;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (float3)( in[3*i], NAN, NAN ); \n" + " f1 = (float3)( in2[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" + " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " f0 = ", + name, + "( f0, f1 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, relaxedMode); @@ -112,65 +131,84 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p, bool relaxedMode) { - const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in1, __global double", sizeNames[vectorSize], "* in2 )\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in1[i], in2[i] );\n" - "}\n" - }; + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double", + sizeNames[vectorSize], + "* out, __global double", + sizeNames[vectorSize], + "* in1, __global double", + sizeNames[vectorSize], + "* in2 )\n" + "{\n" + " int i = 
get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i] );\n" + "}\n" }; - const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global double* in, __global double* in2)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " double3 d0 = vload3( 0, in + 3 * i );\n" - " double3 d1 = vload3( 0, in2 + 3 * i );\n" - " d0 = ", name, "( d0, d1 );\n" - " vstore3( d0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" - " double3 d0, d1;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " d0 = (double3)( in[3*i], NAN, NAN ); \n" - " d1 = (double3)( in2[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" - " d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " d0 = ", name, "( d0, d1 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = d0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = d0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double* out, __global double* in, __global double* in2)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " double3 d0 = vload3( 0, in + 3 * i );\n" + " double3 d1 = vload3( 0, in2 + 3 * i );\n" + " d0 = ", + name, + "( d0, d1 );\n" + " vstore3( d0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " double3 d0, d1;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " d0 = (double3)( in[3*i], NAN, NAN ); \n" + " d1 = (double3)( in2[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" + " d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " d0 = ", + name, + "( d0, d1 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = d0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = d0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, relaxedMode); @@ -178,115 +216,215 @@ static int BuildKernelDouble(const char *name, int vectorSize, // A table of more difficult cases to get right static const float specialValuesFloat[] = { - -NAN, -INFINITY, -FLT_MAX, MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39), MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38), - MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), -1000.f, -100.f, -4.0f, -3.5f, - -3.0f, MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), -2.5f, 
MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), -2.0f, MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), -1.5f, MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), -1.0f, MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25), - MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), -0.5f, MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26), MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), -0.25f, MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27), - MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), -FLT_MIN, MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150), - MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), -0.0f, + -NAN, + -INFINITY, + -FLT_MAX, + MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), + MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), + MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39), + MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), + MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), + MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38), + MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), + MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), + MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), + MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), + MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), + MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), + -1000.f, + -100.f, + -4.0f, + -3.5f, + -3.0f, + MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), + -2.5f, + MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), + -2.0f, + MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), + -1.5f, + MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24), 
+ MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), + -1.0f, + MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25), + MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), + -0.5f, + MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26), + MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), + -0.25f, + MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27), + MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), + -FLT_MIN, + MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), + MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), + MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), + MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), + MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), + MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150), + MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), + MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), + MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), + MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), + -0.0f, - +NAN, +INFINITY, +FLT_MAX, MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38), - MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), +1000.f, +100.f, +4.0f, +3.5f, - +3.0f, MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), 2.5f, MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),+2.0f, MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), 1.5f, MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), +1.0f, MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25), - MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), +0.5f, 
MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), +0.25f, MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27), - MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), +FLT_MIN, MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150), - MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), +0.0f + +NAN, + +INFINITY, + +FLT_MAX, + MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), + MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), + MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), + MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), + MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), + MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38), + MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), + MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), + MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), + MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), + MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), + MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), + +1000.f, + +100.f, + +4.0f, + +3.5f, + +3.0f, + MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), + 2.5f, + MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23), + +2.0f, + MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), + 1.5f, + MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), + MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), + +1.0f, + MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25), + MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), + +0.5f, + MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), + MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), + +0.25f, + MAKE_HEX_FLOAT(+0x1.fffffep-3f, 
+0x1fffffeL, -27), + MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), + +FLT_MIN, + MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), + MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), + MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), + MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), + MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), + MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150), + MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), + MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), + MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), + MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), + +0.0f }; -static size_t specialValuesFloatCount = sizeof( specialValuesFloat ) / sizeof( specialValuesFloat[0] ); +static size_t specialValuesFloatCount = + sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]); typedef struct BuildKernelInfo { - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *nameInCode; + cl_uint offset; // the first vector size to build + cl_uint kernel_count; + cl_kernel **kernels; + cl_program *programs; + const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-}BuildKernelInfo; +} BuildKernelInfo; -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernel(info->nameInCode, i, info->kernel_count, info->kernels[i], info->programs + i, info->relaxedMode); } -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernelDouble(info->nameInCode, i, info->kernel_count, info->kernels[i], info->programs + i, info->relaxedMode); } -//Thread specific data for a worker thread +// Thread specific data for a worker thread typedef struct ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem inBuf2; // input buffer for the thread - cl_mem outBuf[ VECTOR_SIZE_COUNT ]; // output buffers for the thread - float maxError; // max error value. Init to 0. - double maxErrorValue; // position of the max error value (param 1). Init to 0. - double maxErrorValue2; // position of the max error value (param 2). Init to 0. 
- MTdata d; - cl_command_queue tQueue; // per thread command queue to improve performance -}ThreadInfo; + cl_mem inBuf; // input buffer for the thread + cl_mem inBuf2; // input buffer for the thread + cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + float maxError; // max error value. Init to 0. + double + maxErrorValue; // position of the max error value (param 1). Init to 0. + double maxErrorValue2; // position of the max error value (param 2). Init + // to 0. + MTdata d; + cl_command_queue tQueue; // per thread command queue to improve performance +} ThreadInfo; typedef struct TestInfo { - size_t subBufferSize; // Size of the sub-buffer in elements - const Func *f; // A pointer to the function info - cl_program programs[ VECTOR_SIZE_COUNT ]; // programs for various vector sizes - cl_kernel *k[VECTOR_SIZE_COUNT ]; // arrays of thread-specific kernels for each worker thread: k[vector_size][thread_id] - ThreadInfo *tinfo; // An array of thread specific information for each worker thread - cl_uint threadCount; // Number of worker threads - cl_uint jobCount; // Number of jobs - cl_uint step; // step between each chunk and the next. - cl_uint scale; // stride between individual test values - float ulps; // max_allowed ulps - int ftz; // non-zero if running in flush to zero mode + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes + cl_kernel + *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each + // worker thread: k[vector_size][thread_id] + ThreadInfo * + tinfo; // An array of thread specific information for each worker thread + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. 
+ cl_uint scale; // stride between individual test values + float ulps; // max_allowed ulps + int ftz; // non-zero if running in flush to zero mode - int isFDim; - int skipNanInf; - int isNextafter; + int isFDim; + int skipNanInf; + int isNextafter; bool relaxedMode; // True if test is running in relaxed mode, false // otherwise. } TestInfo; -static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p ); +static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p); int TestFunc_Float_Float_Float_common(const Func *f, MTdata d, int isNextafter, bool relaxedMode) { - TestInfo test_info; - cl_int error; - size_t i, j; - float maxError = 0.0f; - double maxErrorVal = 0.0; - double maxErrorVal2 = 0.0; - int skipTestingRelaxed = 0; + TestInfo test_info; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; + double maxErrorVal2 = 0.0; + int skipTestingRelaxed = 0; logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); // Init test_info - memset( &test_info, 0, sizeof( test_info ) ); + memset(&test_info, 0, sizeof(test_info)); test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); test_info.scale = getTestScale(sizeof(cl_float)); - if (gWimpyMode){ - test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + if (gWimpyMode) + { + test_info.subBufferSize = gWimpyBufferSize + / (sizeof(cl_float) + * RoundUpToNextPowerOfTwo(test_info.threadCount)); } - test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale; + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; if (test_info.step / test_info.subBufferSize != test_info.scale) { - //there was overflow + // there was overflow test_info.jobCount = 1; } else @@ -296,62 +434,83 @@ int 
TestFunc_Float_Float_Float_common(const Func *f, MTdata d, int isNextafter, test_info.f = f; test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps; - test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); - test_info.isFDim = 0 == strcmp( "fdim", f->nameInCode ); - test_info.skipNanInf = test_info.isFDim && ! gInfNanSupport; + test_info.isFDim = 0 == strcmp("fdim", f->nameInCode); + test_info.skipNanInf = test_info.isFDim && !gInfNanSupport; test_info.isNextafter = isNextafter; test_info.relaxedMode = relaxedMode; - // cl_kernels aren't thread safe, so we make one for each vector size for every thread - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { - size_t array_size = test_info.threadCount * sizeof( cl_kernel ); - test_info.k[i] = (cl_kernel*)malloc( array_size ); - if( NULL == test_info.k[i] ) + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel *)malloc(array_size); + if (NULL == test_info.k[i]) { - vlog_error( "Error: Unable to allocate storage for kernels!\n" ); + vlog_error("Error: Unable to allocate storage for kernels!\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.k[i], 0, array_size ); + memset(test_info.k[i], 0, array_size); } - test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) ); - if( NULL == test_info.tinfo ) + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) { - vlog_error( "Error: Unable to allocate storage for thread specific data.\n" ); + vlog_error( + "Error: Unable to allocate storage for thread specific data.\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( 
test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) ); - for( i = 0; i < test_info.threadCount; i++ ) + memset(test_info.tinfo, 0, + test_info.threadCount * sizeof(*test_info.tinfo)); + for (i = 0; i < test_info.threadCount; i++) { - cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) }; - test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if( error || NULL == test_info.tinfo[i].inBuf) + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_float), + test_info.subBufferSize * sizeof(cl_float) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); goto exit; } - test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if( error || NULL == test_info.tinfo[i].inBuf2 ) + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf2) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer2 for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); goto exit; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if( error || NULL 
== test_info.tinfo[i].outBuf[j] ) + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) { - vlog_error( "Error: Unable to create sub-buffer of gOutBuffer[%d] for region {%zd, %zd}\n", (int) j, region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of " + "gOutBuffer[%d] for region {%zd, %zd}\n", + (int)j, region.origin, region.size); goto exit; } } - test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error); - if( NULL == test_info.tinfo[i].tQueue || error ) + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) { - vlog_error( "clCreateCommandQueue failed. (%d)\n", error ); + vlog_error("clCreateCommandQueue failed. (%d)\n", error); goto exit; } @@ -364,19 +523,21 @@ int TestFunc_Float_Float_Float_common(const Func *f, MTdata d, int isNextafter, gMinVectorSizeIndex, test_info.threadCount, test_info.k, test_info.programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_FloatFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) goto exit; } // Run the kernels - if( !gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info ); + error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info); // Accumulate the arithmetic errors - for( i = 0; i < test_info.threadCount; i++ ) + for (i = 0; i < test_info.threadCount; i++) { - if( test_info.tinfo[i].maxError > maxError ) + if (test_info.tinfo[i].maxError > maxError) { maxError = test_info.tinfo[i].maxError; maxErrorVal = test_info.tinfo[i].maxErrorValue; @@ -384,176 +545,200 @@ int TestFunc_Float_Float_Float_common(const Func *f, MTdata d, int isNextafter, 
} } - if( error ) - goto exit; + if (error) goto exit; - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input arrays + // Init input arrays uint32_t *p = (uint32_t *)gIn; uint32_t *p2 = (uint32_t *)gIn2; - for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ ) + for (j = 0; j < BUFFER_SIZE / sizeof(float); j++) { p[j] = (genrand_int32(d) & ~0x40000000) | 0x20000000; p2[j] = 0x3fc00000; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + BUFFER_SIZE, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE, gIn2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + BUFFER_SIZE, gIn2, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - size_t vectorSize = sizeof( cl_float ) * sizeValues[j]; - size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; // BUFFER_SIZE / vectorSize rounded up - if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; } - if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; } - if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); 
goto exit; } + size_t vectorSize = sizeof(cl_float) * sizeValues[j]; + size_t localCount = (BUFFER_SIZE + vectorSize - 1) + / vectorSize; // BUFFER_SIZE / vectorSize rounded up + if ((error = clSetKernelArg(test_info.k[j][0], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 2, + sizeof(gInBuffer2), &gInBuffer2))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( i = 0; i < PERF_LOOP_COUNT; i++ ) + for (i = 0; i < PERF_LOOP_COUNT; i++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], + 1, NULL, &localCount, NULL, + 0, NULL, NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * 
gComputeDevices * gSimdSize * 1e6 + / (BUFFER_SIZE / sizeof(float)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", + f->name, sizeNames[j]); } } - if( ! gSkipCorrectnessTesting ) - vlog( "\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2 ); - vlog( "\n" ); + if (!gSkipCorrectnessTesting) + vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); + vlog("\n"); exit: - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { clReleaseProgram(test_info.programs[i]); - if( test_info.k[i] ) + if (test_info.k[i]) { - for( j = 0; j < test_info.threadCount; j++ ) + for (j = 0; j < test_info.threadCount; j++) clReleaseKernel(test_info.k[i][j]); - free( test_info.k[i] ); + free(test_info.k[i]); } } - if( test_info.tinfo ) + if (test_info.tinfo) { - for( i = 0; i < test_info.threadCount; i++ ) + for (i = 0; i < test_info.threadCount; i++) { - free_mtdata( test_info.tinfo[i].d ); + free_mtdata(test_info.tinfo[i].d); clReleaseMemObject(test_info.tinfo[i].inBuf); clReleaseMemObject(test_info.tinfo[i].inBuf2); - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) clReleaseMemObject(test_info.tinfo[i].outBuf[j]); clReleaseCommandQueue(test_info.tinfo[i].tQueue); } - free( test_info.tinfo ); + free(test_info.tinfo); } return error; } -static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data ) +static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data) { - const TestInfo *job = (const TestInfo *) data; - size_t buffer_elements = job->subBufferSize; - size_t buffer_size = buffer_elements * sizeof( cl_float ); - cl_uint base = job_id * (cl_uint) job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; - fptr func = job->f->func; - int ftz = job->ftz; + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * 
sizeof(cl_float); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = job->tinfo + thread_id; + fptr func = job->f->func; + int ftz = job->ftz; bool relaxedMode = job->relaxedMode; float ulps = getAllowedUlpError(job->f, relaxedMode); - MTdata d = tinfo->d; - cl_uint j, k; - cl_int error; - cl_uchar *overflow = (cl_uchar*)malloc(buffer_size); - const char *name = job->f->name; - int isFDim = job->isFDim; - int skipNanInf = job->skipNanInf; - int isNextafter = job->isNextafter; - cl_uint *t = 0; - float *r=0,*s=0,*s2=0; + MTdata d = tinfo->d; + cl_uint j, k; + cl_int error; + cl_uchar *overflow = (cl_uchar *)malloc(buffer_size); + const char *name = job->f->name; + int isFDim = job->isFDim; + int skipNanInf = job->skipNanInf; + int isNextafter = job->isNextafter; + cl_uint *t = 0; + float *r = 0, *s = 0, *s2 = 0; cl_int copysign_test = 0; RoundingMode oldRoundMode; int skipVerification = 0; if (relaxedMode) { - if (strcmp(name,"pow")==0 && gFastRelaxedDerived) - { - func = job->f->rfunc; - ulps = INFINITY; - skipVerification = 1; - }else - { - func = job->f->rfunc; - } + if (strcmp(name, "pow") == 0 && gFastRelaxedDerived) + { + func = job->f->rfunc; + ulps = INFINITY; + skipVerification = 1; + } + else + { + func = job->f->rfunc; + } } // start the map of the output arrays - cl_event e[ VECTOR_SIZE_COUNT ]; - cl_uint *out[ VECTOR_SIZE_COUNT ]; - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + cl_event e[VECTOR_SIZE_COUNT]; + cl_uint *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - out[j] = (cl_uint*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error); - if( error || NULL == out[j]) + out[j] = (cl_uint *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); return error; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); - //Init input array + // Init input array cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements; cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements; j = 0; - int totalSpecialValueCount = specialValuesFloatCount * specialValuesFloatCount; + int totalSpecialValueCount = + specialValuesFloatCount * specialValuesFloatCount; int indx = (totalSpecialValueCount - 1) / buffer_elements; if (job_id <= (cl_uint)indx) @@ -562,91 +747,111 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data ) float *fp2 = (float *)p2; uint32_t x, y; - x = (job_id * buffer_elements) % specialValuesFloatCount; - y = (job_id * buffer_elements) / specialValuesFloatCount; + x = (job_id * buffer_elements) % specialValuesFloatCount; + y = (job_id * buffer_elements) / specialValuesFloatCount; - for( ; j < buffer_elements; j++ ) + for (; j < buffer_elements; j++) { fp[j] = specialValuesFloat[x]; fp2[j] = specialValuesFloat[y]; - if( ++x >= specialValuesFloatCount ) + if (++x >= specialValuesFloatCount) { x = 0; y++; - if( y >= specialValuesFloatCount ) - break; + if (y >= specialValuesFloatCount) break; } } } - //Init any remaining values. - for( ; j < buffer_elements; j++ ) + // Init any remaining values. + for (; j < buffer_elements; j++) { p[j] = genrand_int32(d); p2[j] = genrand_int32(d); } - if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueWriteBuffer failed! 
err: %d\n", error); goto exit; } - if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size, p2, 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); goto exit; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - //Wait for the map to finish - if( (error = clWaitForEvents(1, e + j) )) + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) { - vlog_error( "Error: clWaitForEvents failed! err: %d\n", error ); + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); goto exit; } - if( (error = clReleaseEvent( e[j] ) )) + if ((error = clReleaseEvent(e[j]))) { - vlog_error( "Error: clReleaseEvent failed! err: %d\n", error ); + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); goto exit; } - // Fill the result buffer with garbage, so that old results don't carry over + // Fill the result buffer with garbage, so that old results don't carry + // over uint32_t pattern = 0xffffdead; memset_pattern4(out[j], &pattern, buffer_size); - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) )) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueMapBuffer failed! 
err: %d\n", error); goto exit; } // run the kernel - size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; - cl_kernel kernel = job->k[j][thread_id]; //each worker thread has its own copy of the cl_kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel cl_program program = job->programs[j]; - if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; } - if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; } - if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; } - - if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL))) + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 2 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); - if( gSkipCorrectnessTesting ) + if (gSkipCorrectnessTesting) { - if( (error = clFinish(tinfo->tQueue)) ) + if ((error = clFinish(tinfo->tQueue))) { - vlog_error( "Error: clFinish failed! 
err: %d\n", error ); - goto exit; + vlog_error("Error: clFinish failed! err: %d\n", error); + goto exit; } free(overflow); return CL_SUCCESS; @@ -654,105 +859,111 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data ) FPU_mode_type oldMode; oldRoundMode = kRoundToNearestEven; - if( isFDim ) + if (isFDim) { - //Calculate the correctly rounded reference result - memset( &oldMode, 0, sizeof( oldMode ) ); - if( ftz ) - ForceFTZ( &oldMode ); + // Calculate the correctly rounded reference result + memset(&oldMode, 0, sizeof(oldMode)); + if (ftz) ForceFTZ(&oldMode); // Set the rounding mode to match the device - if (gIsInRTZMode) - oldRoundMode = set_round(kRoundTowardZero, kfloat); + if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat); } - if(!strcmp(name, "copysign")) - copysign_test = 1; + if (!strcmp(name, "copysign")) copysign_test = 1; -#define ref_func(s, s2) (copysign_test ? func.f_ff_f( s, s2 ) : func.f_ff( s, s2 )) +#define ref_func(s, s2) (copysign_test ? 
func.f_ff_f(s, s2) : func.f_ff(s, s2)) - //Calculate the correctly rounded reference result - r = (float *)gOut_Ref + thread_id * buffer_elements; - s = (float *)gIn + thread_id * buffer_elements; - s2 = (float *)gIn2 + thread_id * buffer_elements; - if( skipNanInf ) + // Calculate the correctly rounded reference result + r = (float *)gOut_Ref + thread_id * buffer_elements; + s = (float *)gIn + thread_id * buffer_elements; + s2 = (float *)gIn2 + thread_id * buffer_elements; + if (skipNanInf) { - for( j = 0; j < buffer_elements; j++ ) + for (j = 0; j < buffer_elements; j++) { feclearexcept(FE_OVERFLOW); - r[j] = (float) ref_func( s[j], s2[j] ); - overflow[j] = FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)); + r[j] = (float)ref_func(s[j], s2[j]); + overflow[j] = + FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)); } } else { - for( j = 0; j < buffer_elements; j++ ) - r[j] = (float) ref_func( s[j], s2[j] ); + for (j = 0; j < buffer_elements; j++) + r[j] = (float)ref_func(s[j], s2[j]); } - if( isFDim && ftz ) - RestoreFPState( &oldMode ); + if (isFDim && ftz) RestoreFPState(&oldMode); - // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue. - for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ ) + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) { - out[j] = (cl_uint*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (cl_uint *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, + error); goto exit; } } // Wait for the last buffer - out[j] = (cl_uint*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (cl_uint *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, buffer_size, + 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); goto exit; } - if (!skipVerification) { - //Verify data + if (!skipVerification) + { + // Verify data t = (cl_uint *)r; - for( j = 0; j < buffer_elements; j++ ) + for (j = 0; j < buffer_elements; j++) { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { cl_uint *q = out[k]; // If we aren't getting the correctly rounded result - if( t[j] != q[j] ) + if (t[j] != q[j]) { - float test = ((float*) q)[j]; - double correct = ref_func( s[j], s2[j] ); + float test = ((float *)q)[j]; + double correct = ref_func(s[j], s2[j]); - // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow - // As per OpenCL 2.0 spec, section 5.8.4.3, enabling fast-relaxed-math mode also enables - // -cl-finite-math-only optimization. This optimization allows to assume that arguments and - // results are not NaNs or +/-INFs. Hence, accept any result if inputs or results are NaNs or INFs. + // Per section 10 paragraph 6, accept any result if an input + // or output is a infinity or NaN or overflow As per + // OpenCL 2.0 spec, section 5.8.4.3, enabling + // fast-relaxed-math mode also enables -cl-finite-math-only + // optimization. This optimization allows to assume that + // arguments and results are not NaNs or +/-INFs. Hence, + // accept any result if inputs or results are NaNs or INFs. 
if (relaxedMode || skipNanInf) { - if( skipNanInf && overflow[j]) - continue; - // Note: no double rounding here. Reference functions calculate in single precision. - if( IsFloatInfinity(correct) || IsFloatNaN(correct) || - IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j]) || - IsFloatInfinity(s[j]) || IsFloatNaN(s[j]) ) + if (skipNanInf && overflow[j]) continue; + // Note: no double rounding here. Reference functions + // calculate in single precision. + if (IsFloatInfinity(correct) || IsFloatNaN(correct) + || IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j]) + || IsFloatInfinity(s[j]) || IsFloatNaN(s[j])) continue; } - float err = Ulp_Error( test, correct ); - int fail = ! (fabsf(err) <= ulps); + float err = Ulp_Error(test, correct); + int fail = !(fabsf(err) <= ulps); - if( fail && ftz ) + if (fail && ftz) { // retry per section 6.5.3.2 - if( IsFloatResultSubnormal(correct, ulps ) ) + if (IsFloatResultSubnormal(correct, ulps)) { - fail = fail && ( test != 0.0f ); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } // nextafter on FTZ platforms may return the smallest @@ -765,171 +976,203 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data ) // normal number is the next representable number. // In which case, it should have the same sign as the // second argument. 
- if (isNextafter ) + if (isNextafter) { - if(IsFloatSubnormal(s[j]) || s[j] == 0.0f) + if (IsFloatSubnormal(s[j]) || s[j] == 0.0f) { float value = copysignf(twoToMinus126, s2[j]); fail = fail && (test != value); - if (!fail) - err = 0.0f; + if (!fail) err = 0.0f; } } else { // retry per section 6.5.3.3 - if( IsFloatSubnormal( s[j] ) ) + if (IsFloatSubnormal(s[j])) { double correct2, correct3; float err2, err3; - if( skipNanInf ) - feclearexcept(FE_OVERFLOW); + if (skipNanInf) feclearexcept(FE_OVERFLOW); - correct2 = ref_func( 0.0, s2[j] ); - correct3 = ref_func( -0.0, s2[j] ); + correct2 = ref_func(0.0, s2[j]); + correct3 = ref_func(-0.0, s2[j]); - // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow - // As per OpenCL 2.0 spec, section 5.8.4.3, enabling fast-relaxed-math mode also enables - // -cl-finite-math-only optimization. This optimization allows to assume that arguments and - // results are not NaNs or +/-INFs. Hence, accept any result if inputs or results are NaNs or INFs. + // Per section 10 paragraph 6, accept any result + // if an input or output is a infinity or NaN or + // overflow As per OpenCL 2.0 spec, + // section 5.8.4.3, enabling fast-relaxed-math + // mode also enables -cl-finite-math-only + // optimization. This optimization allows to + // assume that arguments and results are not + // NaNs or +/-INFs. Hence, accept any result if + // inputs or results are NaNs or INFs. if (relaxedMode || skipNanInf) { - if( fetestexcept(FE_OVERFLOW) && skipNanInf ) + if (fetestexcept(FE_OVERFLOW) && skipNanInf) continue; - // Note: no double rounding here. Reference functions calculate in single precision. - if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) || - IsFloatInfinity(correct3) || IsFloatNaN(correct3) ) + // Note: no double rounding here. Reference + // functions calculate in single precision. 
+ if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3)) continue; } - err2 = Ulp_Error( test, correct2 ); - err3 = Ulp_Error( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; + err2 = Ulp_Error(test, correct2); + err3 = Ulp_Error(test, correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if( IsFloatResultSubnormal( correct2, ulps ) || IsFloatResultSubnormal( correct3, ulps ) ) + if (IsFloatResultSubnormal(correct2, ulps) + || IsFloatResultSubnormal(correct3, ulps)) { - fail = fail && ( test != 0.0f); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } - //try with both args as zero - if( IsFloatSubnormal( s2[j] ) ) + // try with both args as zero + if (IsFloatSubnormal(s2[j])) { double correct4, correct5; float err4, err5; - if( skipNanInf ) - feclearexcept(FE_OVERFLOW); + if (skipNanInf) feclearexcept(FE_OVERFLOW); - correct2 = ref_func( 0.0, 0.0 ); - correct3 = ref_func( -0.0, 0.0 ); - correct4 = ref_func( 0.0, -0.0 ); - correct5 = ref_func( -0.0, -0.0 ); + correct2 = ref_func(0.0, 0.0); + correct3 = ref_func(-0.0, 0.0); + correct4 = ref_func(0.0, -0.0); + correct5 = ref_func(-0.0, -0.0); - // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow - // As per OpenCL 2.0 spec, section 5.8.4.3, enabling fast-relaxed-math mode also enables - // -cl-finite-math-only optimization. This optimization allows to assume that arguments and - // results are not NaNs or +/-INFs. Hence, accept any result if inputs or results are NaNs or INFs. 
+ // Per section 10 paragraph 6, accept any + // result if an input or output is a + // infinity or NaN or overflow As per + // OpenCL 2.0 spec, section 5.8.4.3, + // enabling fast-relaxed-math mode also + // enables -cl-finite-math-only + // optimization. This optimization allows to + // assume that arguments and results are not + // NaNs or +/-INFs. Hence, accept any result + // if inputs or results are NaNs or INFs. if (relaxedMode || skipNanInf) { - if( fetestexcept(FE_OVERFLOW) && skipNanInf ) + if (fetestexcept(FE_OVERFLOW) + && skipNanInf) continue; - // Note: no double rounding here. Reference functions calculate in single precision. - if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) || - IsFloatInfinity(correct3) || IsFloatNaN(correct3) || - IsFloatInfinity(correct4) || IsFloatNaN(correct4) || - IsFloatInfinity(correct5) || IsFloatNaN(correct5) ) + // Note: no double rounding here. + // Reference functions calculate in + // single precision. + if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3) + || IsFloatInfinity(correct4) + || IsFloatNaN(correct4) + || IsFloatInfinity(correct5) + || IsFloatNaN(correct5)) continue; } - err2 = Ulp_Error( test, correct2 ); - err3 = Ulp_Error( test, correct3 ); - err4 = Ulp_Error( test, correct4 ); - err5 = Ulp_Error( test, correct5 ); - fail = fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)) && - (!(fabsf(err4) <= ulps)) && (!(fabsf(err5) <= ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( fabsf( err4 ) < fabsf(err ) ) - err = err4; - if( fabsf( err5 ) < fabsf(err ) ) - err = err5; + err2 = Ulp_Error(test, correct2); + err3 = Ulp_Error(test, correct3); + err4 = Ulp_Error(test, correct4); + err5 = Ulp_Error(test, correct5); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps)) + && (!(fabsf(err4) <= ulps)) + && (!(fabsf(err5) <= ulps))); + if (fabsf(err2) < 
fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; // retry per section 6.5.3.4 - if( IsFloatResultSubnormal( correct2, ulps ) || IsFloatResultSubnormal( correct3, ulps ) || - IsFloatResultSubnormal( correct4, ulps ) || IsFloatResultSubnormal( correct5, ulps ) ) + if (IsFloatResultSubnormal(correct2, ulps) + || IsFloatResultSubnormal(correct3, + ulps) + || IsFloatResultSubnormal(correct4, + ulps) + || IsFloatResultSubnormal(correct5, + ulps)) { - fail = fail && ( test != 0.0f); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } } } - else if(IsFloatSubnormal(s2[j]) ) + else if (IsFloatSubnormal(s2[j])) { double correct2, correct3; float err2, err3; - if( skipNanInf ) - feclearexcept(FE_OVERFLOW); + if (skipNanInf) feclearexcept(FE_OVERFLOW); - correct2 = ref_func( s[j], 0.0 ); - correct3 = ref_func( s[j], -0.0 ); + correct2 = ref_func(s[j], 0.0); + correct3 = ref_func(s[j], -0.0); - // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow - // As per OpenCL 2.0 spec, section 5.8.4.3, enabling fast-relaxed-math mode also enables - // -cl-finite-math-only optimization. This optimization allows to assume that arguments and - // results are not NaNs or +/-INFs. Hence, accept any result if inputs or results are NaNs or INFs. + // Per section 10 paragraph 6, accept any result + // if an input or output is a infinity or NaN or + // overflow As per OpenCL 2.0 spec, + // section 5.8.4.3, enabling fast-relaxed-math + // mode also enables -cl-finite-math-only + // optimization. This optimization allows to + // assume that arguments and results are not + // NaNs or +/-INFs. Hence, accept any result if + // inputs or results are NaNs or INFs. if (relaxedMode || skipNanInf) { - // Note: no double rounding here. Reference functions calculate in single precision. 
- if( overflow[j] && skipNanInf) - continue; + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (overflow[j] && skipNanInf) continue; - if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) || - IsFloatInfinity(correct3) || IsFloatNaN(correct3) ) + if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3)) continue; } - err2 = Ulp_Error( test, correct2 ); - err3 = Ulp_Error( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; + err2 = Ulp_Error(test, correct2); + err3 = Ulp_Error(test, correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if( IsFloatResultSubnormal( correct2, ulps ) || IsFloatResultSubnormal( correct3, ulps ) ) + if (IsFloatResultSubnormal(correct2, ulps) + || IsFloatResultSubnormal(correct3, ulps)) { - fail = fail && ( test != 0.0f); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } } } } - if( fabsf(err ) > tinfo->maxError ) + if (fabsf(err) > tinfo->maxError) { tinfo->maxError = fabsf(err); tinfo->maxErrorValue = s[j]; tinfo->maxErrorValue2 = s2[j]; } - if( fail ) + if (fail) { - vlog_error( "\nERROR: %s%s: %f ulp error at {%a (0x%x), %a (0x%x)}: *%a vs. %a (0x%8.8x) at index: %d\n", name, sizeNames[k], err, s[j], ((cl_uint*)s)[j], s2[j], ((cl_uint*)s2)[j], r[j], test, ((cl_uint*)&test)[0], j ); + vlog_error( + "\nERROR: %s%s: %f ulp error at {%a (0x%x), %a " + "(0x%x)}: *%a vs. 
%a (0x%8.8x) at index: %d\n", + name, sizeNames[k], err, s[j], ((cl_uint *)s)[j], + s2[j], ((cl_uint *)s2)[j], r[j], test, + ((cl_uint *)&test)[0], j); error = -1; goto exit; } @@ -938,93 +1181,192 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data ) } } - if (isFDim && gIsInRTZMode) - (void)set_round(oldRoundMode, kfloat); + if (isFDim && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat); - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) ) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error ); + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); return error; } } - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 3 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); - if( 0 == ( base & 0x0fffffff) ) + if (0 == (base & 0x0fffffff)) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->ulps, job->threadCount); - } else + vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else { - vlog("." 
); + vlog("."); } fflush(stdout); } exit: - if( overflow ) - free( overflow ); + if (overflow) free(overflow); return error; - } // A table of more difficult cases to get right static const double specialValuesDouble[] = { - -NAN, -INFINITY, -DBL_MAX, MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11), MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10), - MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), -1000., -100., -4.0, -3.5, - -3.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), -2.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), -2.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), -1.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), -1.0, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53), - MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), -0.5, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54), MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), -0.25, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55), - MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), -DBL_MIN, MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, 
-0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), -0.0, + -NAN, + -INFINITY, + -DBL_MAX, + MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), + MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11), + MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), + MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10), + MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), + MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), + MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), + MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), + -1000., + -100., + -4.0, + -3.5, + -3.0, + MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), + -2.5, + MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), + -2.0, + MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), + -1.5, + MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52), + MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), + -1.0, + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53), + MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, 
-0x10000000000001LL, -53), + -0.5, + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54), + MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), + -0.25, + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55), + MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), + -DBL_MIN, + MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), + MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), + MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), + MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), + MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), + -0.0, - +NAN, +INFINITY, +DBL_MAX, MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11), MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10), - MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8), MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21), MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), 
MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22), +1000., +100., +4.0, +3.5, - +3.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), +2.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), +2.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), +1.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), +1.0, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53), - MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53), +0.5, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54), MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), +0.25, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55), - MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), +DBL_MIN, MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), +0.0, + +NAN, + +INFINITY, + +DBL_MAX, + MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), + 
MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11), + MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), + MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10), + MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8), + MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21), + MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21), + MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22), + +1000., + +100., + +4.0, + +3.5, + +3.0, + MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), + +2.5, + MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), + +2.0, + MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), + +1.5, + MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52), + MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), + +1.0, + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53), + MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53), + +0.5, + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54), + MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), + +0.25, + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55), + MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), + +DBL_MIN, + MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), + MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), + 
MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), + +0.0, }; -static size_t specialValuesDoubleCount = sizeof( specialValuesDouble ) / sizeof( specialValuesDouble[0] ); +static size_t specialValuesDoubleCount = + sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]); -static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *p ); +static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p); int TestFunc_Double_Double_Double_common(const Func *f, MTdata d, int isNextafter, bool relaxedMode) { - TestInfo test_info; - cl_int error; - size_t i, j; - float maxError = 0.0f; - double maxErrorVal = 0.0; - double maxErrorVal2 = 0.0; + TestInfo test_info; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; + double maxErrorVal2 = 0.0; logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); // Init test_info - memset( &test_info, 0, sizeof( test_info ) ); + memset(&test_info, 0, sizeof(test_info)); test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); test_info.scale = getTestScale(sizeof(cl_double)); - if (gWimpyMode){ - test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + if (gWimpyMode) + { + test_info.subBufferSize = gWimpyBufferSize + / (sizeof(cl_double) + * 
RoundUpToNextPowerOfTwo(test_info.threadCount)); } - test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale; + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; if (test_info.step / test_info.subBufferSize != test_info.scale) { - //there was overflow + // there was overflow test_info.jobCount = 1; } else @@ -1036,59 +1378,79 @@ int TestFunc_Double_Double_Double_common(const Func *f, MTdata d, test_info.ulps = f->double_ulps; test_info.ftz = f->ftz || gForceFTZ; - test_info.isFDim = 0 == strcmp( "fdim", f->nameInCode ); + test_info.isFDim = 0 == strcmp("fdim", f->nameInCode); test_info.skipNanInf = 0; test_info.isNextafter = isNextafter; - // cl_kernels aren't thread safe, so we make one for each vector size for every thread - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { - size_t array_size = test_info.threadCount * sizeof( cl_kernel ); - test_info.k[i] = (cl_kernel*)malloc( array_size ); - if( NULL == test_info.k[i] ) + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel *)malloc(array_size); + if (NULL == test_info.k[i]) { - vlog_error( "Error: Unable to allocate storage for kernels!\n" ); + vlog_error("Error: Unable to allocate storage for kernels!\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.k[i], 0, array_size ); + memset(test_info.k[i], 0, array_size); } - test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) ); - if( NULL == test_info.tinfo ) + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) { - vlog_error( "Error: Unable to allocate storage for thread specific data.\n" ); + vlog_error( + "Error: Unable to allocate storage for thread specific data.\n"); error = CL_OUT_OF_HOST_MEMORY; 
goto exit; } - memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) ); - for( i = 0; i < test_info.threadCount; i++ ) + memset(test_info.tinfo, 0, + test_info.threadCount * sizeof(*test_info.tinfo)); + for (i = 0; i < test_info.threadCount; i++) { - cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) }; - test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if( error || NULL == test_info.tinfo[i].inBuf) + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_double), + test_info.subBufferSize * sizeof(cl_double) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); goto exit; } - test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if( error || NULL == test_info.tinfo[i].inBuf) + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); goto exit; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); 
- if( error || NULL == test_info.tinfo[i].outBuf[j] ) + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); goto exit; } } - test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error); - if( NULL == test_info.tinfo[i].tQueue || error ) + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) { - vlog_error( "clCreateCommandQueue failed. (%d)\n", error ); + vlog_error("clCreateCommandQueue failed. (%d)\n", error); goto exit; } test_info.tinfo[i].d = init_genrand(genrand_int32(d)); @@ -1101,18 +1463,20 @@ int TestFunc_Double_Double_Double_common(const Func *f, MTdata d, gMinVectorSizeIndex, test_info.threadCount, test_info.k, test_info.programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_DoubleFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) goto exit; } - if( !gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info ); + error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info); // Accumulate the arithmetic errors - for( i = 0; i < test_info.threadCount; i++ ) + for (i = 0; i < test_info.threadCount; i++) { - if( test_info.tinfo[i].maxError > maxError ) + if (test_info.tinfo[i].maxError > maxError) { maxError = test_info.tinfo[i].maxError; maxErrorVal = test_info.tinfo[i].maxErrorValue; @@ -1120,300 +1484,346 @@ int TestFunc_Double_Double_Double_common(const Func *f, 
MTdata d, } } - if( error ) - goto exit; + if (error) goto exit; - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input arrays + // Init input arrays double *p = (double *)gIn; double *p2 = (double *)gIn2; - for( j = 0; j < BUFFER_SIZE / sizeof( cl_double ); j++ ) + for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++) { p[j] = DoubleFromUInt32(genrand_int32(d)); p2[j] = DoubleFromUInt32(genrand_int32(d)); } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + BUFFER_SIZE, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE, gIn2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + BUFFER_SIZE, gIn2, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - size_t vectorSize = sizeof( cl_double ) * sizeValues[j]; - size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; // BUFFER_SIZE / vectorSize rounded up - if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; } - if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; } - if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { 
LogBuildError(test_info.programs[j]); goto exit; } + size_t vectorSize = sizeof(cl_double) * sizeValues[j]; + size_t localCount = (BUFFER_SIZE + vectorSize - 1) + / vectorSize; // BUFFER_SIZE / vectorSize rounded up + if ((error = clSetKernelArg(test_info.k[j][0], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 2, + sizeof(gInBuffer2), &gInBuffer2))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( i = 0; i < PERF_LOOP_COUNT; i++ ) + for (i = 0; i < PERF_LOOP_COUNT; i++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], + 1, NULL, &localCount, NULL, + 0, NULL, NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = 
bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (BUFFER_SIZE / sizeof(double)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", + f->name, sizeNames[j]); } - for( ; j < gMaxVectorSizeIndex; j++ ) - vlog( "\t -- " ); + for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); } - if( ! gSkipCorrectnessTesting ) - vlog( "\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2 ); - vlog( "\n" ); + if (!gSkipCorrectnessTesting) + vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); + vlog("\n"); exit: // Release - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { clReleaseProgram(test_info.programs[i]); - if( test_info.k[i] ) + if (test_info.k[i]) { - for( j = 0; j < test_info.threadCount; j++ ) + for (j = 0; j < test_info.threadCount; j++) clReleaseKernel(test_info.k[i][j]); - free( test_info.k[i] ); + free(test_info.k[i]); } } - if( test_info.tinfo ) + if (test_info.tinfo) { - for( i = 0; i < test_info.threadCount; i++ ) + for (i = 0; i < test_info.threadCount; i++) { - free_mtdata( test_info.tinfo[i].d ); + free_mtdata(test_info.tinfo[i].d); clReleaseMemObject(test_info.tinfo[i].inBuf); clReleaseMemObject(test_info.tinfo[i].inBuf2); - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) clReleaseMemObject(test_info.tinfo[i].outBuf[j]); clReleaseCommandQueue(test_info.tinfo[i].tQueue); } - free( test_info.tinfo ); + free(test_info.tinfo); } return error; } -static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data ) +static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data) { - const TestInfo *job = (const TestInfo *) data; - size_t buffer_elements = job->subBufferSize; - size_t buffer_size = buffer_elements * sizeof( cl_double ); - cl_uint base = job_id * (cl_uint) job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; - float ulps = 
job->ulps; - dptr func = job->f->dfunc; - int ftz = job->ftz; - MTdata d = tinfo->d; - cl_uint j, k; - cl_int error; - const char *name = job->f->name; + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_double); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = job->tinfo + thread_id; + float ulps = job->ulps; + dptr func = job->f->dfunc; + int ftz = job->ftz; + MTdata d = tinfo->d; + cl_uint j, k; + cl_int error; + const char *name = job->f->name; - int isNextafter = job->isNextafter; - cl_ulong *t; - cl_double *r,*s,*s2; + int isNextafter = job->isNextafter; + cl_ulong *t; + cl_double *r, *s, *s2; Force64BitFPUPrecision(); // start the map of the output arrays - cl_event e[ VECTOR_SIZE_COUNT ]; - cl_ulong *out[ VECTOR_SIZE_COUNT ]; - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + cl_event e[VECTOR_SIZE_COUNT]; + cl_ulong *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error); - if( error || NULL == out[j]) + out[j] = (cl_ulong *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, + error); return error; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); - //Init input array + // Init input array cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements; cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements; j = 0; - int totalSpecialValueCount = specialValuesDoubleCount * specialValuesDoubleCount; + int totalSpecialValueCount = + specialValuesDoubleCount * specialValuesDoubleCount; int indx = (totalSpecialValueCount - 1) / buffer_elements; - if( job_id <= (cl_uint)indx ) + if (job_id <= (cl_uint)indx) { // test edge cases cl_double *fp = (cl_double *)p; cl_double *fp2 = (cl_double *)p2; uint32_t x, y; - x = (job_id * buffer_elements) % specialValuesDoubleCount; - y = (job_id * buffer_elements) / specialValuesDoubleCount; + x = (job_id * buffer_elements) % specialValuesDoubleCount; + y = (job_id * buffer_elements) / specialValuesDoubleCount; - for( ; j < buffer_elements; j++ ) + for (; j < buffer_elements; j++) { fp[j] = specialValuesDouble[x]; fp2[j] = specialValuesDouble[y]; - if( ++x >= specialValuesDoubleCount ) + if (++x >= specialValuesDoubleCount) { x = 0; y++; - if( y >= specialValuesDoubleCount ) - break; + if (y >= specialValuesDoubleCount) break; } } } - //Init any remaining values. - for( ; j < buffer_elements; j++ ) + // Init any remaining values. + for (; j < buffer_elements; j++) { p[j] = genrand_int64(d); p2[j] = genrand_int64(d); } - if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueWriteBuffer failed! 
err: %d\n", error); goto exit; } - if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size, p2, 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); goto exit; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - //Wait for the map to finish - if( (error = clWaitForEvents(1, e + j) )) + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) { - vlog_error( "Error: clWaitForEvents failed! err: %d\n", error ); + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); goto exit; } - if( (error = clReleaseEvent( e[j] ) )) + if ((error = clReleaseEvent(e[j]))) { - vlog_error( "Error: clReleaseEvent failed! err: %d\n", error ); + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); goto exit; } - // Fill the result buffer with garbage, so that old results don't carry over + // Fill the result buffer with garbage, so that old results don't carry + // over uint32_t pattern = 0xffffdead; memset_pattern4(out[j], &pattern, buffer_size); - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) )) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueMapBuffer failed! 
err: %d\n", error); goto exit; } // run the kernel - size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; - cl_kernel kernel = job->k[j][thread_id]; //each worker thread has its own copy of the cl_kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel cl_program program = job->programs[j]; - if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; } - if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; } - if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; } - - if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL))) + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 2 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); - if( gSkipCorrectnessTesting ) - return CL_SUCCESS; + if (gSkipCorrectnessTesting) return CL_SUCCESS; - //Calculate the correctly rounded reference result - r = (cl_double *)gOut_Ref + thread_id * buffer_elements; - s = (cl_double *)gIn + 
thread_id * buffer_elements; - s2 = (cl_double *)gIn2 + thread_id * buffer_elements; - for( j = 0; j < buffer_elements; j++ ) - r[j] = (cl_double) func.f_ff( s[j], s2[j] ); + // Calculate the correctly rounded reference result + r = (cl_double *)gOut_Ref + thread_id * buffer_elements; + s = (cl_double *)gIn + thread_id * buffer_elements; + s2 = (cl_double *)gIn2 + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) + r[j] = (cl_double)func.f_ff(s[j], s2[j]); - // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue. - for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ ) + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) { - out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (cl_ulong *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); goto exit; } } // Wait for the last buffer - out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, error); goto exit; } - //Verify data + // Verify data t = (cl_ulong *)r; - for( j = 0; j < buffer_elements; j++ ) + for (j = 0; j < buffer_elements; j++) { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { cl_ulong *q = out[k]; // If we aren't getting the correctly rounded result - if( t[j] != q[j] ) + if (t[j] != q[j]) { - cl_double test = ((cl_double*) q)[j]; - long double correct = func.f_ff( s[j], s2[j] ); - float err = Bruteforce_Ulp_Error_Double( test, correct ); - int fail = ! (fabsf(err) <= ulps); + cl_double test = ((cl_double *)q)[j]; + long double correct = func.f_ff(s[j], s2[j]); + float err = Bruteforce_Ulp_Error_Double(test, correct); + int fail = !(fabsf(err) <= ulps); - if( fail && ftz ) + if (fail && ftz) { // retry per section 6.5.3.2 - if( IsDoubleResultSubnormal(correct, ulps ) ) + if (IsDoubleResultSubnormal(correct, ulps)) { - fail = fail && ( test != 0.0f ); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } // nextafter on FTZ platforms may return the smallest @@ -1426,103 +1836,113 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data ) // normal number is the next representable number. // In which case, it should have the same sign as the // second argument. 
- if (isNextafter ) + if (isNextafter) { - if(IsDoubleSubnormal(s[j]) || s[j] == 0.0f) + if (IsDoubleSubnormal(s[j]) || s[j] == 0.0f) { cl_double value = copysign(twoToMinus1022, s2[j]); fail = fail && (test != value); - if (!fail) - err = 0.0f; + if (!fail) err = 0.0f; } } else { // retry per section 6.5.3.3 - if( IsDoubleSubnormal( s[j] ) ) + if (IsDoubleSubnormal(s[j])) { - long double correct2 = func.f_ff( 0.0, s2[j] ); - long double correct3 = func.f_ff( -0.0, s2[j] ); - float err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - float err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; + long double correct2 = func.f_ff(0.0, s2[j]); + long double correct3 = func.f_ff(-0.0, s2[j]); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correct2, ulps ) || IsDoubleResultSubnormal( correct3, ulps ) ) + if (IsDoubleResultSubnormal(correct2, ulps) + || IsDoubleResultSubnormal(correct3, ulps)) { - fail = fail && ( test != 0.0f); - if( ! 
fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } - //try with both args as zero - if( IsDoubleSubnormal( s2[j] ) ) + // try with both args as zero + if (IsDoubleSubnormal(s2[j])) { - correct2 = func.f_ff( 0.0, 0.0 ); - correct3 = func.f_ff( -0.0, 0.0 ); - long double correct4 = func.f_ff( 0.0, -0.0 ); - long double correct5 = func.f_ff( -0.0, -0.0 ); - err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - float err4 = Bruteforce_Ulp_Error_Double( test, correct4 ); - float err5 = Bruteforce_Ulp_Error_Double( test, correct5 ); - fail = fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)) && - (!(fabsf(err4) <= ulps)) && (!(fabsf(err5) <= ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( fabsf( err4 ) < fabsf(err ) ) - err = err4; - if( fabsf( err5 ) < fabsf(err ) ) - err = err5; + correct2 = func.f_ff(0.0, 0.0); + correct3 = func.f_ff(-0.0, 0.0); + long double correct4 = func.f_ff(0.0, -0.0); + long double correct5 = func.f_ff(-0.0, -0.0); + err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + float err4 = + Bruteforce_Ulp_Error_Double(test, correct4); + float err5 = + Bruteforce_Ulp_Error_Double(test, correct5); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps)) + && (!(fabsf(err4) <= ulps)) + && (!(fabsf(err5) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correct2, ulps ) || IsDoubleResultSubnormal( correct3, ulps ) || - IsDoubleResultSubnormal( correct4, ulps ) || IsDoubleResultSubnormal( correct5, ulps ) ) + if (IsDoubleResultSubnormal(correct2, ulps) + || IsDoubleResultSubnormal(correct3, ulps) + || 
IsDoubleResultSubnormal(correct4, ulps) + || IsDoubleResultSubnormal(correct5, ulps)) { - fail = fail && ( test != 0.0f); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } } } - else if(IsDoubleSubnormal(s2[j]) ) + else if (IsDoubleSubnormal(s2[j])) { - long double correct2 = func.f_ff( s[j], 0.0 ); - long double correct3 = func.f_ff( s[j], -0.0 ); - float err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - float err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; + long double correct2 = func.f_ff(s[j], 0.0); + long double correct3 = func.f_ff(s[j], -0.0); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correct2, ulps ) || IsDoubleResultSubnormal( correct3, ulps ) ) + if (IsDoubleResultSubnormal(correct2, ulps) + || IsDoubleResultSubnormal(correct3, ulps)) { - fail = fail && ( test != 0.0f); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } } } } - if( fabsf(err ) > tinfo->maxError ) + if (fabsf(err) > tinfo->maxError) { tinfo->maxError = fabsf(err); tinfo->maxErrorValue = s[j]; tinfo->maxErrorValue2 = s2[j]; } - if( fail ) + if (fail) { - vlog_error( "\nERROR: %s%s: %f ulp error at {%.13la, %.13la}: *%.13la vs. %.13la\n", name, sizeNames[k], err, s[j], s2[j], r[j], test ); + vlog_error("\nERROR: %s%s: %f ulp error at {%.13la, " + "%.13la}: *%.13la vs. 
%.13la\n", + name, sizeNames[k], err, s[j], s2[j], r[j], + test); error = -1; goto exit; } @@ -1530,33 +1950,37 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data ) } } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) ) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error ); + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); return error; } } - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 3 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); - if( 0 == ( base & 0x0fffffff) ) + if (0 == (base & 0x0fffffff)) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->ulps, job->threadCount); - } else + vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else { - vlog("." ); + vlog("."); } fflush(stdout); } exit: return error; - } int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode) @@ -1580,4 +2004,3 @@ int TestFunc_Double_Double_Double_nextafter(const Func *f, MTdata d, { return TestFunc_Double_Double_Double_common(f, d, 1, relaxedMode); } - diff --git a/test_conformance/math_brute_force/binaryOperator.cpp b/test_conformance/math_brute_force/binaryOperator.cpp index abcb1b00..f6ba838a 100644 --- a/test_conformance/math_brute_force/binaryOperator.cpp +++ b/test_conformance/math_brute_force/binaryOperator.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. 
-// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -38,63 +38,85 @@ static int BuildKernel(const char *name, const char *operator_symbol, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p, bool relaxedMode) { - const char *c[] = { - "__kernel void ", name, "_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in1, __global float", sizeNames[vectorSize], "* in2 )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " out[i] = in1[i] ", operator_symbol, " in2[i];\n" - "}\n" - }; - const char *c3[] = { "__kernel void ", name, "_kernel", sizeNames[vectorSize], "( __global float* out, __global float* in, __global float* in2)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " float3 f0 = vload3( 0, in + 3 * i );\n" - " float3 f1 = vload3( 0, in2 + 3 * i );\n" - " f0 = f0 ", operator_symbol, " f1;\n" - " vstore3( f0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two buffer size \n" - " float3 f0, f1;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (float3)( in[3*i], NAN, NAN ); \n" - " f1 = (float3)( in2[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" - " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " f0 = f0 ", operator_symbol, " f1;\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + const char *c[] = { "__kernel void ", + name, + "_kernel", + sizeNames[vectorSize], + "( __global float", + sizeNames[vectorSize], + "* out, __global float", + sizeNames[vectorSize], + "* in1, __global float", + sizeNames[vectorSize], + "* in2 )\n" + "{\n" + " size_t i = get_global_id(0);\n" + " out[i] = in1[i] ", + operator_symbol, + " in2[i];\n" + "}\n" }; + const char *c3[] = { + "__kernel void ", + name, + "_kernel", + sizeNames[vectorSize], + "( __global float* out, __global float* in, __global float* in2)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " float3 f0 = vload3( 0, in + 3 * i );\n" + " float3 f1 = vload3( 0, in2 + 3 * i );\n" + " f0 = f0 ", + operator_symbol, + " f1;\n" + " vstore3( f0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " float3 f0, f1;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (float3)( in[3*i], NAN, NAN ); \n" + " f1 = (float3)( in2[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" + " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " f0 = f0 ", + operator_symbol, + " f1;\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "%s_kernel%s", name, sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "%s_kernel%s", name, + sizeNames[vectorSize]); return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, relaxedMode); @@ -104,65 +126,87 @@ static int BuildKernelDouble(const char *name, const char *operator_symbol, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p, bool relaxedMode) { - const char *c[] = { - "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void ", name, "_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in1, __global double", sizeNames[vectorSize], "* in2 )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " out[i] = in1[i] ", operator_symbol, " in2[i];\n" - "}\n" - }; - const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" - "__kernel void ", name, "_kernel", sizeNames[vectorSize], "( __global double* out, __global double* in, __global double* in2)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < 
get_global_size(0) )\n" - " {\n" - " double3 d0 = vload3( 0, in + 3 * i );\n" - " double3 d1 = vload3( 0, in2 + 3 * i );\n" - " d0 = d0 ", operator_symbol, " d1;\n" - " vstore3( d0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" - " double3 d0, d1;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " d0 = (double3)( in[3*i], NAN, NAN ); \n" - " d1 = (double3)( in2[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" - " d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " d0 = d0 ", operator_symbol, " d1;\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = d0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = d0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void ", + name, + "_kernel", + sizeNames[vectorSize], + "( __global double", + sizeNames[vectorSize], + "* out, __global double", + sizeNames[vectorSize], + "* in1, __global double", + sizeNames[vectorSize], + "* in2 )\n" + "{\n" + " size_t i = get_global_id(0);\n" + " out[i] = in1[i] ", + operator_symbol, + " in2[i];\n" + "}\n" }; + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" + "__kernel void ", + name, + "_kernel", + sizeNames[vectorSize], + "( __global double* out, __global double* in, __global double* in2)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " double3 d0 = vload3( 0, in + 3 * i );\n" + " double3 d1 = vload3( 0, in2 + 3 * i );\n" + " d0 = d0 ", + operator_symbol, + " d1;\n" + " vstore3( d0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " double3 d0, d1;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " d0 = (double3)( in[3*i], NAN, NAN ); \n" + " d1 = (double3)( in2[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" + " d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " d0 = d0 ", + operator_symbol, + " d1;\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = d0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = d0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "%s_kernel%s", name, sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "%s_kernel%s", name, + sizeNames[vectorSize]); return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, relaxedMode); @@ -170,114 +214,214 @@ static int BuildKernelDouble(const char *name, const char *operator_symbol, typedef struct BuildKernelInfo { - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *name; - const char *operator_symbol; + cl_uint offset; // the first vector size to build + cl_uint kernel_count; + cl_kernel **kernels; + cl_program *programs; + const char *name; + const char *operator_symbol; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-}BuildKernelInfo; +} BuildKernelInfo; -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernel(info->name, info->operator_symbol, i, info->kernel_count, info->kernels[i], info->programs + i, info->relaxedMode); } -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernelDouble(info->name, info->operator_symbol, i, info->kernel_count, info->kernels[i], info->programs + i, info->relaxedMode); } -//Thread specific data for a worker thread +// Thread specific data for a worker thread typedef struct ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem inBuf2; // input buffer for the thread - cl_mem outBuf[ VECTOR_SIZE_COUNT ]; // output buffers for the thread - float maxError; // max error value. Init to 0. - double maxErrorValue; // position of the max error value (param 1). Init to 0. - double maxErrorValue2; // position of the max error value (param 2). Init to 0. 
- MTdata d; - cl_command_queue tQueue; // per thread command queue to improve performance -}ThreadInfo; + cl_mem inBuf; // input buffer for the thread + cl_mem inBuf2; // input buffer for the thread + cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + float maxError; // max error value. Init to 0. + double + maxErrorValue; // position of the max error value (param 1). Init to 0. + double maxErrorValue2; // position of the max error value (param 2). Init + // to 0. + MTdata d; + cl_command_queue tQueue; // per thread command queue to improve performance +} ThreadInfo; typedef struct TestInfo { - size_t subBufferSize; // Size of the sub-buffer in elements - const Func *f; // A pointer to the function info - cl_program programs[ VECTOR_SIZE_COUNT ]; // programs for various vector sizes - cl_kernel *k[VECTOR_SIZE_COUNT ]; // arrays of thread-specific kernels for each worker thread: k[vector_size][thread_id] - ThreadInfo *tinfo; // An array of thread specific information for each worker thread - cl_uint threadCount; // Number of worker threads - cl_uint jobCount; // Number of jobs - cl_uint step; // step between each chunk and the next. - cl_uint scale; // stride between individual test values - float ulps; // max_allowed ulps - int ftz; // non-zero if running in flush to zero mode + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes + cl_kernel + *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each + // worker thread: k[vector_size][thread_id] + ThreadInfo * + tinfo; // An array of thread specific information for each worker thread + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. 
+ cl_uint scale; // stride between individual test values + float ulps; // max_allowed ulps + int ftz; // non-zero if running in flush to zero mode bool relaxedMode; // True if the test is being run in relaxed mode, false // otherwise. // no special fields -}TestInfo; +} TestInfo; -static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p ); +static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p); // A table of more difficult cases to get right static const float specialValuesFloat[] = { - -NAN, -INFINITY, -FLT_MAX, MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39), MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38), - MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), -1000.f, -100.f, -4.0f, -3.5f, - -3.0f, MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), -2.5f, MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), -2.0f, MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), -1.5f, MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), -1.0f, MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25), - MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), -0.5f, MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26), MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), -0.25f, MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27), - MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), -FLT_MIN, MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), 
MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150), - MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), -0.0f, + -NAN, + -INFINITY, + -FLT_MAX, + MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), + MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), + MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39), + MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), + MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), + MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38), + MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), + MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), + MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), + MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), + MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), + MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), + -1000.f, + -100.f, + -4.0f, + -3.5f, + -3.0f, + MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), + -2.5f, + MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), + -2.0f, + MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), + -1.5f, + MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24), + MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), + -1.0f, + MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25), + MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), + -0.5f, + MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26), + MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), + -0.25f, + MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27), + MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), + -FLT_MIN, + MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), + MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), + MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), + MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), + MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), + MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150), + 
MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), + MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), + MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), + MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), + -0.0f, - +NAN, +INFINITY, +FLT_MAX, MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38), - MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), +1000.f, +100.f, +4.0f, +3.5f, - +3.0f, MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), 2.5f, MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),+2.0f, MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), 1.5f, MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), +1.0f, MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25), - MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), +0.5f, MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), +0.25f, MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27), - MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), +FLT_MIN, MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150), - MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, 
-150), +0.0f + +NAN, + +INFINITY, + +FLT_MAX, + MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), + MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), + MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), + MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), + MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), + MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38), + MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), + MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), + MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), + MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), + MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), + MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), + +1000.f, + +100.f, + +4.0f, + +3.5f, + +3.0f, + MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), + 2.5f, + MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23), + +2.0f, + MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), + 1.5f, + MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), + MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), + +1.0f, + MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25), + MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), + +0.5f, + MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), + MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), + +0.25f, + MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27), + MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), + +FLT_MIN, + MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), + MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), + MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), + MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), + MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), + MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150), + MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), + MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), + MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), + MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), + +0.0f }; -static size_t specialValuesFloatCount = sizeof( specialValuesFloat ) / sizeof( specialValuesFloat[0] ); +static 
size_t specialValuesFloatCount = + sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]); -static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p ); +static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p); int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d, bool relaxedMode) { - TestInfo test_info; - cl_int error; - size_t i, j; - float maxError = 0.0f; - double maxErrorVal = 0.0; - double maxErrorVal2 = 0.0; + TestInfo test_info; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; + double maxErrorVal2 = 0.0; logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); // Init test_info - memset( &test_info, 0, sizeof( test_info ) ); + memset(&test_info, 0, sizeof(test_info)); test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); test_info.scale = getTestScale(sizeof(cl_float)); - if (gWimpyMode) { - test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + if (gWimpyMode) + { + test_info.subBufferSize = gWimpyBufferSize + / (sizeof(cl_float) + * RoundUpToNextPowerOfTwo(test_info.threadCount)); } test_info.step = test_info.subBufferSize * test_info.scale; if (test_info.step / test_info.subBufferSize != test_info.scale) { - //there was overflow + // there was overflow test_info.jobCount = 1; } else @@ -287,59 +431,80 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d, test_info.f = f; test_info.ulps = gIsEmbedded ? 
f->float_embedded_ulps : f->float_ulps; - test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); test_info.relaxedMode = relaxedMode; - // cl_kernels aren't thread safe, so we make one for each vector size for every thread - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { - size_t array_size = test_info.threadCount * sizeof( cl_kernel ); - test_info.k[i] = (cl_kernel*)malloc( array_size ); - if( NULL == test_info.k[i] ) + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel *)malloc(array_size); + if (NULL == test_info.k[i]) { - vlog_error( "Error: Unable to allocate storage for kernels!\n" ); + vlog_error("Error: Unable to allocate storage for kernels!\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.k[i], 0, array_size ); + memset(test_info.k[i], 0, array_size); } - test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) ); - if( NULL == test_info.tinfo ) + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) { - vlog_error( "Error: Unable to allocate storage for thread specific data.\n" ); + vlog_error( + "Error: Unable to allocate storage for thread specific data.\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) ); - for( i = 0; i < test_info.threadCount; i++ ) + memset(test_info.tinfo, 0, + test_info.threadCount * sizeof(*test_info.tinfo)); + for (i = 0; i < test_info.threadCount; i++) { - cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) }; - test_info.tinfo[i].inBuf = 
clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if( error || NULL == test_info.tinfo[i].inBuf) + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_float), + test_info.subBufferSize * sizeof(cl_float) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); goto exit; } - test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if( error || NULL == test_info.tinfo[i].inBuf) + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); goto exit; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if( error || NULL == test_info.tinfo[i].outBuf[j] ) + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer " + 
"for region {%zd, %zd}\n", + region.origin, region.size); goto exit; } } - test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error); - if( NULL == test_info.tinfo[i].tQueue || error ) + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) { - vlog_error( "clCreateCommandQueue failed. (%d)\n", error ); + vlog_error("clCreateCommandQueue failed. (%d)\n", error); goto exit; } @@ -355,18 +520,20 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d, f->name, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_FloatFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) goto exit; } - if( !gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info ); + error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info); // Accumulate the arithmetic errors - for( i = 0; i < test_info.threadCount; i++ ) + for (i = 0; i < test_info.threadCount; i++) { - if( test_info.tinfo[i].maxError > maxError ) + if (test_info.tinfo[i].maxError > maxError) { maxError = test_info.tinfo[i].maxError; maxErrorVal = test_info.tinfo[i].maxErrorValue; @@ -374,110 +541,130 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d, } } - if( error ) - goto exit; + if (error) goto exit; - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input arrays + // Init input arrays uint32_t *p = (uint32_t *)gIn; uint32_t *p2 = (uint32_t *)gIn2; - for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ ) + for (j = 0; j < BUFFER_SIZE / sizeof(float); j++) { p[j] = (genrand_int32(d) & ~0x40000000) | 0x20000000; p2[j] = 0x3fc00000; } - if( (error = 
clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + BUFFER_SIZE, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE, gIn2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + BUFFER_SIZE, gIn2, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - size_t vectorSize = sizeof( cl_float ) * sizeValues[j]; - size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; // BUFFER_SIZE / vectorSize rounded up - if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; } - if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; } - if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto exit; } + size_t vectorSize = sizeof(cl_float) * sizeValues[j]; + size_t localCount = (BUFFER_SIZE + vectorSize - 1) + / vectorSize; // BUFFER_SIZE / vectorSize rounded up + if ((error = clSetKernelArg(test_info.k[j][0], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 2, + 
sizeof(gInBuffer2), &gInBuffer2))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( i = 0; i < PERF_LOOP_COUNT; i++ ) + for (i = 0; i < PERF_LOOP_COUNT; i++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], + 1, NULL, &localCount, NULL, + 0, NULL, NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (BUFFER_SIZE / sizeof(float)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", + f->name, sizeNames[j]); } } - if( ! 
gSkipCorrectnessTesting ) - vlog( "\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2 ); - vlog( "\n" ); + if (!gSkipCorrectnessTesting) + vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); + vlog("\n"); exit: - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { clReleaseProgram(test_info.programs[i]); - if( test_info.k[i] ) + if (test_info.k[i]) { - for( j = 0; j < test_info.threadCount; j++ ) + for (j = 0; j < test_info.threadCount; j++) clReleaseKernel(test_info.k[i][j]); - free( test_info.k[i] ); + free(test_info.k[i]); } } - if( test_info.tinfo ) + if (test_info.tinfo) { - for( i = 0; i < test_info.threadCount; i++ ) + for (i = 0; i < test_info.threadCount; i++) { free_mtdata(test_info.tinfo[i].d); clReleaseMemObject(test_info.tinfo[i].inBuf); clReleaseMemObject(test_info.tinfo[i].inBuf2); - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) clReleaseMemObject(test_info.tinfo[i].outBuf[j]); clReleaseCommandQueue(test_info.tinfo[i].tQueue); } - free( test_info.tinfo ); + free(test_info.tinfo); } return error; @@ -485,12 +672,12 @@ exit: static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data) { - const TestInfo *job = (const TestInfo *) data; - size_t buffer_elements = job->subBufferSize; - size_t buffer_size = buffer_elements * sizeof( cl_float ); - cl_uint base = job_id * (cl_uint) job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; - fptr func = job->f->func; + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_float); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = job->tinfo + thread_id; + fptr func = job->f->func; bool relaxedMode = job->relaxedMode; float ulps = getAllowedUlpError(job->f, relaxedMode); if (relaxedMode) @@ -499,74 +686,77 @@ static cl_int 
TestFloat(cl_uint job_id, cl_uint thread_id, void *data) } - int ftz = job->ftz; - MTdata d = tinfo->d; - cl_uint j, k; - cl_int error; - cl_uchar *overflow = (cl_uchar*)malloc(buffer_size); - const char *name = job->f->name; - cl_uint *t; - cl_float *r,*s,*s2; + int ftz = job->ftz; + MTdata d = tinfo->d; + cl_uint j, k; + cl_int error; + cl_uchar *overflow = (cl_uchar *)malloc(buffer_size); + const char *name = job->f->name; + cl_uint *t; + cl_float *r, *s, *s2; RoundingMode oldRoundMode; // start the map of the output arrays - cl_event e[ VECTOR_SIZE_COUNT ]; - cl_uint *out[ VECTOR_SIZE_COUNT ]; - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + cl_event e[VECTOR_SIZE_COUNT]; + cl_uint *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error); - if( error || NULL == out[j]) + out[j] = (uint32_t *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, + error); return error; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); - //Init input array + // Init input array cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements; cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements; j = 0; - int totalSpecialValueCount = specialValuesFloatCount * specialValuesFloatCount; + int totalSpecialValueCount = + specialValuesFloatCount * specialValuesFloatCount; int indx = (totalSpecialValueCount - 1) / buffer_elements; - if( job_id <= (cl_uint)indx ) { + if (job_id <= (cl_uint)indx) + { // Insert special values uint32_t x, y; x = (job_id * buffer_elements) % specialValuesFloatCount; y = (job_id * buffer_elements) / specialValuesFloatCount; - for( ; j < buffer_elements; j++ ) { + for (; j < buffer_elements; j++) + { p[j] = ((cl_uint *)specialValuesFloat)[x]; p2[j] = ((cl_uint *)specialValuesFloat)[y]; ++x; - if (x >= specialValuesFloatCount) { + if (x >= specialValuesFloatCount) + { x = 0; y++; - if (y >= specialValuesFloatCount) - break; + if (y >= specialValuesFloatCount) break; } if (relaxedMode && strcmp(name, "divide") == 0) { cl_uint pj = p[j] & 0x7fffffff; cl_uint p2j = p2[j] & 0x7fffffff; // Replace values outside [2^-62, 2^62] with QNaN - if (pj < 0x20800000 || pj > 0x5e800000) - p[j] = 0x7fc00000; - if (p2j < 0x20800000 || p2j > 0x5e800000) - p2[j] = 0x7fc00000; + if (pj < 0x20800000 || pj > 0x5e800000) p[j] = 0x7fc00000; + if (p2j < 0x20800000 || p2j > 0x5e800000) p2[j] = 0x7fc00000; } } } // Init any remaining values. 
- for( ; j < buffer_elements; j++ ) + for (; j < buffer_elements; j++) { p[j] = genrand_int32(d); p2[j] = genrand_int32(d); @@ -576,316 +766,353 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data) cl_uint pj = p[j] & 0x7fffffff; cl_uint p2j = p2[j] & 0x7fffffff; // Replace values outside [2^-62, 2^62] with QNaN - if (pj < 0x20800000 || pj > 0x5e800000) - p[j] = 0x7fc00000; - if (p2j < 0x20800000 || p2j > 0x5e800000) - p2[j] = 0x7fc00000; + if (pj < 0x20800000 || pj > 0x5e800000) p[j] = 0x7fc00000; + if (p2j < 0x20800000 || p2j > 0x5e800000) p2[j] = 0x7fc00000; } } - if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); goto exit; } - if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size, p2, 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); goto exit; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - //Wait for the map to finish - if( (error = clWaitForEvents(1, e + j) )) + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) { - vlog_error( "Error: clWaitForEvents failed! err: %d\n", error ); + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); goto exit; } - if( (error = clReleaseEvent( e[j] ) )) + if ((error = clReleaseEvent(e[j]))) { - vlog_error( "Error: clReleaseEvent failed! err: %d\n", error ); + vlog_error("Error: clReleaseEvent failed! 
err: %d\n", error); goto exit; } - // Fill the result buffer with garbage, so that old results don't carry over + // Fill the result buffer with garbage, so that old results don't carry + // over uint32_t pattern = 0xffffdead; memset_pattern4(out[j], &pattern, buffer_size); - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) )) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); goto exit; } // run the kernel - size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; - cl_kernel kernel = job->k[j][thread_id]; //each worker thread has its own copy of the cl_kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel cl_program program = job->programs[j]; - if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; } - if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; } - if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; } - - if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL))) + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; 
+ } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 2 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); - if( gSkipCorrectnessTesting ) + if (gSkipCorrectnessTesting) { - free( overflow ); + free(overflow); return CL_SUCCESS; } - //Calculate the correctly rounded reference result + // Calculate the correctly rounded reference result FPU_mode_type oldMode; - memset( &oldMode, 0, sizeof( oldMode ) ); - if( ftz ) - ForceFTZ( &oldMode ); + memset(&oldMode, 0, sizeof(oldMode)); + if (ftz) ForceFTZ(&oldMode); // Set the rounding mode to match the device oldRoundMode = kRoundToNearestEven; - if (gIsInRTZMode) - oldRoundMode = set_round(kRoundTowardZero, kfloat); + if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat); - //Calculate the correctly rounded reference result - r = (float *)gOut_Ref + thread_id * buffer_elements; - s = (float *)gIn + thread_id * buffer_elements; - s2 = (float *)gIn2 + thread_id * buffer_elements; - if( gInfNanSupport ) + // Calculate the correctly rounded reference result + r = (float *)gOut_Ref + thread_id * buffer_elements; + s = (float *)gIn + thread_id * buffer_elements; + s2 = (float *)gIn2 + thread_id * buffer_elements; + if (gInfNanSupport) { - for( j = 0; j < buffer_elements; j++ ) - r[j] = (float) func.f_ff( s[j], s2[j] ); + for (j = 0; j < buffer_elements; j++) + r[j] = (float)func.f_ff(s[j], s2[j]); } else { - for( j = 0; j < buffer_elements; j++ ) + for (j = 0; j < buffer_elements; j++) { feclearexcept(FE_OVERFLOW); - r[j] = (float) func.f_ff( s[j], s2[j] ); - overflow[j] = FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)); + r[j] = (float)func.f_ff(s[j], s2[j]); + overflow[j] = + FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)); } } - if (gIsInRTZMode) - 
(void)set_round(oldRoundMode, kfloat); + if (gIsInRTZMode) (void)set_round(oldRoundMode, kfloat); - if( ftz ) - RestoreFPState( &oldMode ); + if (ftz) RestoreFPState(&oldMode); - // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue. - for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ ) + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) { - out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (uint32_t *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); goto exit; } } // Wait for the last buffer - out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (uint32_t *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, error); goto exit; } - //Verify data + // Verify data t = (cl_uint *)r; - for( j = 0; j < buffer_elements; j++ ) + for (j = 0; j < buffer_elements; j++) { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { cl_uint *q = out[k]; // If we aren't getting the correctly rounded result - if( t[j] != q[j] ) + if (t[j] != q[j]) { - float test = ((float*) q)[j]; - double correct = func.f_ff( s[j], s2[j] ); + float test = ((float *)q)[j]; + double correct = func.f_ff(s[j], s2[j]); - // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow - if ( !gInfNanSupport) + // Per section 10 paragraph 6, accept any result if an input or + // output is a infinity or NaN or overflow + if (!gInfNanSupport) { - // Note: no double rounding here. Reference functions calculate in single precision. - if( overflow[j] || - IsFloatInfinity(correct) || IsFloatNaN(correct) || - IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j]) || - IsFloatInfinity(s[j]) || IsFloatNaN(s[j]) ) + // Note: no double rounding here. Reference functions + // calculate in single precision. + if (overflow[j] || IsFloatInfinity(correct) + || IsFloatNaN(correct) || IsFloatInfinity(s2[j]) + || IsFloatNaN(s2[j]) || IsFloatInfinity(s[j]) + || IsFloatNaN(s[j])) continue; } - // Per section 10 paragraph 6, accept embedded devices always returning positive 0.0. - if (gIsEmbedded && (t[j] == 0x80000000) && (q[j] == 0x00000000)) continue; + // Per section 10 paragraph 6, accept embedded devices always + // returning positive 0.0. 
+ if (gIsEmbedded && (t[j] == 0x80000000) && (q[j] == 0x00000000)) + continue; - float err = Ulp_Error( test, correct ); - float errB = Ulp_Error( test, (float) correct ); + float err = Ulp_Error(test, correct); + float errB = Ulp_Error(test, (float)correct); - int fail = ((!(fabsf(err) <= ulps)) && (!(fabsf(errB) <= ulps))); - if( fabsf( errB ) < fabsf(err ) ) - err = errB; + int fail = + ((!(fabsf(err) <= ulps)) && (!(fabsf(errB) <= ulps))); + if (fabsf(errB) < fabsf(err)) err = errB; - if( fail && ftz ) + if (fail && ftz) { // retry per section 6.5.3.2 - if( IsFloatResultSubnormal(correct, ulps ) ) + if (IsFloatResultSubnormal(correct, ulps)) { - fail = fail && ( test != 0.0f ); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } // retry per section 6.5.3.3 - if( IsFloatSubnormal( s[j] ) ) + if (IsFloatSubnormal(s[j])) { double correct2, correct3; float err2, err3; - if( !gInfNanSupport ) - feclearexcept(FE_OVERFLOW); + if (!gInfNanSupport) feclearexcept(FE_OVERFLOW); - correct2 = func.f_ff( 0.0, s2[j] ); - correct3 = func.f_ff( -0.0, s2[j] ); + correct2 = func.f_ff(0.0, s2[j]); + correct3 = func.f_ff(-0.0, s2[j]); - // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow - if( !gInfNanSupport ) + // Per section 10 paragraph 6, accept any result if an + // input or output is a infinity or NaN or overflow + if (!gInfNanSupport) { - if( fetestexcept(FE_OVERFLOW) ) - continue; + if (fetestexcept(FE_OVERFLOW)) continue; - // Note: no double rounding here. Reference functions calculate in single precision. - if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) || - IsFloatInfinity(correct3) || IsFloatNaN(correct3) ) + // Note: no double rounding here. Reference + // functions calculate in single precision. 
+ if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3)) continue; } - err2 = Ulp_Error( test, correct2 ); - err3 = Ulp_Error( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; + err2 = Ulp_Error(test, correct2); + err3 = Ulp_Error(test, correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if( IsFloatResultSubnormal( correct2, ulps ) || IsFloatResultSubnormal( correct3, ulps ) ) + if (IsFloatResultSubnormal(correct2, ulps) + || IsFloatResultSubnormal(correct3, ulps)) { - fail = fail && ( test != 0.0f); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } - //try with both args as zero - if( IsFloatSubnormal( s2[j] ) ) + // try with both args as zero + if (IsFloatSubnormal(s2[j])) { double correct4, correct5; float err4, err5; - if( !gInfNanSupport ) - feclearexcept(FE_OVERFLOW); + if (!gInfNanSupport) feclearexcept(FE_OVERFLOW); - correct2 = func.f_ff( 0.0, 0.0 ); - correct3 = func.f_ff( -0.0, 0.0 ); - correct4 = func.f_ff( 0.0, -0.0 ); - correct5 = func.f_ff( -0.0, -0.0 ); + correct2 = func.f_ff(0.0, 0.0); + correct3 = func.f_ff(-0.0, 0.0); + correct4 = func.f_ff(0.0, -0.0); + correct5 = func.f_ff(-0.0, -0.0); - // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow - if( !gInfNanSupport ) + // Per section 10 paragraph 6, accept any result if + // an input or output is a infinity or NaN or + // overflow + if (!gInfNanSupport) { - if( fetestexcept(FE_OVERFLOW) ) - continue; + if (fetestexcept(FE_OVERFLOW)) continue; - // Note: no double rounding here. Reference functions calculate in single precision. 
- if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) || - IsFloatInfinity(correct3) || IsFloatNaN(correct3) || - IsFloatInfinity(correct4) || IsFloatNaN(correct4) || - IsFloatInfinity(correct5) || IsFloatNaN(correct5) ) + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3) + || IsFloatInfinity(correct4) + || IsFloatNaN(correct4) + || IsFloatInfinity(correct5) + || IsFloatNaN(correct5)) continue; } - err2 = Ulp_Error( test, correct2 ); - err3 = Ulp_Error( test, correct3 ); - err4 = Ulp_Error( test, correct4 ); - err5 = Ulp_Error( test, correct5 ); - fail = fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)) && - (!(fabsf(err4) <= ulps)) && (!(fabsf(err5) <= ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( fabsf( err4 ) < fabsf(err ) ) - err = err4; - if( fabsf( err5 ) < fabsf(err ) ) - err = err5; + err2 = Ulp_Error(test, correct2); + err3 = Ulp_Error(test, correct3); + err4 = Ulp_Error(test, correct4); + err5 = Ulp_Error(test, correct5); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps)) + && (!(fabsf(err4) <= ulps)) + && (!(fabsf(err5) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; // retry per section 6.5.3.4 - if( IsFloatResultSubnormal( correct2, ulps ) || IsFloatResultSubnormal( correct3, ulps ) || - IsFloatResultSubnormal( correct4, ulps ) || IsFloatResultSubnormal( correct5, ulps ) ) + if (IsFloatResultSubnormal(correct2, ulps) + || IsFloatResultSubnormal(correct3, ulps) + || IsFloatResultSubnormal(correct4, ulps) + || IsFloatResultSubnormal(correct5, ulps)) { - fail = fail && ( test != 0.0f); - if( ! 
fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } } } - else if(IsFloatSubnormal(s2[j]) ) + else if (IsFloatSubnormal(s2[j])) { double correct2, correct3; float err2, err3; - if( !gInfNanSupport ) - feclearexcept(FE_OVERFLOW); + if (!gInfNanSupport) feclearexcept(FE_OVERFLOW); - correct2 = func.f_ff( s[j], 0.0 ); - correct3 = func.f_ff( s[j], -0.0 ); + correct2 = func.f_ff(s[j], 0.0); + correct3 = func.f_ff(s[j], -0.0); - // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow - if ( !gInfNanSupport) + // Per section 10 paragraph 6, accept any result if an + // input or output is a infinity or NaN or overflow + if (!gInfNanSupport) { - // Note: no double rounding here. Reference functions calculate in single precision. - if( overflow[j] || - IsFloatInfinity(correct) || IsFloatNaN(correct) || - IsFloatInfinity(correct2)|| IsFloatNaN(correct2) ) + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (overflow[j] || IsFloatInfinity(correct) + || IsFloatNaN(correct) + || IsFloatInfinity(correct2) + || IsFloatNaN(correct2)) continue; } - err2 = Ulp_Error( test, correct2 ); - err3 = Ulp_Error( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; + err2 = Ulp_Error(test, correct2); + err3 = Ulp_Error(test, correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if( IsFloatResultSubnormal( correct2, ulps ) || IsFloatResultSubnormal( correct3, ulps ) ) + if (IsFloatResultSubnormal(correct2, ulps) + || IsFloatResultSubnormal(correct3, ulps)) { - fail = fail && ( test != 0.0f); - if( ! 
fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } } } - if( fabsf(err ) > tinfo->maxError ) + if (fabsf(err) > tinfo->maxError) { tinfo->maxError = fabsf(err); tinfo->maxErrorValue = s[j]; tinfo->maxErrorValue2 = s2[j]; } - if( fail ) + if (fail) { - vlog_error( "\nERROR: %s%s: %f ulp error at {%a, %a}: *%a vs. %a (0x%8.8x) at index: %d\n", name, sizeNames[k], err, s[j], s2[j], r[j], test, ((cl_uint*)&test)[0], j ); + vlog_error("\nERROR: %s%s: %f ulp error at {%a, %a}: *%a " + "vs. %a (0x%8.8x) at index: %d\n", + name, sizeNames[k], err, s[j], s2[j], r[j], test, + ((cl_uint *)&test)[0], j); error = -1; goto exit; } @@ -893,85 +1120,185 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data) } } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) ) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error ); + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); return error; } } - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 3 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); - if( 0 == ( base & 0x0fffffff) ) + if (0 == (base & 0x0fffffff)) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->ulps, job->threadCount); - } else + vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else { - vlog("." 
); + vlog("."); } fflush(stdout); } exit: - if( overflow ) - free( overflow ); + if (overflow) free(overflow); return error; } // A table of more difficult cases to get right static const double specialValuesDouble[] = { - -NAN, -INFINITY, -DBL_MAX, MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11), MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10), - MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), -1000., -100., -4.0, -3.5, - -3.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), -2.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), -2.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), -1.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), -1.0, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53), - MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), -0.5, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54), MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), -0.25, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55), - MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), -DBL_MIN, MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, 
-0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), -0.0, + -NAN, + -INFINITY, + -DBL_MAX, + MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), + MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11), + MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), + MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10), + MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), + MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), + MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), + MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), + -1000., + -100., + -4.0, + -3.5, + -3.0, + MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), + -2.5, + MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), + -2.0, + MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), + -1.5, + MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52), + MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), + -1.0, + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53), + MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, 
-0x10000000000001LL, -53), + -0.5, + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54), + MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), + -0.25, + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55), + MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), + -DBL_MIN, + MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), + MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), + MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), + MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), + MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), + -0.0, - +NAN, +INFINITY, +DBL_MAX, MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11), MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10), - MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8), MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21), MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), 
MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22), +1000., +100., +4.0, +3.5, - +3.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), +2.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), +2.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), +1.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), +1.0, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53), - MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53), +0.5, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54), MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), +0.25, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55), - MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), +DBL_MIN, MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), +0.0, + +NAN, + +INFINITY, + +DBL_MAX, + MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), + 
MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11), + MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), + MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10), + MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8), + MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21), + MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21), + MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22), + +1000., + +100., + +4.0, + +3.5, + +3.0, + MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), + +2.5, + MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), + +2.0, + MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), + +1.5, + MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52), + MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), + +1.0, + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53), + MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53), + +0.5, + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54), + MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), + +0.25, + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55), + MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), + +DBL_MIN, + MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), + MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), + 
MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), + +0.0, }; -static size_t specialValuesDoubleCount = sizeof( specialValuesDouble ) / sizeof( specialValuesDouble[0] ); +static size_t specialValuesDoubleCount = + sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]); -static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *p ); +static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p); int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d, bool relaxedMode) { - TestInfo test_info; - cl_int error; - size_t i, j; - float maxError = 0.0f; - double maxErrorVal = 0.0; - double maxErrorVal2 = 0.0; + TestInfo test_info; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; + double maxErrorVal2 = 0.0; logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); // Init test_info - memset( &test_info, 0, sizeof( test_info ) ); + memset(&test_info, 0, sizeof(test_info)); test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); test_info.scale = getTestScale(sizeof(cl_double)); if (gWimpyMode) { - test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.subBufferSize = gWimpyBufferSize + / (sizeof(cl_double) + * RoundUpToNextPowerOfTwo(test_info.threadCount)); } - test_info.step 
= (cl_uint) test_info.subBufferSize * test_info.scale; + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; if (test_info.step / test_info.subBufferSize != test_info.scale) { - //there was overflow + // there was overflow test_info.jobCount = 1; } else @@ -983,56 +1310,76 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d, test_info.ulps = f->double_ulps; test_info.ftz = f->ftz || gForceFTZ; - // cl_kernels aren't thread safe, so we make one for each vector size for every thread - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { - size_t array_size = test_info.threadCount * sizeof( cl_kernel ); - test_info.k[i] = (cl_kernel*)malloc( array_size ); - if( NULL == test_info.k[i] ) + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel *)malloc(array_size); + if (NULL == test_info.k[i]) { - vlog_error( "Error: Unable to allocate storage for kernels!\n" ); + vlog_error("Error: Unable to allocate storage for kernels!\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.k[i], 0, array_size ); + memset(test_info.k[i], 0, array_size); } - test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) ); - if( NULL == test_info.tinfo ) + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) { - vlog_error( "Error: Unable to allocate storage for thread specific data.\n" ); + vlog_error( + "Error: Unable to allocate storage for thread specific data.\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) ); - for( i = 0; i < test_info.threadCount; i++ ) + memset(test_info.tinfo, 0, + test_info.threadCount * sizeof(*test_info.tinfo)); + for (i = 0; i < 
test_info.threadCount; i++) { - cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) }; - test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if( error || NULL == test_info.tinfo[i].inBuf) + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_double), + test_info.subBufferSize * sizeof(cl_double) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); goto exit; } - test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if( error || NULL == test_info.tinfo[i].inBuf) + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); goto exit; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if( error || NULL == test_info.tinfo[i].outBuf[j] ) + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) 
{ - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); goto exit; } } - test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error); - if( NULL == test_info.tinfo[i].tQueue || error ) + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) { - vlog_error( "clCreateCommandQueue failed. (%d)\n", error ); + vlog_error("clCreateCommandQueue failed. (%d)\n", error); goto exit; } @@ -1049,18 +1396,20 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d, f->name, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_DoubleFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) goto exit; } - if( !gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info ); + error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info); // Accumulate the arithmetic errors - for( i = 0; i < test_info.threadCount; i++ ) + for (i = 0; i < test_info.threadCount; i++) { - if( test_info.tinfo[i].maxError > maxError ) + if (test_info.tinfo[i].maxError > maxError) { maxError = test_info.tinfo[i].maxError; maxErrorVal = test_info.tinfo[i].maxErrorValue; @@ -1068,387 +1417,441 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d, } } - if( error ) - goto exit; + if (error) goto exit; - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input arrays + // Init input arrays double *p = (double *)gIn; double *p2 = (double *)gIn2; - for( j = 0; j < 
BUFFER_SIZE / sizeof( cl_double ); j++ ) + for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++) { p[j] = DoubleFromUInt32(genrand_int32(d)); p2[j] = DoubleFromUInt32(genrand_int32(d)); } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + BUFFER_SIZE, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE, gIn2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + BUFFER_SIZE, gIn2, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - size_t vectorSize = sizeof( cl_double ) * sizeValues[j]; - size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; // BUFFER_SIZE / vectorSize rounded up - if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; } - if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; } - if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto exit; } + size_t vectorSize = sizeof(cl_double) * sizeValues[j]; + size_t localCount = (BUFFER_SIZE + vectorSize - 1) + / vectorSize; // BUFFER_SIZE / vectorSize rounded up + if ((error = clSetKernelArg(test_info.k[j][0], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(test_info.programs[j]); + goto exit; + 
} + if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 2, + sizeof(gInBuffer2), &gInBuffer2))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( i = 0; i < PERF_LOOP_COUNT; i++ ) + for (i = 0; i < PERF_LOOP_COUNT; i++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], + 1, NULL, &localCount, NULL, + 0, NULL, NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (BUFFER_SIZE / sizeof(double)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", + f->name, sizeNames[j]); } - for( ; j < gMaxVectorSizeIndex; j++ ) - vlog( "\t -- " ); + for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); } - if( ! 
gSkipCorrectnessTesting ) - vlog( "\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2 ); - vlog( "\n" ); + if (!gSkipCorrectnessTesting) + vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); + vlog("\n"); exit: // Release - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { clReleaseProgram(test_info.programs[i]); - if( test_info.k[i] ) + if (test_info.k[i]) { - for( j = 0; j < test_info.threadCount; j++ ) + for (j = 0; j < test_info.threadCount; j++) clReleaseKernel(test_info.k[i][j]); - free( test_info.k[i] ); + free(test_info.k[i]); } } - if( test_info.tinfo ) + if (test_info.tinfo) { - for( i = 0; i < test_info.threadCount; i++ ) + for (i = 0; i < test_info.threadCount; i++) { free_mtdata(test_info.tinfo[i].d); clReleaseMemObject(test_info.tinfo[i].inBuf); clReleaseMemObject(test_info.tinfo[i].inBuf2); - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) clReleaseMemObject(test_info.tinfo[i].outBuf[j]); clReleaseCommandQueue(test_info.tinfo[i].tQueue); } - free( test_info.tinfo ); + free(test_info.tinfo); } return error; } -static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data ) +static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data) { - const TestInfo *job = (const TestInfo *) data; - size_t buffer_elements = job->subBufferSize; - size_t buffer_size = buffer_elements * sizeof( cl_double ); - cl_uint base = job_id * (cl_uint) job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; - float ulps = job->ulps; - dptr func = job->f->dfunc; - int ftz = job->ftz; + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_double); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = job->tinfo + thread_id; + float ulps = job->ulps; + dptr func = job->f->dfunc; + int ftz = 
job->ftz; bool relaxedMode = job->relaxedMode; - MTdata d = tinfo->d; - cl_uint j, k; - cl_int error; - const char *name = job->f->name; - cl_ulong *t; - cl_double *r,*s,*s2; + MTdata d = tinfo->d; + cl_uint j, k; + cl_int error; + const char *name = job->f->name; + cl_ulong *t; + cl_double *r, *s, *s2; Force64BitFPUPrecision(); // start the map of the output arrays - cl_event e[ VECTOR_SIZE_COUNT ]; - cl_ulong *out[ VECTOR_SIZE_COUNT ]; - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + cl_event e[VECTOR_SIZE_COUNT]; + cl_ulong *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error); - if( error || NULL == out[j]) + out[j] = (cl_ulong *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, + error); return error; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); - //Init input array + // Init input array cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements; cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements; j = 0; - int totalSpecialValueCount = specialValuesDoubleCount * specialValuesDoubleCount; + int totalSpecialValueCount = + specialValuesDoubleCount * specialValuesDoubleCount; int indx = (totalSpecialValueCount - 1) / buffer_elements; - if( job_id <= (cl_uint)indx ) + if (job_id <= (cl_uint)indx) { // test edge cases cl_double *fp = (cl_double *)p; cl_double *fp2 = (cl_double *)p2; uint32_t x, y; - x = (job_id * buffer_elements) % specialValuesDoubleCount; - y = (job_id * buffer_elements) / specialValuesDoubleCount; + x = (job_id * buffer_elements) % specialValuesDoubleCount; + y = (job_id * buffer_elements) / specialValuesDoubleCount; - for( ; j < buffer_elements; j++ ) + for (; j < buffer_elements; j++) { fp[j] = specialValuesDouble[x]; fp2[j] = specialValuesDouble[y]; - if( ++x >= specialValuesDoubleCount ) + if (++x >= specialValuesDoubleCount) { x = 0; y++; - if( y >= specialValuesDoubleCount ) - break; + if (y >= specialValuesDoubleCount) break; } } } - //Init any remaining values. - for( ; j < buffer_elements; j++ ) + // Init any remaining values. + for (; j < buffer_elements; j++) { p[j] = genrand_int64(d); p2[j] = genrand_int64(d); } - if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueWriteBuffer failed! 
err: %d\n", error); goto exit; } - if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size, p2, 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); goto exit; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - //Wait for the map to finish - if( (error = clWaitForEvents(1, e + j) )) + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) { - vlog_error( "Error: clWaitForEvents failed! err: %d\n", error ); + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); goto exit; } - if( (error = clReleaseEvent( e[j] ) )) + if ((error = clReleaseEvent(e[j]))) { - vlog_error( "Error: clReleaseEvent failed! err: %d\n", error ); + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); goto exit; } - // Fill the result buffer with garbage, so that old results don't carry over + // Fill the result buffer with garbage, so that old results don't carry + // over uint32_t pattern = 0xffffdead; memset_pattern4(out[j], &pattern, buffer_size); - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) )) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueMapBuffer failed! 
err: %d\n", error); goto exit; } // run the kernel - size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; - cl_kernel kernel = job->k[j][thread_id]; //each worker thread has its own copy of the cl_kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel cl_program program = job->programs[j]; - if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; } - if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; } - if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; } - - if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL))) + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 2 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); - if( gSkipCorrectnessTesting ) - return CL_SUCCESS; + if (gSkipCorrectnessTesting) return CL_SUCCESS; - //Calculate the correctly rounded reference result - r = (cl_double *)gOut_Ref + thread_id * buffer_elements; - s = (cl_double *)gIn + 
thread_id * buffer_elements; - s2 = (cl_double *)gIn2 + thread_id * buffer_elements; - for( j = 0; j < buffer_elements; j++ ) - r[j] = (cl_double) func.f_ff( s[j], s2[j] ); + // Calculate the correctly rounded reference result + r = (cl_double *)gOut_Ref + thread_id * buffer_elements; + s = (cl_double *)gIn + thread_id * buffer_elements; + s2 = (cl_double *)gIn2 + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) + r[j] = (cl_double)func.f_ff(s[j], s2[j]); - // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue. - for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ ) + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) { - out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (cl_ulong *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); goto exit; } } // Wait for the last buffer - out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, error); goto exit; } - //Verify data + // Verify data t = (cl_ulong *)r; - for( j = 0; j < buffer_elements; j++ ) + for (j = 0; j < buffer_elements; j++) { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { cl_ulong *q = out[k]; // If we aren't getting the correctly rounded result - if( t[j] != q[j] ) + if (t[j] != q[j]) { - cl_double test = ((cl_double*) q)[j]; - long double correct = func.f_ff( s[j], s2[j] ); - float err = Bruteforce_Ulp_Error_Double( test, correct ); - int fail = ! (fabsf(err) <= ulps); + cl_double test = ((cl_double *)q)[j]; + long double correct = func.f_ff(s[j], s2[j]); + float err = Bruteforce_Ulp_Error_Double(test, correct); + int fail = !(fabsf(err) <= ulps); - if( fail && ftz ) + if (fail && ftz) { // retry per section 6.5.3.2 - if( IsDoubleResultSubnormal(correct, ulps ) ) + if (IsDoubleResultSubnormal(correct, ulps)) { - fail = fail && ( test != 0.0f ); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } // retry per section 6.5.3.3 - if( IsDoubleSubnormal( s[j] ) ) + if (IsDoubleSubnormal(s[j])) { - long double correct2 = func.f_ff( 0.0, s2[j] ); - long double correct3 = func.f_ff( -0.0, s2[j] ); - float err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - float err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; + long double correct2 = func.f_ff(0.0, s2[j]); + long double correct3 = func.f_ff(-0.0, s2[j]); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if( 
IsDoubleResultSubnormal( correct2, ulps ) || IsDoubleResultSubnormal( correct3, ulps ) ) + if (IsDoubleResultSubnormal(correct2, ulps) + || IsDoubleResultSubnormal(correct3, ulps)) { - fail = fail && ( test != 0.0f); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } - //try with both args as zero - if( IsDoubleSubnormal( s2[j] ) ) + // try with both args as zero + if (IsDoubleSubnormal(s2[j])) { - correct2 = func.f_ff( 0.0, 0.0 ); - correct3 = func.f_ff( -0.0, 0.0 ); - long double correct4 = func.f_ff( 0.0, -0.0 ); - long double correct5 = func.f_ff( -0.0, -0.0 ); - err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - float err4 = Bruteforce_Ulp_Error_Double( test, correct4 ); - float err5 = Bruteforce_Ulp_Error_Double( test, correct5 ); - fail = fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)) && - (!(fabsf(err4) <= ulps)) && (!(fabsf(err5) <= ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( fabsf( err4 ) < fabsf(err ) ) - err = err4; - if( fabsf( err5 ) < fabsf(err ) ) - err = err5; + correct2 = func.f_ff(0.0, 0.0); + correct3 = func.f_ff(-0.0, 0.0); + long double correct4 = func.f_ff(0.0, -0.0); + long double correct5 = func.f_ff(-0.0, -0.0); + err2 = Bruteforce_Ulp_Error_Double(test, correct2); + err3 = Bruteforce_Ulp_Error_Double(test, correct3); + float err4 = + Bruteforce_Ulp_Error_Double(test, correct4); + float err5 = + Bruteforce_Ulp_Error_Double(test, correct5); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps)) + && (!(fabsf(err4) <= ulps)) + && (!(fabsf(err5) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correct2, ulps ) || IsDoubleResultSubnormal( 
correct3, ulps ) || - IsDoubleResultSubnormal( correct4, ulps ) || IsDoubleResultSubnormal( correct5, ulps ) ) + if (IsDoubleResultSubnormal(correct2, ulps) + || IsDoubleResultSubnormal(correct3, ulps) + || IsDoubleResultSubnormal(correct4, ulps) + || IsDoubleResultSubnormal(correct5, ulps)) { - fail = fail && ( test != 0.0f); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } } } - else if(IsDoubleSubnormal(s2[j]) ) + else if (IsDoubleSubnormal(s2[j])) { - long double correct2 = func.f_ff( s[j], 0.0 ); - long double correct3 = func.f_ff( s[j], -0.0 ); - float err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - float err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; + long double correct2 = func.f_ff(s[j], 0.0); + long double correct3 = func.f_ff(s[j], -0.0); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correct2, ulps ) || IsDoubleResultSubnormal( correct3, ulps ) ) + if (IsDoubleResultSubnormal(correct2, ulps) + || IsDoubleResultSubnormal(correct3, ulps)) { - fail = fail && ( test != 0.0f); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } } } - if( fabsf(err ) > tinfo->maxError ) + if (fabsf(err) > tinfo->maxError) { tinfo->maxError = fabsf(err); tinfo->maxErrorValue = s[j]; tinfo->maxErrorValue2 = s2[j]; } - if( fail ) + if (fail) { - vlog_error( "\nERROR: %s%s: %f ulp error at {%a, %a}: *%a vs. 
%a\n", name, sizeNames[k], err, s[j], s2[j], r[j], test ); + vlog_error( + "\nERROR: %s%s: %f ulp error at {%a, %a}: *%a vs. %a\n", + name, sizeNames[k], err, s[j], s2[j], r[j], test); error = -1; goto exit; } @@ -1456,36 +1859,36 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data ) } } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) ) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error ); + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); return error; } } - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 3 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); - if( 0 == ( base & 0x0fffffff) ) + if (0 == (base & 0x0fffffff)) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->ulps, job->threadCount); - } else + vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else { - vlog("." ); + vlog("."); } fflush(stdout); } exit: return error; - } - - - - diff --git a/test_conformance/math_brute_force/binary_i.cpp b/test_conformance/math_brute_force/binary_i.cpp index 01f45242..dc6feb8c 100644 --- a/test_conformance/math_brute_force/binary_i.cpp +++ b/test_conformance/math_brute_force/binary_i.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -34,64 +34,83 @@ static int BuildKernelDouble(const char *name, int vectorSize, static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p, bool relaxedMode) { - const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in1, __global int", sizeNames[vectorSize], "* in2 )\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in1[i], in2[i] );\n" - "}\n" - }; + const char *c[] = { "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float", + sizeNames[vectorSize], + "* out, __global float", + sizeNames[vectorSize], + "* in1, __global int", + sizeNames[vectorSize], + "* in2 )\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i] );\n" + "}\n" }; - const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* in, __global int* in2)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " float3 f0 = vload3( 0, in + 3 * i );\n" - " int3 i0 = vload3( 0, in2 + 3 * i );\n" - " f0 = ", name, "( f0, i0 );\n" - " vstore3( f0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two buffer size \n" - " float3 f0;\n" - " int3 i0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (float3)( in[3*i], NAN, NAN ); \n" - " i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n" - " break;\n" - " case 0:\n" - " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" - " i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n" - " break;\n" - " }\n" - " f0 = ", name, "( f0, i0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + const char *c3[] = { + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float* out, __global float* in, __global int* in2)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " float3 f0 = vload3( 0, in + 3 * i );\n" + " int3 i0 = vload3( 0, in2 + 3 * i );\n" + " f0 = ", + name, + "( f0, i0 );\n" + " vstore3( f0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " float3 f0;\n" + " int3 i0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (float3)( in[3*i], NAN, NAN ); \n" + " i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n" + " break;\n" + " case 0:\n" + " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" + " i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n" + " break;\n" + " }\n" + " f0 = ", + name, + "( f0, i0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, relaxedMode); @@ -101,66 +120,85 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p, bool relaxedMode) { - const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in1, __global int", sizeNames[vectorSize], "* in2 )\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in1[i], in2[i] );\n" - "}\n" - }; + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double", + sizeNames[vectorSize], + "* out, __global double", + sizeNames[vectorSize], + "* in1, __global int", + sizeNames[vectorSize], + "* in2 )\n" + "{\n" + " int i = 
get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i] );\n" + "}\n" }; - const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global double* in, __global int* in2)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " double3 d0 = vload3( 0, in + 3 * i );\n" - " int3 i0 = vload3( 0, in2 + 3 * i );\n" - " d0 = ", name, "( d0, i0 );\n" - " vstore3( d0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" - " double3 d0;\n" - " int3 i0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " d0 = (double3)( in[3*i], NAN, NAN ); \n" - " i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n" - " break;\n" - " case 0:\n" - " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" - " i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n" - " break;\n" - " }\n" - " d0 = ", name, "( d0, i0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = d0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = d0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double* out, __global double* in, __global int* in2)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " double3 d0 = vload3( 0, in + 3 * i );\n" + " int3 i0 = vload3( 0, in2 + 3 * i );\n" + " d0 = ", + name, + "( d0, i0 );\n" + " vstore3( d0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " double3 d0;\n" + " int3 i0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " d0 = (double3)( in[3*i], NAN, NAN ); \n" + " i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n" + " break;\n" + " case 0:\n" + " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" + " i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n" + " break;\n" + " }\n" + " d0 = ", + name, + "( d0, i0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = d0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = d0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, relaxedMode); @@ -168,27 +206,31 @@ static int BuildKernelDouble(const char *name, int vectorSize, typedef struct BuildKernelInfo { - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *nameInCode; + cl_uint offset; // the first vector size to build + cl_uint kernel_count; + cl_kernel **kernels; + cl_program *programs; + const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-}BuildKernelInfo; +} BuildKernelInfo; -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernel(info->nameInCode, i, info->kernel_count, info->kernels[i], info->programs + i, info->relaxedMode); } -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernelDouble(info->nameInCode, i, info->kernel_count, info->kernels[i], info->programs + i, @@ -198,85 +240,185 @@ static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, vo // A table of more difficult cases to get right static const float specialValuesFloat[] = { - -NAN, -INFINITY, -FLT_MAX, MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39), MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38), - MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), 
MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), -1000.f, -100.f, -4.0f, -3.5f, - -3.0f, MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), -2.5f, MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), -2.0f, MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), -1.5f, MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), -1.0f, MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25), - MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), -0.5f, MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26), MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), -0.25f, MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27), - MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), -FLT_MIN, MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150), - MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), -0.0f, + -NAN, + -INFINITY, + -FLT_MAX, + MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), + MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), + MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39), + MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), + MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), + MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38), + MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), + MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), + MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), + MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), + MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), + MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), + -1000.f, + -100.f, + -4.0f, + -3.5f, + -3.0f, + MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), + -2.5f, + 
MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), + -2.0f, + MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), + -1.5f, + MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24), + MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), + -1.0f, + MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25), + MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), + -0.5f, + MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26), + MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), + -0.25f, + MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27), + MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), + -FLT_MIN, + MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), + MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), + MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), + MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), + MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), + MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150), + MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), + MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), + MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), + MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), + -0.0f, - +NAN, +INFINITY, +FLT_MAX, MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38), - MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), +1000.f, +100.f, +4.0f, +3.5f, - +3.0f, MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), 2.5f, MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),+2.0f, MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), 1.5f, MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), 
MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), +1.0f, MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25), - MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), +0.5f, MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), +0.25f, MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27), - MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), +FLT_MIN, MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150), - MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), +0.0f + +NAN, + +INFINITY, + +FLT_MAX, + MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), + MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), + MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), + MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), + MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), + MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38), + MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), + MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), + MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), + MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), + MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), + MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), + +1000.f, + +100.f, + +4.0f, + +3.5f, + +3.0f, + MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), + 2.5f, + MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23), + +2.0f, + MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), + 1.5f, + MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), + MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), + +1.0f, + MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25), + MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), + 
+0.5f, + MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), + MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), + +0.25f, + MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27), + MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), + +FLT_MIN, + MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), + MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), + MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), + MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), + MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), + MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150), + MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), + MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), + MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), + MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), + +0.0f }; -static size_t specialValuesFloatCount = sizeof( specialValuesFloat ) / sizeof( specialValuesFloat[0] ); +static size_t specialValuesFloatCount = + sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]); -static const int specialValuesInt[] = { 0, 1, 2, 3, 126, 127, 128, 0x02000001, 0x04000001, 1465264071, 1488522147, - -1, -2, -3, -126, -127, -128, -0x02000001, -0x04000001, -1465264071, -1488522147 }; -static size_t specialValuesIntCount = sizeof( specialValuesInt ) / sizeof( specialValuesInt[0] ); +static const int specialValuesInt[] = { + 0, 1, 2, 3, 126, 127, + 128, 0x02000001, 0x04000001, 1465264071, 1488522147, -1, + -2, -3, -126, -127, -128, -0x02000001, + -0x04000001, -1465264071, -1488522147 +}; +static size_t specialValuesIntCount = + sizeof(specialValuesInt) / sizeof(specialValuesInt[0]); -//Thread specific data for a worker thread +// Thread specific data for a worker thread typedef struct ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem inBuf2; // input buffer for the thread - cl_mem outBuf[ VECTOR_SIZE_COUNT ]; // output buffers for the thread - float maxError; // max error value. Init to 0. 
- double maxErrorValue; // position of the max error value (param 1). Init to 0. - cl_int maxErrorValue2; // position of the max error value (param 2). Init to 0. - MTdata d; - cl_command_queue tQueue; // per thread command queue to improve performance -}ThreadInfo; + cl_mem inBuf; // input buffer for the thread + cl_mem inBuf2; // input buffer for the thread + cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + float maxError; // max error value. Init to 0. + double + maxErrorValue; // position of the max error value (param 1). Init to 0. + cl_int maxErrorValue2; // position of the max error value (param 2). Init + // to 0. + MTdata d; + cl_command_queue tQueue; // per thread command queue to improve performance +} ThreadInfo; typedef struct TestInfo { - size_t subBufferSize; // Size of the sub-buffer in elements - const Func *f; // A pointer to the function info - cl_program programs[ VECTOR_SIZE_COUNT ]; // programs for various vector sizes - cl_kernel *k[VECTOR_SIZE_COUNT ]; // arrays of thread-specific kernels for each worker thread: k[vector_size][thread_id] - ThreadInfo *tinfo; // An array of thread specific information for each worker thread - cl_uint threadCount; // Number of worker threads - cl_uint jobCount; // Number of jobs - cl_uint step; // step between each chunk and the next. 
- cl_uint scale; // stride between individual test values - float ulps; // max_allowed ulps - int ftz; // non-zero if running in flush to zero mode + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes + cl_kernel + *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each + // worker thread: k[vector_size][thread_id] + ThreadInfo * + tinfo; // An array of thread specific information for each worker thread + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + float ulps; // max_allowed ulps + int ftz; // non-zero if running in flush to zero mode // no special values -}TestInfo; +} TestInfo; -static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p ); +static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p); int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode) { - TestInfo test_info; - cl_int error; - size_t i, j; - float maxError = 0.0f; - double maxErrorVal = 0.0; - cl_int maxErrorVal2 = 0; + TestInfo test_info; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; + cl_int maxErrorVal2 = 0; logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); // Init test_info - memset( &test_info, 0, sizeof( test_info ) ); + memset(&test_info, 0, sizeof(test_info)); test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); test_info.scale = getTestScale(sizeof(cl_float)); if (gWimpyMode) { - test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); 
+ test_info.subBufferSize = gWimpyBufferSize + / (sizeof(cl_float) + * RoundUpToNextPowerOfTwo(test_info.threadCount)); } - test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale; + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; if (test_info.step / test_info.subBufferSize != test_info.scale) { - //there was overflow + // there was overflow test_info.jobCount = 1; } else @@ -286,59 +428,82 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode) test_info.f = f; test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps; - test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); - // cl_kernels aren't thread safe, so we make one for each vector size for every thread - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { - size_t array_size = test_info.threadCount * sizeof( cl_kernel ); - test_info.k[i] = (cl_kernel*)malloc( array_size ); - if( NULL == test_info.k[i] ) + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel *)malloc(array_size); + if (NULL == test_info.k[i]) { - vlog_error( "Error: Unable to allocate storage for kernels!\n" ); + vlog_error("Error: Unable to allocate storage for kernels!\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.k[i], 0, array_size ); + memset(test_info.k[i], 0, array_size); } - test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) ); - if( NULL == test_info.tinfo ) + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) { - vlog_error( "Error: Unable to allocate storage for thread specific data.\n" ); + vlog_error( + "Error: Unable 
to allocate storage for thread specific data.\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) ); - for( i = 0; i < test_info.threadCount; i++ ) + memset(test_info.tinfo, 0, + test_info.threadCount * sizeof(*test_info.tinfo)); + for (i = 0; i < test_info.threadCount; i++) { - cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) }; - test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if( error || NULL == test_info.tinfo[i].inBuf) + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_float), + test_info.subBufferSize * sizeof(cl_float) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); goto exit; } - cl_buffer_region region2 = { i * test_info.subBufferSize * sizeof( cl_int), test_info.subBufferSize * sizeof( cl_int) }; - test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region2, &error); - if( error || NULL == test_info.tinfo[i].inBuf) + cl_buffer_region region2 = { i * test_info.subBufferSize + * sizeof(cl_int), + test_info.subBufferSize * sizeof(cl_int) }; + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region2, &error); + if (error || NULL == test_info.tinfo[i].inBuf) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer for 
" + "region {%zd, %zd}\n", + region.origin, region.size); goto exit; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if( error || NULL == test_info.tinfo[i].outBuf[j] ) + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + &region, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); goto exit; } } - test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error); - if( NULL == test_info.tinfo[i].tQueue || error ) + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) { - vlog_error( "clCreateCommandQueue failed. (%d)\n", error ); + vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); goto exit; } test_info.tinfo[i].d = init_genrand(genrand_int32(d)); @@ -350,18 +515,20 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode) gMinVectorSizeIndex, test_info.threadCount, test_info.k, test_info.programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_FloatFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) goto exit; } // Run the kernels - error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info ); + error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info); // Accumulate the arithmetic errors - for( i = 0; i < test_info.threadCount; i++ ) + for (i = 0; i < test_info.threadCount; i++) { - if( test_info.tinfo[i].maxError > maxError ) + if (test_info.tinfo[i].maxError > maxError) { maxError = test_info.tinfo[i].maxError; maxErrorVal = test_info.tinfo[i].maxErrorValue; @@ -369,331 +536,377 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode) } } - if( error ) - goto exit; + if (error) goto exit; - if( ! 
gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input arrays + // Init input arrays uint32_t *p = (uint32_t *)gIn; uint32_t *p2 = (uint32_t *)gIn2; - for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ ) + for (j = 0; j < BUFFER_SIZE / sizeof(float); j++) { p[j] = (genrand_int32(d) & ~0x40000000) | 0x38000000; p2[j] = 3; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + BUFFER_SIZE, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE, gIn2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + BUFFER_SIZE, gIn2, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - size_t vectorSize = sizeof( cl_float ) * sizeValues[j]; - size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; // BUFFER_SIZE / vectorSize rounded up - if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; } - if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; } - if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto 
exit; } + size_t vectorSize = sizeof(cl_float) * sizeValues[j]; + size_t localCount = (BUFFER_SIZE + vectorSize - 1) + / vectorSize; // BUFFER_SIZE / vectorSize rounded up + if ((error = clSetKernelArg(test_info.k[j][0], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 2, + sizeof(gInBuffer2), &gInBuffer2))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( i = 0; i < PERF_LOOP_COUNT; i++ ) + for (i = 0; i < PERF_LOOP_COUNT; i++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], + 1, NULL, &localCount, NULL, + 0, NULL, NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * 
gComputeDevices * gSimdSize * 1e6 + / (BUFFER_SIZE / sizeof(float)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", + f->name, sizeNames[j]); } } - if( ! gSkipCorrectnessTesting ) - vlog( "\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2 ); - vlog( "\n" ); + if (!gSkipCorrectnessTesting) + vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2); + vlog("\n"); exit: - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { clReleaseProgram(test_info.programs[i]); - if( test_info.k[i] ) + if (test_info.k[i]) { - for( j = 0; j < test_info.threadCount; j++ ) + for (j = 0; j < test_info.threadCount; j++) clReleaseKernel(test_info.k[i][j]); - free( test_info.k[i] ); + free(test_info.k[i]); } } - if( test_info.tinfo ) + if (test_info.tinfo) { - for( i = 0; i < test_info.threadCount; i++ ) + for (i = 0; i < test_info.threadCount; i++) { free_mtdata(test_info.tinfo[i].d); clReleaseMemObject(test_info.tinfo[i].inBuf); clReleaseMemObject(test_info.tinfo[i].inBuf2); - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) clReleaseMemObject(test_info.tinfo[i].outBuf[j]); clReleaseCommandQueue(test_info.tinfo[i].tQueue); } - free( test_info.tinfo ); + free(test_info.tinfo); } return error; } -static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data ) +static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data) { - const TestInfo *job = (const TestInfo *) data; - size_t buffer_elements = job->subBufferSize; - size_t buffer_size = buffer_elements * sizeof( cl_float ); - cl_uint base = job_id * (cl_uint) job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; - float ulps = job->ulps; - fptr func = job->f->func; - int ftz = job->ftz; - MTdata d = tinfo->d; - cl_uint j, k; - cl_int error; - const char *name = job->f->name; - cl_uint *t; - cl_float *r,*s; - cl_int *s2; + const 
TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_float); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = job->tinfo + thread_id; + float ulps = job->ulps; + fptr func = job->f->func; + int ftz = job->ftz; + MTdata d = tinfo->d; + cl_uint j, k; + cl_int error; + const char *name = job->f->name; + cl_uint *t; + cl_float *r, *s; + cl_int *s2; // start the map of the output arrays - cl_event e[ VECTOR_SIZE_COUNT ]; - cl_uint *out[ VECTOR_SIZE_COUNT ]; - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + cl_event e[VECTOR_SIZE_COUNT]; + cl_uint *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error); - if( error || NULL == out[j]) + out[j] = (uint32_t *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, + error); return error; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); - //Init input array + // Init input array cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements; cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements; j = 0; - int totalSpecialValueCount = specialValuesFloatCount * specialValuesIntCount; + int totalSpecialValueCount = + specialValuesFloatCount * specialValuesIntCount; int indx = (totalSpecialValueCount - 1) / buffer_elements; - if( job_id <= (cl_uint)indx ) + if (job_id <= (cl_uint)indx) { // test edge cases float *fp = (float *)p; cl_int *ip2 = (cl_int *)p2; uint32_t x, y; - x = (job_id * buffer_elements) % specialValuesFloatCount; - y = (job_id * buffer_elements) / specialValuesFloatCount; + x = (job_id * buffer_elements) % specialValuesFloatCount; + y = (job_id * buffer_elements) / specialValuesFloatCount; - for( ; j < buffer_elements; j++ ) + for (; j < buffer_elements; j++) { - fp[j] = specialValuesFloat[x]; - ip2[j] = specialValuesInt[y]; - if( ++x >= specialValuesFloatCount ) + fp[j] = specialValuesFloat[x]; + ip2[j] = specialValuesInt[y]; + if (++x >= specialValuesFloatCount) { x = 0; y++; - if( y >= specialValuesIntCount ) - break; + if (y >= specialValuesIntCount) break; } } } - //Init any remaining values. - for( ; j < buffer_elements; j++ ) + // Init any remaining values. + for (; j < buffer_elements; j++) { - p[j] = genrand_int32(d); - p2[j] = genrand_int32(d); + p[j] = genrand_int32(d); + p2[j] = genrand_int32(d); } - if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueWriteBuffer failed! 
err: %d\n", error); goto exit; } - if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size, p2, 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); goto exit; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - //Wait for the map to finish - if( (error = clWaitForEvents(1, e + j) )) + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) { - vlog_error( "Error: clWaitForEvents failed! err: %d\n", error ); + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); goto exit; } - if( (error = clReleaseEvent( e[j] ) )) + if ((error = clReleaseEvent(e[j]))) { - vlog_error( "Error: clReleaseEvent failed! err: %d\n", error ); + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); goto exit; } - // Fill the result buffer with garbage, so that old results don't carry over + // Fill the result buffer with garbage, so that old results don't carry + // over uint32_t pattern = 0xffffdead; memset_pattern4(out[j], &pattern, buffer_size); - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) )) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueMapBuffer failed! 
err: %d\n", error); goto exit; } // run the kernel - size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; - cl_kernel kernel = job->k[j][thread_id]; //each worker thread has its own copy of the cl_kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel cl_program program = job->programs[j]; - if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; } - if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; } - if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; } - - if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL))) + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 2 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); - if( gSkipCorrectnessTesting ) - return CL_SUCCESS; + if (gSkipCorrectnessTesting) return CL_SUCCESS; - //Calculate the correctly rounded reference result - r = (float *)gOut_Ref + thread_id * buffer_elements; - s = (float *)gIn + thread_id * 
buffer_elements; - s2 = (cl_int *)gIn2 + thread_id * buffer_elements; - for( j = 0; j < buffer_elements; j++ ) - r[j] = (float) func.f_fi( s[j], s2[j] ); + // Calculate the correctly rounded reference result + r = (float *)gOut_Ref + thread_id * buffer_elements; + s = (float *)gIn + thread_id * buffer_elements; + s2 = (cl_int *)gIn2 + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) r[j] = (float)func.f_fi(s[j], s2[j]); - // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue. - for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ ) + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) { - out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (uint32_t *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); goto exit; } } // Wait for the last buffer - out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (uint32_t *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, error); goto exit; } - //Verify data + // Verify data t = (cl_uint *)r; - for( j = 0; j < buffer_elements; j++ ) + for (j = 0; j < buffer_elements; j++) { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { cl_uint *q = out[k]; // If we aren't getting the correctly rounded result - if( t[j] != q[j] ) + if (t[j] != q[j]) { - float test = ((float*) q)[j]; - double correct = func.f_fi( s[j], s2[j] ); - float err = Ulp_Error( test, correct ); - int fail = ! (fabsf(err) <= ulps); + float test = ((float *)q)[j]; + double correct = func.f_fi(s[j], s2[j]); + float err = Ulp_Error(test, correct); + int fail = !(fabsf(err) <= ulps); - if( fail && ftz ) + if (fail && ftz) { // retry per section 6.5.3.2 - if( IsFloatResultSubnormal(correct, ulps ) ) + if (IsFloatResultSubnormal(correct, ulps)) { - fail = fail && ( test != 0.0f ); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } // retry per section 6.5.3.3 - if( IsFloatSubnormal( s[j] ) ) + if (IsFloatSubnormal(s[j])) { double correct2, correct3; float err2, err3; - correct2 = func.f_fi( 0.0, s2[j] ); - correct3 = func.f_fi( -0.0, s2[j] ); - err2 = Ulp_Error( test, correct2 ); - err3 = Ulp_Error( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; + correct2 = func.f_fi(0.0, s2[j]); + correct3 = func.f_fi(-0.0, s2[j]); + err2 = Ulp_Error(test, correct2); + err3 = Ulp_Error(test, correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if( IsFloatResultSubnormal( correct2, ulps ) || IsFloatResultSubnormal( correct3, ulps ) ) + if (IsFloatResultSubnormal(correct2, ulps) + || IsFloatResultSubnormal(correct3, ulps)) { - 
fail = fail && ( test != 0.0f); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } } } - if( fabsf(err ) > tinfo->maxError ) + if (fabsf(err) > tinfo->maxError) { tinfo->maxError = fabsf(err); tinfo->maxErrorValue = s[j]; tinfo->maxErrorValue2 = s2[j]; } - if( fail ) + if (fail) { vlog_error( "\nERROR: %s%s: %f ulp error at {%a (0x%8.8x), %d}: " @@ -708,89 +921,191 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data ) } } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) ) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error ); + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); return error; } } - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 3 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); - if( 0 == ( base & 0x0fffffff) ) + if (0 == (base & 0x0fffffff)) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->ulps, job->threadCount); - } else + vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else { - vlog("." 
); + vlog("."); } fflush(stdout); } exit: return error; - } - // A table of more difficult cases to get right static const double specialValuesDouble[] = { - -NAN, -INFINITY, -DBL_MAX, MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11), MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10), - MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), -1000., -100., -4.0, -3.5, - -3.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), -2.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), -2.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), -1.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), -1.0, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53), - MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), -0.5, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54), MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), -0.25, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55), - MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), -DBL_MIN, MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, 
-0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), -0.0, + -NAN, + -INFINITY, + -DBL_MAX, + MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), + MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11), + MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), + MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10), + MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), + MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), + MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), + MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), + -1000., + -100., + -4.0, + -3.5, + -3.0, + MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), + -2.5, + MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), + -2.0, + MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), + -1.5, + MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52), + MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), + -1.0, + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53), + MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), + -0.5, + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, 
-0x1fffffffffffffLL, -54), + MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), + -0.25, + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55), + MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), + -DBL_MIN, + MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), + MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), + MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), + MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), + MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), + -0.0, - +NAN, +INFINITY, +DBL_MAX, MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11), MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10), - MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8), MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21), MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22), +1000., 
+100., +4.0, +3.5, - +3.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), +2.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), +2.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), +1.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), +1.0, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53), - MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53), +0.5, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54), MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), +0.25, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55), - MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), +DBL_MIN, MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), +0.0, + +NAN, + +INFINITY, + +DBL_MAX, + MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), + MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, 
+0x1fffffffffffffLL, 11), + MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), + MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10), + MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8), + MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21), + MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21), + MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22), + +1000., + +100., + +4.0, + +3.5, + +3.0, + MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), + +2.5, + MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), + +2.0, + MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), + +1.5, + MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52), + MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), + +1.0, + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53), + MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53), + +0.5, + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54), + MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), + +0.25, + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55), + MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), + +DBL_MIN, + MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), + MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), + 
MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), + +0.0, }; -static size_t specialValuesDoubleCount = sizeof( specialValuesDouble ) / sizeof( specialValuesDouble[0] ); +static size_t specialValuesDoubleCount = + sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]); -static const int specialValuesInt2[] = { 0, 1, 2, 3, 1022, 1023, 1024, INT_MIN, INT_MAX, - -1, -2, -3, -1022, -1023, -11024, -INT_MAX }; -static size_t specialValuesInt2Count = sizeof( specialValuesInt ) / sizeof( specialValuesInt[0] ); +static const int specialValuesInt2[] = { 0, 1, 2, 3, + 1022, 1023, 1024, INT_MIN, + INT_MAX, -1, -2, -3, + -1022, -1023, -11024, -INT_MAX }; +static size_t specialValuesInt2Count = + sizeof(specialValuesInt) / sizeof(specialValuesInt[0]); -static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *p ); +static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p); int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode) { - TestInfo test_info; - cl_int error; - size_t i, j; - float maxError = 0.0f; - double maxErrorVal = 0.0; - cl_int maxErrorVal2 = 0; + TestInfo test_info; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; + cl_int maxErrorVal2 = 0; logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); // Init test_info - memset( &test_info, 0, sizeof( test_info ) ); + memset(&test_info, 0, sizeof(test_info)); test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.subBufferSize = BUFFER_SIZE + / 
(sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); test_info.scale = getTestScale(sizeof(cl_double)); if (gWimpyMode) { - test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.subBufferSize = gWimpyBufferSize + / (sizeof(cl_double) + * RoundUpToNextPowerOfTwo(test_info.threadCount)); } - test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale; + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; if (test_info.step / test_info.subBufferSize != test_info.scale) { - //there was overflow + // there was overflow test_info.jobCount = 1; } else @@ -802,59 +1117,82 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode) test_info.ulps = f->double_ulps; test_info.ftz = f->ftz || gForceFTZ; - // cl_kernels aren't thread safe, so we make one for each vector size for every thread - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { - size_t array_size = test_info.threadCount * sizeof( cl_kernel ); - test_info.k[i] = (cl_kernel*)malloc( array_size ); - if( NULL == test_info.k[i] ) + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel *)malloc(array_size); + if (NULL == test_info.k[i]) { - vlog_error( "Error: Unable to allocate storage for kernels!\n" ); + vlog_error("Error: Unable to allocate storage for kernels!\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.k[i], 0, array_size ); + memset(test_info.k[i], 0, array_size); } - test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) ); - if( NULL == test_info.tinfo ) + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) { - vlog_error( "Error: Unable to 
allocate storage for thread specific data.\n" ); + vlog_error( + "Error: Unable to allocate storage for thread specific data.\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) ); - for( i = 0; i < test_info.threadCount; i++ ) + memset(test_info.tinfo, 0, + test_info.threadCount * sizeof(*test_info.tinfo)); + for (i = 0; i < test_info.threadCount; i++) { - cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) }; - test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if( error || NULL == test_info.tinfo[i].inBuf) + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_double), + test_info.subBufferSize * sizeof(cl_double) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); goto exit; } - cl_buffer_region region2 = { i * test_info.subBufferSize * sizeof( cl_int), test_info.subBufferSize * sizeof( cl_int) }; - test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion2, &error); - if( error || NULL == test_info.tinfo[i].inBuf) + cl_buffer_region region2 = { i * test_info.subBufferSize + * sizeof(cl_int), + test_info.subBufferSize * sizeof(cl_int) }; + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion2, &error); + if (error || NULL == test_info.tinfo[i].inBuf) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", 
region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); goto exit; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - /* Qualcomm fix: 9461 read-write flags must be compatible with parent buffer */ - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + /* Qualcomm fix: 9461 read-write flags must be compatible with + * parent buffer */ + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); /* Qualcomm fix: end */ - if( error || NULL == test_info.tinfo[i].outBuf[j] ) + if (error || NULL == test_info.tinfo[i].outBuf[j]) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); goto exit; } } - test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error); - if( NULL == test_info.tinfo[i].tQueue || error ) + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) { - vlog_error( "clCreateCommandQueue failed. (%d)\n", error ); + vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); goto exit; } @@ -868,19 +1206,21 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode) gMinVectorSizeIndex, test_info.threadCount, test_info.k, test_info.programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_DoubleFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) goto exit; } // Run the kernels - if( !gSkipCorrectnessTesting ) - error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info ); + if (!gSkipCorrectnessTesting) + error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info); // Accumulate the arithmetic errors - for( i = 0; i < test_info.threadCount; i++ ) + for (i = 0; i < test_info.threadCount; i++) { - if( test_info.tinfo[i].maxError > maxError ) + if (test_info.tinfo[i].maxError > maxError) { maxError = test_info.tinfo[i].maxError; maxErrorVal = test_info.tinfo[i].maxErrorValue; @@ -888,334 +1228,386 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode) } } - if( error ) - goto exit; + if (error) goto exit; - if( ! 
gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input arrays + // Init input arrays double *p = (double *)gIn; cl_int *p2 = (cl_int *)gIn2; - for( j = 0; j < BUFFER_SIZE / sizeof( cl_double ); j++ ) + for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++) { p[j] = DoubleFromUInt32(genrand_int32(d)); p2[j] = 3; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + BUFFER_SIZE, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE/2, gIn2, 0, NULL, NULL) )) + if ((error = + clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + BUFFER_SIZE / 2, gIn2, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - size_t vectorSize = sizeof( cl_double ) * sizeValues[j]; - size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; // BUFFER_SIZE / vectorSize rounded up - if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; } - if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; } - if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto 
exit; } + size_t vectorSize = sizeof(cl_double) * sizeValues[j]; + size_t localCount = (BUFFER_SIZE + vectorSize - 1) + / vectorSize; // BUFFER_SIZE / vectorSize rounded up + if ((error = clSetKernelArg(test_info.k[j][0], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 2, + sizeof(gInBuffer2), &gInBuffer2))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( i = 0; i < PERF_LOOP_COUNT; i++ ) + for (i = 0; i < PERF_LOOP_COUNT; i++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], + 1, NULL, &localCount, NULL, + 0, NULL, NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * 
gComputeDevices * gSimdSize * 1e6 + / (BUFFER_SIZE / sizeof(double)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", + f->name, sizeNames[j]); } - for( ; j < gMaxVectorSizeIndex; j++ ) - vlog( "\t -- " ); + for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); } - if( ! gSkipCorrectnessTesting ) - vlog( "\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2 ); - vlog( "\n" ); + if (!gSkipCorrectnessTesting) + vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2); + vlog("\n"); exit: // Release - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { clReleaseProgram(test_info.programs[i]); - if( test_info.k[i] ) + if (test_info.k[i]) { - for( j = 0; j < test_info.threadCount; j++ ) + for (j = 0; j < test_info.threadCount; j++) clReleaseKernel(test_info.k[i][j]); - free( test_info.k[i] ); + free(test_info.k[i]); } } - if( test_info.tinfo ) + if (test_info.tinfo) { - for( i = 0; i < test_info.threadCount; i++ ) + for (i = 0; i < test_info.threadCount; i++) { free_mtdata(test_info.tinfo[i].d); clReleaseMemObject(test_info.tinfo[i].inBuf); clReleaseMemObject(test_info.tinfo[i].inBuf2); - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) clReleaseMemObject(test_info.tinfo[i].outBuf[j]); clReleaseCommandQueue(test_info.tinfo[i].tQueue); } - free( test_info.tinfo ); + free(test_info.tinfo); } return error; } -static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data ) +static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data) { - const TestInfo *job = (const TestInfo *) data; - size_t buffer_elements = job->subBufferSize; - size_t buffer_size = buffer_elements * sizeof( cl_double ); - cl_uint base = job_id * (cl_uint) job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; - float ulps = job->ulps; - dptr func = job->f->dfunc; - int ftz = job->ftz; - MTdata d = 
tinfo->d; - cl_uint j, k; - cl_int error; - const char *name = job->f->name; - cl_ulong *t; - cl_double *r,*s; - cl_int *s2; + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_double); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = job->tinfo + thread_id; + float ulps = job->ulps; + dptr func = job->f->dfunc; + int ftz = job->ftz; + MTdata d = tinfo->d; + cl_uint j, k; + cl_int error; + const char *name = job->f->name; + cl_ulong *t; + cl_double *r, *s; + cl_int *s2; Force64BitFPUPrecision(); // start the map of the output arrays - cl_event e[ VECTOR_SIZE_COUNT ]; - cl_ulong *out[ VECTOR_SIZE_COUNT ]; - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + cl_event e[VECTOR_SIZE_COUNT]; + cl_ulong *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error); - if( error || NULL == out[j]) + out[j] = (cl_ulong *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, + error); return error; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); - //Init input array + // Init input array cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements; cl_int *p2 = (cl_int *)gIn2 + thread_id * buffer_elements; j = 0; - int totalSpecialValueCount = specialValuesDoubleCount * specialValuesInt2Count; + int totalSpecialValueCount = + specialValuesDoubleCount * specialValuesInt2Count; int indx = (totalSpecialValueCount - 1) / buffer_elements; - if( job_id <= (cl_uint)indx ) + if (job_id <= (cl_uint)indx) { // test edge cases cl_double *fp = (cl_double *)p; cl_int *ip2 = (cl_int *)p2; uint32_t x, y; - x = (job_id * buffer_elements) % specialValuesDoubleCount; - y = (job_id * buffer_elements) / specialValuesDoubleCount; + x = (job_id * buffer_elements) % specialValuesDoubleCount; + y = (job_id * buffer_elements) / specialValuesDoubleCount; - for( ; j < buffer_elements; j++ ) + for (; j < buffer_elements; j++) { fp[j] = specialValuesDouble[x]; ip2[j] = specialValuesInt2[y]; - if( ++x >= specialValuesDoubleCount ) + if (++x >= specialValuesDoubleCount) { x = 0; y++; - if( y >= specialValuesInt2Count ) - break; + if (y >= specialValuesInt2Count) break; } } } - //Init any remaining values. - for( ; j < buffer_elements; j++ ) + // Init any remaining values. + for (; j < buffer_elements; j++) { p[j] = DoubleFromUInt32(genrand_int32(d)); p2[j] = genrand_int32(d); } - if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueWriteBuffer failed! 
err: %d\n", error); goto exit; } - if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size/2, p2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size / 2, p2, 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); goto exit; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - //Wait for the map to finish - if( (error = clWaitForEvents(1, e + j) )) + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) { - vlog_error( "Error: clWaitForEvents failed! err: %d\n", error ); + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); goto exit; } - if( (error = clReleaseEvent( e[j] ) )) + if ((error = clReleaseEvent(e[j]))) { - vlog_error( "Error: clReleaseEvent failed! err: %d\n", error ); + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); goto exit; } - // Fill the result buffer with garbage, so that old results don't carry over + // Fill the result buffer with garbage, so that old results don't carry + // over uint32_t pattern = 0xffffdead; memset_pattern4(out[j], &pattern, buffer_size); - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) )) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueMapBuffer failed! 
err: %d\n", error); goto exit; } // run the kernel - size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; - cl_kernel kernel = job->k[j][thread_id]; //each worker thread has its own copy of the cl_kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel cl_program program = job->programs[j]; - if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; } - if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; } - if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; } - - if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL))) + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 2 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); - if( gSkipCorrectnessTesting ) - return CL_SUCCESS; + if (gSkipCorrectnessTesting) return CL_SUCCESS; - //Calculate the correctly rounded reference result - r = (cl_double *)gOut_Ref + thread_id * buffer_elements; - s = (cl_double *)gIn + 
thread_id * buffer_elements; - s2 = (cl_int *)gIn2 + thread_id * buffer_elements; - for( j = 0; j < buffer_elements; j++ ) - r[j] = (cl_double) func.f_fi( s[j], s2[j] ); + // Calculate the correctly rounded reference result + r = (cl_double *)gOut_Ref + thread_id * buffer_elements; + s = (cl_double *)gIn + thread_id * buffer_elements; + s2 = (cl_int *)gIn2 + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) + r[j] = (cl_double)func.f_fi(s[j], s2[j]); - // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue. - for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ ) + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) { - out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (cl_ulong *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); goto exit; } } // Wait for the last buffer - out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, error); goto exit; } - //Verify data + // Verify data t = (cl_ulong *)r; - for( j = 0; j < buffer_elements; j++ ) + for (j = 0; j < buffer_elements; j++) { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { cl_ulong *q = out[k]; // If we aren't getting the correctly rounded result - if( t[j] != q[j] ) + if (t[j] != q[j]) { - cl_double test = ((cl_double*) q)[j]; - long double correct = func.f_fi( s[j], s2[j] ); - float err = Bruteforce_Ulp_Error_Double( test, correct ); - int fail = ! (fabsf(err) <= ulps); + cl_double test = ((cl_double *)q)[j]; + long double correct = func.f_fi(s[j], s2[j]); + float err = Bruteforce_Ulp_Error_Double(test, correct); + int fail = !(fabsf(err) <= ulps); - if( fail && ftz ) + if (fail && ftz) { // retry per section 6.5.3.2 - if( IsDoubleResultSubnormal(correct, ulps ) ) + if (IsDoubleResultSubnormal(correct, ulps)) { - fail = fail && ( test != 0.0f ); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } // retry per section 6.5.3.3 - if( IsDoubleSubnormal( s[j] ) ) + if (IsDoubleSubnormal(s[j])) { - long double correct2 = func.f_fi( 0.0, s2[j] ); - long double correct3 = func.f_fi( -0.0, s2[j] ); - float err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - float err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; + long double correct2 = func.f_fi(0.0, s2[j]); + long double correct3 = func.f_fi(-0.0, s2[j]); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if( 
IsDoubleResultSubnormal( correct2, ulps ) || IsDoubleResultSubnormal( correct3, ulps ) ) + if (IsDoubleResultSubnormal(correct2, ulps) + || IsDoubleResultSubnormal(correct3, ulps)) { - fail = fail && ( test != 0.0f); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } } } - if( fabsf(err ) > tinfo->maxError ) + if (fabsf(err) > tinfo->maxError) { tinfo->maxError = fabsf(err); tinfo->maxErrorValue = s[j]; tinfo->maxErrorValue2 = s2[j]; } - if( fail ) + if (fail) { - vlog_error( "\nERROR: %s%s: %f ulp error at {%.13la, %d}: *%.13la vs. %.13la\n", name, sizeNames[k], err, s[j], s2[j], r[j], test ); + vlog_error("\nERROR: %s%s: %f ulp error at {%.13la, %d}: " + "*%.13la vs. %.13la\n", + name, sizeNames[k], err, s[j], s2[j], r[j], + test); error = -1; goto exit; } @@ -1223,35 +1615,36 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data ) } } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) ) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error ); + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! 
err: %d\n", + j, error); return error; } } - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 3 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); - if( 0 == ( base & 0x0fffffff) ) + if (0 == (base & 0x0fffffff)) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->ulps, job->threadCount); - } else + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else { - vlog("." ); - } - fflush(stdout); + vlog("."); + } + fflush(stdout); } exit: return error; - } - - - diff --git a/test_conformance/math_brute_force/binary_two_results_i.cpp b/test_conformance/math_brute_force/binary_two_results_i.cpp index af1b04d1..5065b280 100644 --- a/test_conformance/math_brute_force/binary_two_results_i.cpp +++ b/test_conformance/math_brute_force/binary_two_results_i.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -36,68 +36,90 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p, bool relaxedMode) { - const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global int", sizeNames[vectorSize], "* out2, __global float", sizeNames[vectorSize], "* in1, __global float", sizeNames[vectorSize], "* in2)\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in1[i], in2[i], out2 + i );\n" - "}\n" - }; + const char *c[] = { "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float", + sizeNames[vectorSize], + "* out, __global int", + sizeNames[vectorSize], + "* out2, __global float", + sizeNames[vectorSize], + "* in1, __global float", + sizeNames[vectorSize], + "* in2)\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i], out2 + i );\n" + "}\n" }; - const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global int* out2, __global float* in, __global float* in2)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " float3 f0 = vload3( 0, in + 3 * i );\n" - " float3 f1 = vload3( 0, in2 + 3 * i );\n" - " int3 i0 = 0xdeaddead;\n" - " f0 = ", name, "( f0, f1, &i0 );\n" - " vstore3( f0, 0, out + 3*i );\n" - " vstore3( i0, 0, out2 + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two buffer size \n" - " float3 f0, f1;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (float3)( in[3*i], NAN, NAN ); \n" - " f1 = (float3)( in2[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" - " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " int3 i0 = 0xdeaddead;\n" - " f0 = ", name, "( f0, f1, &i0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " out2[3*i+1] = i0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " out2[3*i] = i0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + const char *c3[] = { + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float* out, __global int* out2, __global float* in, " + "__global float* in2)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " float3 f0 = vload3( 0, in + 3 * i );\n" + " float3 f1 = vload3( 0, in2 + 3 * i );\n" + " int3 i0 = 0xdeaddead;\n" + " f0 = ", + name, + "( f0, f1, &i0 );\n" + " vstore3( f0, 0, out + 3*i );\n" + " vstore3( i0, 0, out2 + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " float3 f0, f1;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (float3)( in[3*i], NAN, NAN ); \n" + " f1 = (float3)( in2[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" + " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " int3 i0 = 0xdeaddead;\n" + " f0 = ", + name, + "( f0, f1, &i0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " out2[3*i+1] = i0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " out2[3*i] = i0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } @@ -106,95 +128,121 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, cl_program *p, bool relaxedMode) { const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global int", sizeNames[vectorSize], "* out2, __global double", sizeNames[vectorSize], "* in1, __global double", sizeNames[vectorSize], "* in2)\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in1[i], in2[i], out2 + i );\n" - "}\n" - }; + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double", + sizeNames[vectorSize], + "* out, __global int", + sizeNames[vectorSize], + "* out2, __global double", + sizeNames[vectorSize], + "* in1, 
__global double", + sizeNames[vectorSize], + "* in2)\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i], out2 + i );\n" + "}\n" }; - const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global int* out2, __global double* in, __global double* in2)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " double3 d0 = vload3( 0, in + 3 * i );\n" - " double3 d1 = vload3( 0, in2 + 3 * i );\n" - " int3 i0 = 0xdeaddead;\n" - " d0 = ", name, "( d0, d1, &i0 );\n" - " vstore3( d0, 0, out + 3*i );\n" - " vstore3( i0, 0, out2 + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" - " double3 d0, d1;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " d0 = (double3)( in[3*i], NAN, NAN ); \n" - " d1 = (double3)( in2[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" - " d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " int3 i0 = 0xdeaddead;\n" - " d0 = ", name, "( d0, d1, &i0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = d0.y; \n" - " out2[3*i+1] = i0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = d0.x; \n" - " out2[3*i] = i0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double* out, __global int* out2, __global double* in, " + "__global double* in2)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " double3 d0 = vload3( 0, in + 3 * i );\n" + " double3 d1 = vload3( 0, in2 + 3 * i );\n" + " int3 i0 = 0xdeaddead;\n" + " d0 = ", + name, + "( d0, d1, &i0 );\n" 
+ " vstore3( d0, 0, out + 3*i );\n" + " vstore3( i0, 0, out2 + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two " + "buffer size \n" + " double3 d0, d1;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " d0 = (double3)( in[3*i], NAN, NAN ); \n" + " d1 = (double3)( in2[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" + " d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " int3 i0 = 0xdeaddead;\n" + " d0 = ", + name, + "( d0, d1, &i0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = d0.y; \n" + " out2[3*i+1] = i0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = d0.x; \n" + " out2[3*i] = i0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } typedef struct BuildKernelInfo { - cl_uint offset; // the first vector size to build - cl_kernel *kernels; - cl_program *programs; - const char *nameInCode; + cl_uint offset; // the first vector size to build + cl_kernel *kernels; + cl_program *programs; + const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-}BuildKernelInfo; +} BuildKernelInfo; -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernel(info->nameInCode, i, info->kernels + i, info->programs + i, info->relaxedMode); } -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernelDouble(info->nameInCode, i, info->kernels + i, info->programs + i, info->relaxedMode); @@ -207,7 +255,7 @@ typedef struct ComputeReferenceInfoF_ const float *y; float *r; int *i; - double (*f_ffpI)(double, double, int*); + double (*f_ffpI)(double, double, int *); cl_uint lim; cl_uint count; } ComputeReferenceInfoF; @@ -218,13 +266,12 @@ typedef struct ComputeReferenceInfoD_ const double *y; double *r; int *i; - long double (*f_ffpI)(long double, long double, int*); + long double (*f_ffpI)(long double, long double, int *); cl_uint lim; cl_uint count; } ComputeReferenceInfoD; -static cl_int -ReferenceF(cl_uint jid, cl_uint tid, void *userInfo) +static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo) { ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo; cl_uint lim = cri->lim; @@ -237,17 +284,15 @@ ReferenceF(cl_uint jid, cl_uint 
tid, void *userInfo) double (*f)(double, double, int *) = cri->f_ffpI; cl_uint j; - if (off + count > lim) - count = lim - off; + if (off + count > lim) count = lim - off; for (j = 0; j < count; ++j) - r[j] = (float)f((double)x[j], (double)y[j], i + j); + r[j] = (float)f((double)x[j], (double)y[j], i + j); return CL_SUCCESS; } -static cl_int -ReferenceD(cl_uint jid, cl_uint tid, void *userInfo) +static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo) { ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo; cl_uint lim = cri->lim; @@ -260,13 +305,12 @@ ReferenceD(cl_uint jid, cl_uint tid, void *userInfo) long double (*f)(long double, long double, int *) = cri->f_ffpI; cl_uint j; - if (off + count > lim) - count = lim - off; + if (off + count > lim) count = lim - off; Force64BitFPUPrecision(); for (j = 0; j < count; ++j) - r[j] = (double)f((long double)x[j], (long double)y[j], i + j); + r[j] = (double)f((long double)x[j], (long double)y[j], i + j); return CL_SUCCESS; } @@ -278,15 +322,15 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode) uint64_t i; uint32_t j, k; int error; - cl_program programs[ VECTOR_SIZE_COUNT ]; - cl_kernel kernels[ VECTOR_SIZE_COUNT ]; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError = 0.0f; float float_ulps; int64_t maxError2 = 0; int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); float maxErrorVal = 0.0f; float maxErrorVal2 = 0.0f; - size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE; + size_t bufferSize = (gWimpyMode) ? 
gWimpyBufferSize : BUFFER_SIZE; uint64_t step = getTestStep(sizeof(float), bufferSize); #if defined PARALLEL_REFERENCE @@ -294,7 +338,7 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode) #endif logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); - if( gIsEmbedded ) + if (gIsEmbedded) float_ulps = f->float_embedded_ulps; else float_ulps = f->float_ulps; @@ -305,392 +349,480 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode) { BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_FloatFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) return error; } -/* - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) - if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) ) - return error; -*/ + /* + for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, + programs + i) ) ) return error; + */ - for( i = 0; i < (1ULL<<32); i += step ) + for (i = 0; i < (1ULL << 32); i += step) { - //Init input array + // Init input array cl_uint *p = (cl_uint *)gIn; cl_uint *p2 = (cl_uint *)gIn2; - for( j = 0; j < bufferSize / sizeof( float ); j++ ) + for (j = 0; j < bufferSize / sizeof(float); j++) { p[j] = genrand_int32(d); p2[j] = genrand_int32(d); } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0, bufferSize, gIn2, 0, NULL, NULL) )) + if 
((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0, + bufferSize, gIn2, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); return error; } // write garbage into output arrays - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { uint32_t pattern = 0xffffdead; memset_pattern4(gOut[j], &pattern, bufferSize); - if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) )) + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); goto exit; } memset_pattern4(gOut2[j], &pattern, bufferSize); - if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL) )) + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", + error, j); goto exit; } } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - size_t vectorSize = sizeof( cl_float ) * sizeValues[j]; - size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; // bufferSize / vectorSize rounded up - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer 
), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; } - - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + size_t vectorSize = sizeof(cl_float) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), + &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2), + &gInBuffer2))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(gQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); // Calculate the correctly rounded reference result float *s = (float *)gIn; float *s2 = (float *)gIn2; #if defined PARALLEL_REFERENCE - if (threadCount > 1) { - ComputeReferenceInfoF cri; - cri.x = s; - cri.y = s2; - cri.r = (float *)gOut_Ref; - cri.i = (int *)gOut_Ref2; - cri.f_ffpI = f->func.f_ffpI; - cri.lim = bufferSize / sizeof( float ); - cri.count = (cri.lim + threadCount - 1) / threadCount; - ThreadPool_Do(ReferenceF, threadCount, &cri); - } else { + if (threadCount > 1) + { + ComputeReferenceInfoF cri; + cri.x = s; + cri.y = s2; + cri.r = (float *)gOut_Ref; + cri.i = (int *)gOut_Ref2; + cri.f_ffpI = 
f->func.f_ffpI; + cri.lim = bufferSize / sizeof(float); + cri.count = (cri.lim + threadCount - 1) / threadCount; + ThreadPool_Do(ReferenceF, threadCount, &cri); + } + else + { #endif float *r = (float *)gOut_Ref; int *r2 = (int *)gOut_Ref2; - for( j = 0; j < bufferSize / sizeof( float ); j++ ) - r[j] = (float) f->func.f_ffpI( s[j], s2[j], r2+j ); + for (j = 0; j < bufferSize / sizeof(float); j++) + r[j] = (float)f->func.f_ffpI(s[j], s2[j], r2 + j); #if defined PARALLEL_REFERENCE - } + } #endif // Read the data back - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) ) + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "ReadArray failed %d\n", error ); + vlog_error("ReadArray failed %d\n", error); goto exit; } - if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) ) + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) { - vlog_error( "ReadArray2 failed %d\n", error ); + vlog_error("ReadArray2 failed %d\n", error); goto exit; } } - if( gSkipCorrectnessTesting ) - break; + if (gSkipCorrectnessTesting) break; - //Verify data + // Verify data uint32_t *t = (uint32_t *)gOut_Ref; int32_t *t2 = (int32_t *)gOut_Ref2; - for( j = 0; j < bufferSize / sizeof( float ); j++ ) + for (j = 0; j < bufferSize / sizeof(float); j++) { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { uint32_t *q = (uint32_t *)gOut[k]; int32_t *q2 = (int32_t *)gOut2[k]; // Check for exact match to correctly rounded result - if (t[j] == q[j] && t2[j] == q2[j]) - continue; + if (t[j] == q[j] && t2[j] == q2[j]) continue; - // Check for paired NaNs - if ((t[j] & 
0x7fffffff) > 0x7f800000 && (q[j] & 0x7fffffff) > 0x7f800000 && t2[j] == q2[j]) - continue; + // Check for paired NaNs + if ((t[j] & 0x7fffffff) > 0x7f800000 + && (q[j] & 0x7fffffff) > 0x7f800000 && t2[j] == q2[j]) + continue; // if( t[j] != q[j] || t2[j] != q2[j] ) { - float test = ((float*) q)[j]; + float test = ((float *)q)[j]; int correct2 = INT_MIN; - double correct = f->func.f_ffpI( s[j], s2[j], &correct2 ); - float err = Ulp_Error( test, correct ); + double correct = f->func.f_ffpI(s[j], s2[j], &correct2); + float err = Ulp_Error(test, correct); int64_t iErr; - // in case of remquo, we only care about the sign and last seven bits of - // integer as per the spec. - if(testingRemquo) - iErr = (long long) (q2[j] & 0x0000007f) - (long long) (correct2 & 0x0000007f); + // in case of remquo, we only care about the sign and last + // seven bits of integer as per the spec. + if (testingRemquo) + iErr = (long long)(q2[j] & 0x0000007f) + - (long long)(correct2 & 0x0000007f); else - iErr = (long long) q2[j] - (long long) correct2; + iErr = (long long)q2[j] - (long long)correct2; - //For remquo, if y = 0, x is infinite, or either is NaN then the standard either neglects - //to say what is returned in iptr or leaves it undefined or implementation defined. - int iptrUndefined = fabs(((float*) gIn)[j]) == INFINITY || - ((float*) gIn2)[j] == 0.0f || - isnan(((float*) gIn2)[j]) || - isnan(((float*) gIn)[j]); - if(iptrUndefined) - iErr = 0; + // For remquo, if y = 0, x is infinite, or either is NaN + // then the standard either neglects to say what is returned + // in iptr or leaves it undefined or implementation defined. + int iptrUndefined = fabs(((float *)gIn)[j]) == INFINITY + || ((float *)gIn2)[j] == 0.0f + || isnan(((float *)gIn2)[j]) + || isnan(((float *)gIn)[j]); + if (iptrUndefined) iErr = 0; - int fail = ! 
(fabsf(err) <= float_ulps && iErr == 0 ); - if( ftz && fail ) + int fail = !(fabsf(err) <= float_ulps && iErr == 0); + if (ftz && fail) { // retry per section 6.5.3.2 - if( IsFloatResultSubnormal(correct, float_ulps ) ) + if (IsFloatResultSubnormal(correct, float_ulps)) { - fail = fail && ! ( test == 0.0f && iErr == 0 ); - if( ! fail ) - err = 0.0f; + fail = fail && !(test == 0.0f && iErr == 0); + if (!fail) err = 0.0f; } // retry per section 6.5.3.3 - if( IsFloatSubnormal( s[j] ) ) + if (IsFloatSubnormal(s[j])) { int correct3i, correct4i; - double correct3 = f->func.f_ffpI( 0.0, s2[j], &correct3i ); - double correct4 = f->func.f_ffpI( -0.0, s2[j], &correct4i ); - float err2 = Ulp_Error( test, correct3 ); - float err3 = Ulp_Error( test, correct4 ); - int64_t iErr3 = (long long) q2[j] - (long long) correct3i; - int64_t iErr4 = (long long) q2[j] - (long long) correct4i; - fail = fail && ((!(fabsf(err2) <= float_ulps && iErr3 == 0)) && (!(fabsf(err3) <= float_ulps && iErr4 == 0))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( llabs(iErr3) < llabs( iErr ) ) - iErr = iErr3; - if( llabs(iErr4) < llabs( iErr ) ) - iErr = iErr4; + double correct3 = + f->func.f_ffpI(0.0, s2[j], &correct3i); + double correct4 = + f->func.f_ffpI(-0.0, s2[j], &correct4i); + float err2 = Ulp_Error(test, correct3); + float err3 = Ulp_Error(test, correct4); + int64_t iErr3 = + (long long)q2[j] - (long long)correct3i; + int64_t iErr4 = + (long long)q2[j] - (long long)correct4i; + fail = fail + && ((!(fabsf(err2) <= float_ulps && iErr3 == 0)) + && (!(fabsf(err3) <= float_ulps + && iErr4 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; // retry per section 6.5.3.4 - if( IsFloatResultSubnormal(correct2, float_ulps ) || IsFloatResultSubnormal(correct3, float_ulps ) ) + if 
(IsFloatResultSubnormal(correct2, float_ulps) + || IsFloatResultSubnormal(correct3, float_ulps)) { - fail = fail && ! ( test == 0.0f && (iErr3 == 0 || iErr4 == 0) ); - if( ! fail ) - err = 0.0f; + fail = fail + && !(test == 0.0f + && (iErr3 == 0 || iErr4 == 0)); + if (!fail) err = 0.0f; } - //try with both args as zero - if( IsFloatSubnormal( s2[j] ) ) + // try with both args as zero + if (IsFloatSubnormal(s2[j])) { int correct7i, correct8i; - correct3 = f->func.f_ffpI( 0.0, 0.0, &correct3i ); - correct4 = f->func.f_ffpI( -0.0, 0.0, &correct4i ); - double correct7 = f->func.f_ffpI( 0.0, -0.0, &correct7i ); - double correct8 = f->func.f_ffpI( -0.0, -0.0, &correct8i ); - err2 = Ulp_Error( test, correct3 ); - err3 = Ulp_Error( test, correct4 ); - float err4 = Ulp_Error( test, correct7 ); - float err5 = Ulp_Error( test, correct8 ); - iErr3 = (long long) q2[j] - (long long) correct3i; - iErr4 = (long long) q2[j] - (long long) correct4i; - int64_t iErr7 = (long long) q2[j] - (long long) correct7i; - int64_t iErr8 = (long long) q2[j] - (long long) correct8i; - fail = fail && ((!(fabsf(err2) <= float_ulps && iErr3 == 0)) && (!(fabsf(err3) <= float_ulps && iErr4 == 0)) && - (!(fabsf(err4) <= float_ulps && iErr7 == 0)) && (!(fabsf(err5) <= float_ulps && iErr8 == 0))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( fabsf( err4 ) < fabsf(err ) ) - err = err4; - if( fabsf( err5 ) < fabsf(err ) ) - err = err5; - if( llabs(iErr3) < llabs( iErr ) ) - iErr = iErr3; - if( llabs(iErr4) < llabs( iErr ) ) - iErr = iErr4; - if( llabs(iErr7) < llabs( iErr ) ) - iErr = iErr7; - if( llabs(iErr8) < llabs( iErr ) ) - iErr = iErr8; + correct3 = f->func.f_ffpI(0.0, 0.0, &correct3i); + correct4 = + f->func.f_ffpI(-0.0, 0.0, &correct4i); + double correct7 = + f->func.f_ffpI(0.0, -0.0, &correct7i); + double correct8 = + f->func.f_ffpI(-0.0, -0.0, &correct8i); + err2 = Ulp_Error(test, correct3); + err3 = Ulp_Error(test, correct4); + float 
err4 = Ulp_Error(test, correct7); + float err5 = Ulp_Error(test, correct8); + iErr3 = (long long)q2[j] - (long long)correct3i; + iErr4 = (long long)q2[j] - (long long)correct4i; + int64_t iErr7 = + (long long)q2[j] - (long long)correct7i; + int64_t iErr8 = + (long long)q2[j] - (long long)correct8i; + fail = fail + && ((!(fabsf(err2) <= float_ulps + && iErr3 == 0)) + && (!(fabsf(err3) <= float_ulps + && iErr4 == 0)) + && (!(fabsf(err4) <= float_ulps + && iErr7 == 0)) + && (!(fabsf(err5) <= float_ulps + && iErr8 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; + if (llabs(iErr7) < llabs(iErr)) iErr = iErr7; + if (llabs(iErr8) < llabs(iErr)) iErr = iErr8; // retry per section 6.5.3.4 - if( IsFloatResultSubnormal(correct3, float_ulps ) || IsFloatResultSubnormal(correct4, float_ulps ) || - IsFloatResultSubnormal(correct7, float_ulps ) || IsFloatResultSubnormal(correct8, float_ulps ) ) + if (IsFloatResultSubnormal(correct3, float_ulps) + || IsFloatResultSubnormal(correct4, + float_ulps) + || IsFloatResultSubnormal(correct7, + float_ulps) + || IsFloatResultSubnormal(correct8, + float_ulps)) { - fail = fail && ! ( test == 0.0f && (iErr3 == 0 || iErr4 == 0 || iErr7 == 0 || iErr8 == 0)); - if( ! 
fail ) - err = 0.0f; + fail = fail + && !(test == 0.0f + && (iErr3 == 0 || iErr4 == 0 + || iErr7 == 0 || iErr8 == 0)); + if (!fail) err = 0.0f; } } } - else if( IsFloatSubnormal( s2[j] ) ) + else if (IsFloatSubnormal(s2[j])) { int correct3i, correct4i; - double correct3 = f->func.f_ffpI( s[j], 0.0, &correct3i ); - double correct4 = f->func.f_ffpI( s[j], -0.0, &correct4i ); - float err2 = Ulp_Error( test, correct3 ); - float err3 = Ulp_Error( test, correct4 ); - int64_t iErr3 = (long long) q2[j] - (long long) correct3i; - int64_t iErr4 = (long long) q2[j] - (long long) correct4i; - fail = fail && ((!(fabsf(err2) <= float_ulps && iErr3 == 0)) && (!(fabsf(err3) <= float_ulps && iErr4 == 0))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( llabs(iErr3) < llabs( iErr ) ) - iErr = iErr3; - if( llabs(iErr4) < llabs( iErr ) ) - iErr = iErr4; + double correct3 = + f->func.f_ffpI(s[j], 0.0, &correct3i); + double correct4 = + f->func.f_ffpI(s[j], -0.0, &correct4i); + float err2 = Ulp_Error(test, correct3); + float err3 = Ulp_Error(test, correct4); + int64_t iErr3 = + (long long)q2[j] - (long long)correct3i; + int64_t iErr4 = + (long long)q2[j] - (long long)correct4i; + fail = fail + && ((!(fabsf(err2) <= float_ulps && iErr3 == 0)) + && (!(fabsf(err3) <= float_ulps + && iErr4 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; // retry per section 6.5.3.4 - if( IsFloatResultSubnormal(correct2, float_ulps ) || IsFloatResultSubnormal(correct3, float_ulps ) ) + if (IsFloatResultSubnormal(correct2, float_ulps) + || IsFloatResultSubnormal(correct3, float_ulps)) { - fail = fail && ! ( test == 0.0f && (iErr3 == 0 || iErr4 == 0) ); - if( ! 
fail ) - err = 0.0f; + fail = fail + && !(test == 0.0f + && (iErr3 == 0 || iErr4 == 0)); + if (!fail) err = 0.0f; } } } - if( fabsf(err ) > maxError ) + if (fabsf(err) > maxError) { maxError = fabsf(err); maxErrorVal = s[j]; } - if( llabs(iErr) > maxError2 ) + if (llabs(iErr) > maxError2) { - maxError2 = llabs(iErr ); + maxError2 = llabs(iErr); maxErrorVal2 = s[j]; } - if( fail ) + if (fail) { - vlog_error( "\nERROR: %s%s: {%f, %lld} ulp error at {%a, %a} ({0x%8.8x, 0x%8.8x}): *{%a, %d} ({0x%8.8x, 0x%8.8x}) vs. {%a, %d} ({0x%8.8x, 0x%8.8x})\n", - f->name, sizeNames[k], err, iErr, - ((float*) gIn)[j], ((float*) gIn2)[j], - ((cl_uint*) gIn)[j], ((cl_uint*) gIn2)[j], - ((float*) gOut_Ref)[j], ((int*) gOut_Ref2)[j], - ((cl_uint*) gOut_Ref)[j], ((cl_uint*) gOut_Ref2)[j], - test, q2[j], - ((cl_uint*)&test)[0], ((cl_uint*) q2)[j] ); - error = -1; - goto exit; + vlog_error( + "\nERROR: %s%s: {%f, %lld} ulp error at {%a, %a} " + "({0x%8.8x, 0x%8.8x}): *{%a, %d} ({0x%8.8x, " + "0x%8.8x}) vs. {%a, %d} ({0x%8.8x, 0x%8.8x})\n", + f->name, sizeNames[k], err, iErr, ((float *)gIn)[j], + ((float *)gIn2)[j], ((cl_uint *)gIn)[j], + ((cl_uint *)gIn2)[j], ((float *)gOut_Ref)[j], + ((int *)gOut_Ref2)[j], ((cl_uint *)gOut_Ref)[j], + ((cl_uint *)gOut_Ref2)[j], test, q2[j], + ((cl_uint *)&test)[0], ((cl_uint *)q2)[j]); + error = -1; + goto exit; } } } } - if( 0 == (i & 0x0fffffff) ) + if (0 == (i & 0x0fffffff)) { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize); - } else - { - vlog("." ); - } - fflush(stdout); - + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, + bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); } } - if( ! 
gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input array + // Init input array uint32_t *p = (uint32_t *)gIn; - for( j = 0; j < bufferSize / sizeof( float ); j++ ) + for (j = 0; j < bufferSize / sizeof(float); j++) p[j] = genrand_int32(d); - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + bufferSize, gIn2, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - size_t vectorSize = sizeof( cl_float ) * sizeValues[j]; - size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; // bufferSize / vectorSize rounded up - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto 
exit; } + size_t vectorSize = sizeof(cl_float) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), + &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2), + &gInBuffer2))) + { + LogBuildError(programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( k = 0; k < PERF_LOOP_COUNT; k++ ) + for (k = 0; k < PERF_LOOP_COUNT; k++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / 
PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(float)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", + f->name, sizeNames[j]); } } - if( ! gSkipCorrectnessTesting ) - vlog( "\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal ); - vlog( "\n" ); + if (!gSkipCorrectnessTesting) + vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal); + vlog("\n"); exit: // Release - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); clReleaseProgram(programs[k]); @@ -704,14 +836,14 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode) uint64_t i; uint32_t j, k; int error; - cl_program programs[ VECTOR_SIZE_COUNT ]; - cl_kernel kernels[ VECTOR_SIZE_COUNT ]; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError = 0.0f; int64_t maxError2 = 0; int ftz = f->ftz || gForceFTZ; double maxErrorVal = 0.0f; double maxErrorVal2 = 0.0f; - size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE; + size_t bufferSize = (gWimpyMode) ? 
gWimpyBufferSize : BUFFER_SIZE; uint64_t step = getTestStep(sizeof(double), bufferSize); logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); @@ -728,400 +860,504 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode) { BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_DoubleFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_DoubleFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) { return error; } } -/* - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) - if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) ) - return error; -*/ + /* + for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + + i, programs + i) ) ) return error; + */ - for( i = 0; i < (1ULL<<32); i += step ) + for (i = 0; i < (1ULL << 32); i += step) { - //Init input array + // Init input array double *p = (double *)gIn; double *p2 = (double *)gIn2; - for( j = 0; j < bufferSize / sizeof( double ); j++ ) + for (j = 0; j < bufferSize / sizeof(double); j++) { p[j] = DoubleFromUInt32(genrand_int32(d)); p2[j] = DoubleFromUInt32(genrand_int32(d)); } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0, bufferSize, gIn2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0, + bufferSize, gIn2, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error ); + 
vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); return error; } // write garbage into output arrays - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { uint32_t pattern = 0xffffdead; memset_pattern4(gOut[j], &pattern, bufferSize); - if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL) )) + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); goto exit; } memset_pattern4(gOut2[j], &pattern, bufferSize); - if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL) )) + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", + error, j); goto exit; } } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - size_t vectorSize = sizeof( cl_double ) * sizeValues[j]; - size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; // bufferSize / vectorSize rounded up - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); 
goto exit; } - - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + size_t vectorSize = sizeof(cl_double) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), + &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2), + &gInBuffer2))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(gQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); - //Calculate the correctly rounded reference result + // Calculate the correctly rounded reference result double *s = (double *)gIn; double *s2 = (double *)gIn2; #if defined PARALLEL_REFERENCE - if (threadCount > 1) { - ComputeReferenceInfoD cri; - cri.x = s; - cri.y = s2; - cri.r = (double *)gOut_Ref; - cri.i = (int *)gOut_Ref2; - cri.f_ffpI = f->dfunc.f_ffpI; - cri.lim = bufferSize / sizeof( double ); - cri.count = (cri.lim + threadCount - 1) / threadCount; - ThreadPool_Do(ReferenceD, threadCount, &cri); - } else { + if (threadCount > 1) + { + ComputeReferenceInfoD cri; + cri.x = s; + cri.y = s2; + cri.r = (double *)gOut_Ref; + cri.i = (int *)gOut_Ref2; + cri.f_ffpI = f->dfunc.f_ffpI; + cri.lim = bufferSize / sizeof(double); + cri.count = (cri.lim + threadCount - 1) / threadCount; 
+ ThreadPool_Do(ReferenceD, threadCount, &cri); + } + else + { #endif double *r = (double *)gOut_Ref; int *r2 = (int *)gOut_Ref2; - for( j = 0; j < bufferSize / sizeof( double ); j++ ) - r[j] = (double) f->dfunc.f_ffpI( s[j], s2[j], r2+j ); + for (j = 0; j < bufferSize / sizeof(double); j++) + r[j] = (double)f->dfunc.f_ffpI(s[j], s2[j], r2 + j); #if defined PARALLEL_REFERENCE - } + } #endif // Read the data back - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) ) + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "ReadArray failed %d\n", error ); + vlog_error("ReadArray failed %d\n", error); goto exit; } - if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) ) + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) { - vlog_error( "ReadArray2 failed %d\n", error ); + vlog_error("ReadArray2 failed %d\n", error); goto exit; } } - if( gSkipCorrectnessTesting ) - break; + if (gSkipCorrectnessTesting) break; - //Verify data + // Verify data uint64_t *t = (uint64_t *)gOut_Ref; int32_t *t2 = (int32_t *)gOut_Ref2; - for( j = 0; j < bufferSize / sizeof( double ); j++ ) + for (j = 0; j < bufferSize / sizeof(double); j++) { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { uint64_t *q = (uint64_t *)gOut[k]; int32_t *q2 = (int32_t *)gOut2[k]; - // Check for exact match to correctly rounded result - if (t[j] == q[j] && t2[j] == q2[j]) - continue; + // Check for exact match to correctly rounded result + if (t[j] == q[j] && t2[j] == q2[j]) continue; - // Check for paired NaNs - if ((t[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL && - 
(q[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL && - t2[j] == q2[j]) - continue; + // Check for paired NaNs + if ((t[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL + && (q[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL + && t2[j] == q2[j]) + continue; // if( t[j] != q[j] || t2[j] != q2[j] ) { - double test = ((double*) q)[j]; + double test = ((double *)q)[j]; int correct2 = INT_MIN; - long double correct = f->dfunc.f_ffpI( s[j], s2[j], &correct2 ); - float err = Bruteforce_Ulp_Error_Double( test, correct ); + long double correct = + f->dfunc.f_ffpI(s[j], s2[j], &correct2); + float err = Bruteforce_Ulp_Error_Double(test, correct); int64_t iErr; - // in case of remquo, we only care about the sign and last seven bits of - // integer as per the spec. - if(testingRemquo) - iErr = (long long) (q2[j] & 0x0000007f) - (long long) (correct2 & 0x0000007f); + // in case of remquo, we only care about the sign and last + // seven bits of integer as per the spec. + if (testingRemquo) + iErr = (long long)(q2[j] & 0x0000007f) + - (long long)(correct2 & 0x0000007f); else - iErr = (long long) q2[j] - (long long) correct2; + iErr = (long long)q2[j] - (long long)correct2; - //For remquo, if y = 0, x is infinite, or either is NaN then the standard either neglects - //to say what is returned in iptr or leaves it undefined or implementation defined. - int iptrUndefined = fabs(((double*) gIn)[j]) == INFINITY || - ((double*) gIn2)[j] == 0.0 || - isnan(((double*) gIn2)[j]) || - isnan(((double*) gIn)[j]); - if(iptrUndefined) - iErr = 0; + // For remquo, if y = 0, x is infinite, or either is NaN + // then the standard either neglects to say what is returned + // in iptr or leaves it undefined or implementation defined. + int iptrUndefined = fabs(((double *)gIn)[j]) == INFINITY + || ((double *)gIn2)[j] == 0.0 + || isnan(((double *)gIn2)[j]) + || isnan(((double *)gIn)[j]); + if (iptrUndefined) iErr = 0; - int fail = ! 
(fabsf(err) <= f->double_ulps && iErr == 0 ); - if( ftz && fail ) + int fail = !(fabsf(err) <= f->double_ulps && iErr == 0); + if (ftz && fail) { // retry per section 6.5.3.2 - if( IsDoubleResultSubnormal(correct, f->double_ulps ) ) + if (IsDoubleResultSubnormal(correct, f->double_ulps)) { - fail = fail && ! ( test == 0.0f && iErr == 0 ); - if( ! fail ) - err = 0.0f; + fail = fail && !(test == 0.0f && iErr == 0); + if (!fail) err = 0.0f; } // retry per section 6.5.3.3 - if( IsDoubleSubnormal( s[j] ) ) + if (IsDoubleSubnormal(s[j])) { int correct3i, correct4i; - long double correct3 = f->dfunc.f_ffpI( 0.0, s2[j], &correct3i ); - long double correct4 = f->dfunc.f_ffpI( -0.0, s2[j], &correct4i ); - float err2 = Bruteforce_Ulp_Error_Double( test, correct3 ); - float err3 = Bruteforce_Ulp_Error_Double( test, correct4 ); - int64_t iErr3 = (long long) q2[j] - (long long) correct3i; - int64_t iErr4 = (long long) q2[j] - (long long) correct4i; - fail = fail && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0)) && (!(fabsf(err3) <= f->double_ulps && iErr4 == 0))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( llabs(iErr3) < llabs( iErr ) ) - iErr = iErr3; - if( llabs(iErr4) < llabs( iErr ) ) - iErr = iErr4; + long double correct3 = + f->dfunc.f_ffpI(0.0, s2[j], &correct3i); + long double correct4 = + f->dfunc.f_ffpI(-0.0, s2[j], &correct4i); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct3); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct4); + int64_t iErr3 = + (long long)q2[j] - (long long)correct3i; + int64_t iErr4 = + (long long)q2[j] - (long long)correct4i; + fail = fail + && ((!(fabsf(err2) <= f->double_ulps + && iErr3 == 0)) + && (!(fabsf(err3) <= f->double_ulps + && iErr4 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; // retry per section 
6.5.3.4 - if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) ) + if (IsDoubleResultSubnormal(correct2, + f->double_ulps) + || IsDoubleResultSubnormal(correct3, + f->double_ulps)) { - fail = fail && ! ( test == 0.0f && (iErr3 == 0 || iErr4 == 0) ); - if( ! fail ) - err = 0.0f; + fail = fail + && !(test == 0.0f + && (iErr3 == 0 || iErr4 == 0)); + if (!fail) err = 0.0f; } - //try with both args as zero - if( IsDoubleSubnormal( s2[j] ) ) + // try with both args as zero + if (IsDoubleSubnormal(s2[j])) { int correct7i, correct8i; - correct3 = f->dfunc.f_ffpI( 0.0, 0.0, &correct3i ); - correct4 = f->dfunc.f_ffpI( -0.0, 0.0, &correct4i ); - long double correct7 = f->dfunc.f_ffpI( 0.0, -0.0, &correct7i ); - long double correct8 = f->dfunc.f_ffpI( -0.0, -0.0, &correct8i ); - err2 = Bruteforce_Ulp_Error_Double( test, correct3 ); - err3 = Bruteforce_Ulp_Error_Double( test, correct4 ); - float err4 = Bruteforce_Ulp_Error_Double( test, correct7 ); - float err5 = Bruteforce_Ulp_Error_Double( test, correct8 ); - iErr3 = (long long) q2[j] - (long long) correct3i; - iErr4 = (long long) q2[j] - (long long) correct4i; - int64_t iErr7 = (long long) q2[j] - (long long) correct7i; - int64_t iErr8 = (long long) q2[j] - (long long) correct8i; - fail = fail && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0)) && (!(fabsf(err3) <= f->double_ulps && iErr4 == 0)) && - (!(fabsf(err4) <= f->double_ulps && iErr7 == 0)) && (!(fabsf(err5) <= f->double_ulps && iErr8 == 0))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( fabsf( err4 ) < fabsf(err ) ) - err = err4; - if( fabsf( err5 ) < fabsf(err ) ) - err = err5; - if( llabs(iErr3) < llabs( iErr ) ) - iErr = iErr3; - if( llabs(iErr4) < llabs( iErr ) ) - iErr = iErr4; - if( llabs(iErr7) < llabs( iErr ) ) - iErr = iErr7; - if( llabs(iErr8) < llabs( iErr ) ) - iErr = iErr8; + correct3 = + f->dfunc.f_ffpI(0.0, 0.0, &correct3i); + 
correct4 = + f->dfunc.f_ffpI(-0.0, 0.0, &correct4i); + long double correct7 = + f->dfunc.f_ffpI(0.0, -0.0, &correct7i); + long double correct8 = + f->dfunc.f_ffpI(-0.0, -0.0, &correct8i); + err2 = + Bruteforce_Ulp_Error_Double(test, correct3); + err3 = + Bruteforce_Ulp_Error_Double(test, correct4); + float err4 = + Bruteforce_Ulp_Error_Double(test, correct7); + float err5 = + Bruteforce_Ulp_Error_Double(test, correct8); + iErr3 = (long long)q2[j] - (long long)correct3i; + iErr4 = (long long)q2[j] - (long long)correct4i; + int64_t iErr7 = + (long long)q2[j] - (long long)correct7i; + int64_t iErr8 = + (long long)q2[j] - (long long)correct8i; + fail = fail + && ((!(fabsf(err2) <= f->double_ulps + && iErr3 == 0)) + && (!(fabsf(err3) <= f->double_ulps + && iErr4 == 0)) + && (!(fabsf(err4) <= f->double_ulps + && iErr7 == 0)) + && (!(fabsf(err5) <= f->double_ulps + && iErr8 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; + if (llabs(iErr7) < llabs(iErr)) iErr = iErr7; + if (llabs(iErr8) < llabs(iErr)) iErr = iErr8; // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correct3, f->double_ulps ) || IsDoubleResultSubnormal( correct4, f->double_ulps ) || - IsDoubleResultSubnormal( correct7, f->double_ulps ) || IsDoubleResultSubnormal( correct8, f->double_ulps ) ) + if (IsDoubleResultSubnormal(correct3, + f->double_ulps) + || IsDoubleResultSubnormal(correct4, + f->double_ulps) + || IsDoubleResultSubnormal(correct7, + f->double_ulps) + || IsDoubleResultSubnormal(correct8, + f->double_ulps)) { - fail = fail && ! ( test == 0.0f && (iErr3 == 0 || iErr4 == 0 || iErr7 == 0 || iErr8 == 0)); - if( ! 
fail ) - err = 0.0f; + fail = fail + && !(test == 0.0f + && (iErr3 == 0 || iErr4 == 0 + || iErr7 == 0 || iErr8 == 0)); + if (!fail) err = 0.0f; } } } - else if( IsDoubleSubnormal( s2[j] ) ) + else if (IsDoubleSubnormal(s2[j])) { int correct3i, correct4i; - long double correct3 = f->dfunc.f_ffpI( s[j], 0.0, &correct3i ); - long double correct4 = f->dfunc.f_ffpI( s[j], -0.0, &correct4i ); - float err2 = Bruteforce_Ulp_Error_Double( test, correct3 ); - float err3 = Bruteforce_Ulp_Error_Double( test, correct4 ); - int64_t iErr3 = (long long) q2[j] - (long long) correct3i; - int64_t iErr4 = (long long) q2[j] - (long long) correct4i; - fail = fail && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0)) && (!(fabsf(err3) <= f->double_ulps && iErr4 == 0))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( llabs(iErr3) < llabs( iErr ) ) - iErr = iErr3; - if( llabs(iErr4) < llabs( iErr ) ) - iErr = iErr4; + long double correct3 = + f->dfunc.f_ffpI(s[j], 0.0, &correct3i); + long double correct4 = + f->dfunc.f_ffpI(s[j], -0.0, &correct4i); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct3); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct4); + int64_t iErr3 = + (long long)q2[j] - (long long)correct3i; + int64_t iErr4 = + (long long)q2[j] - (long long)correct4i; + fail = fail + && ((!(fabsf(err2) <= f->double_ulps + && iErr3 == 0)) + && (!(fabsf(err3) <= f->double_ulps + && iErr4 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) ) + if (IsDoubleResultSubnormal(correct2, + f->double_ulps) + || IsDoubleResultSubnormal(correct3, + f->double_ulps)) { - fail = fail && ! ( test == 0.0f && (iErr3 == 0 || iErr4 == 0) ); - if( ! 
fail ) - err = 0.0f; + fail = fail + && !(test == 0.0f + && (iErr3 == 0 || iErr4 == 0)); + if (!fail) err = 0.0f; } } } - if( fabsf(err ) > maxError ) + if (fabsf(err) > maxError) { maxError = fabsf(err); maxErrorVal = s[j]; } - if( llabs(iErr) > maxError2 ) + if (llabs(iErr) > maxError2) { - maxError2 = llabs(iErr ); + maxError2 = llabs(iErr); maxErrorVal2 = s[j]; } - if( fail ) + if (fail) { - vlog_error( "\nERROR: %sD%s: {%f, %lld} ulp error at {%.13la, %.13la} ({ 0x%16.16llx, 0x%16.16llx}): *{%.13la, %d} ({ 0x%16.16llx, 0x%8.8x}) vs. {%.13la, %d} ({ 0x%16.16llx, 0x%8.8x})\n", - f->name, sizeNames[k], err, iErr, - ((double*) gIn)[j], ((double*) gIn2)[j], - ((cl_ulong*) gIn)[j], ((cl_ulong*) gIn2)[j], - ((double*) gOut_Ref)[j], ((int*) gOut_Ref2)[j], - ((cl_ulong*) gOut_Ref)[j], ((cl_uint*) gOut_Ref2)[j], - test, q2[j], - ((cl_ulong*) q)[j], ((cl_uint*) q2)[j]); - error = -1; - goto exit; + vlog_error( + "\nERROR: %sD%s: {%f, %lld} ulp error at {%.13la, " + "%.13la} ({ 0x%16.16llx, 0x%16.16llx}): *{%.13la, " + "%d} ({ 0x%16.16llx, 0x%8.8x}) vs. {%.13la, %d} ({ " + "0x%16.16llx, 0x%8.8x})\n", + f->name, sizeNames[k], err, iErr, + ((double *)gIn)[j], ((double *)gIn2)[j], + ((cl_ulong *)gIn)[j], ((cl_ulong *)gIn2)[j], + ((double *)gOut_Ref)[j], ((int *)gOut_Ref2)[j], + ((cl_ulong *)gOut_Ref)[j], + ((cl_uint *)gOut_Ref2)[j], test, q2[j], + ((cl_ulong *)q)[j], ((cl_uint *)q2)[j]); + error = -1; + goto exit; } } } } - if( 0 == (i & 0x0fffffff) ) + if (0 == (i & 0x0fffffff)) { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize); - } else - { - vlog("." ); - } + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, + bufferSize); + } + else + { + vlog("."); + } - fflush(stdout); + fflush(stdout); } } - if( ! 
gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input array + // Init input array double *p = (double *)gIn; - for( j = 0; j < bufferSize / sizeof( double ); j++ ) - p[j] = DoubleFromUInt32( genrand_int32(d) ); - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0, bufferSize, gIn, 0, NULL, NULL) )) + for (j = 0; j < bufferSize / sizeof(double); j++) + p[j] = DoubleFromUInt32(genrand_int32(d)); + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0, bufferSize, gIn2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0, + bufferSize, gIn2, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - size_t vectorSize = sizeof( cl_double ) * sizeValues[j]; - size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; // bufferSize / vectorSize rounded up - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 3, sizeof( 
gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; } + size_t vectorSize = sizeof(cl_double) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), + &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2), + &gInBuffer2))) + { + LogBuildError(programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( k = 0; k < PERF_LOOP_COUNT; k++ ) + for (k = 0; k < PERF_LOOP_COUNT; k++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, 
sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(double)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", + f->name, sizeNames[j]); } - for( ; j < gMaxVectorSizeIndex; j++ ) - vlog( "\t -- " ); + for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); } - if( ! gSkipCorrectnessTesting ) - vlog( "\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal ); - vlog( "\n" ); + if (!gSkipCorrectnessTesting) + vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal); + vlog("\n"); exit: // Release - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); clReleaseProgram(programs[k]); @@ -1129,6 +1365,3 @@ exit: return error; } - - - diff --git a/test_conformance/math_brute_force/i_unary.cpp b/test_conformance/math_brute_force/i_unary.cpp index f6bd1223..7e207379 100644 --- a/test_conformance/math_brute_force/i_unary.cpp +++ b/test_conformance/math_brute_force/i_unary.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -33,60 +33,77 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p, bool relaxedMode) { - const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in)\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in[i] );\n" - "}\n" - }; - const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global float* in)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " float3 f0 = vload3( 0, in + 3 * i );\n" - " int3 i0 = ", name, "( f0 );\n" - " vstore3( i0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" - " float3 f0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (float3)( in[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " int3 i0 = ", name, "( f0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = i0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = i0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + const char *c[] = { "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global int", + sizeNames[vectorSize], + "* out, __global float", + sizeNames[vectorSize], + "* in)\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i] );\n" + "}\n" }; + const char *c3[] = { + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global int* out, __global float* in)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " float3 f0 = vload3( 0, in + 3 * i );\n" + " int3 i0 
= ", + name, + "( f0 );\n" + " vstore3( i0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two " + "buffer size \n" + " float3 f0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (float3)( in[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " int3 i0 = ", + name, + "( f0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = i0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = i0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } @@ -95,88 +112,109 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, cl_program *p, bool relaxedMode) { const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in)\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in[i] );\n" - "}\n" - }; - - const char *c3[] = {"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global double* in)\n" + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global int", + sizeNames[vectorSize], + "* out, __global double", + 
sizeNames[vectorSize], + "* in)\n" "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " double3 f0 = vload3( 0, in + 3 * i );\n" - " int3 i0 = ", name, "( f0 );\n" - " vstore3( i0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" - " double3 f0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (double3)( in[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " int3 i0 = ", name, "( f0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = i0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = i0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i] );\n" + "}\n" }; + + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global int* out, __global double* in)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " double3 f0 = vload3( 0, in + 3 * i );\n" + " int3 i0 = ", + name, + "( f0 );\n" + " vstore3( i0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " double3 f0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (double3)( in[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " int3 i0 = ", + name, + "( f0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = i0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = i0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } typedef struct BuildKernelInfo { - cl_uint offset; // the first vector size to build - cl_kernel *kernels; - cl_program *programs; - const char *nameInCode; + cl_uint offset; // the first vector size to build + cl_kernel *kernels; + cl_program *programs; + const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-}BuildKernelInfo; +} BuildKernelInfo; -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernel(info->nameInCode, i, info->kernels + i, info->programs + i, info->relaxedMode); } -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernelDouble(info->nameInCode, i, info->kernels + i, info->programs + i, info->relaxedMode); @@ -187,12 +225,12 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode) uint64_t i; uint32_t j, k; int error; - cl_program programs[ VECTOR_SIZE_COUNT ]; - cl_kernel kernels[ VECTOR_SIZE_COUNT ]; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; int ftz = f->ftz || 0 == (gFloatCapabilities & CL_FP_DENORM) || gForceFTZ; - size_t bufferSize = (gWimpyMode)?gWimpyBufferSize:BUFFER_SIZE; + size_t bufferSize = (gWimpyMode) ? 
gWimpyBufferSize : BUFFER_SIZE; uint64_t step = getTestStep(sizeof(float), bufferSize); - int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( float )) + 1); + int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1); logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); @@ -206,191 +244,226 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode) // Init the kernels BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_FloatFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) return error; -/* - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) - if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) ) - return error; -*/ + /* + for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, + programs + i) ) ) return error; + */ - for( i = 0; i < (1ULL<<32); i += step ) + for (i = 0; i < (1ULL << 32); i += step) { - //Init input array + // Init input array uint32_t *p = (uint32_t *)gIn; - if( gWimpyMode ) + if (gWimpyMode) { - for( j = 0; j < bufferSize / sizeof( float ); j++ ) - p[j] = (uint32_t) i + j * scale; + for (j = 0; j < bufferSize / sizeof(float); j++) + p[j] = (uint32_t)i + j * scale; } else { - for( j = 0; j < bufferSize / sizeof( float ); j++ ) - p[j] = (uint32_t) i + j; + for (j = 0; j < bufferSize / sizeof(float); j++) + p[j] = (uint32_t)i + j; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // 
write garbage into output arrays - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { uint32_t pattern = 0xffffdead; memset_pattern4(gOut[j], &pattern, bufferSize); - if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) )) + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); goto exit; } } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_float); size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } - - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(gQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); - //Calculate the correctly rounded reference result + // Calculate the correctly 
rounded reference result int *r = (int *)gOut_Ref; float *s = (float *)gIn; - for( j = 0; j < bufferSize / sizeof( float ); j++ ) - r[j] = f->func.i_f( s[j] ); + for (j = 0; j < bufferSize / sizeof(float); j++) + r[j] = f->func.i_f(s[j]); // Read the data back - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) ) + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "ReadArray failed %d\n", error ); + vlog_error("ReadArray failed %d\n", error); goto exit; } } - if( gSkipCorrectnessTesting ) - break; + if (gSkipCorrectnessTesting) break; - //Verify data + // Verify data uint32_t *t = (uint32_t *)gOut_Ref; - for( j = 0; j < bufferSize / sizeof( float ); j++ ) + for (j = 0; j < bufferSize / sizeof(float); j++) { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { uint32_t *q = (uint32_t *)(gOut[k]); // If we aren't getting the correctly rounded result - if( t[j] != q[j] ) + if (t[j] != q[j]) { - if( ftz && IsFloatSubnormal(s[j])) + if (ftz && IsFloatSubnormal(s[j])) { - unsigned int correct0 = f->func.i_f( 0.0 ); - unsigned int correct1 = f->func.i_f( -0.0 ); - if( q[j] == correct0 || q[j] == correct1 ) - continue; + unsigned int correct0 = f->func.i_f(0.0); + unsigned int correct1 = f->func.i_f(-0.0); + if (q[j] == correct0 || q[j] == correct1) continue; } uint32_t err = t[j] - q[j]; - if( q[j] > t[j] ) - err = q[j] - t[j]; - vlog_error( "\nERROR: %s%s: %d ulp error at %a (0x%8.8x): *%d vs. %d\n", f->name, sizeNames[k], err, ((float*) gIn)[j], ((cl_uint*) gIn)[j], t[j], q[j] ); - error = -1; - goto exit; + if (q[j] > t[j]) err = q[j] - t[j]; + vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%8.8x): " + "*%d vs. 
%d\n", + f->name, sizeNames[k], err, ((float *)gIn)[j], + ((cl_uint *)gIn)[j], t[j], q[j]); + error = -1; + goto exit; } } } - if( 0 == (i & 0x0fffffff) ) + if (0 == (i & 0x0fffffff)) { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize); - } else - { - vlog("." ); - } - fflush(stdout); + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, + bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); } } - if( ! gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input array + // Init input array uint32_t *p = (uint32_t *)gIn; - for( j = 0; j < bufferSize / sizeof( float ); j++ ) + for (j = 0; j < bufferSize / sizeof(float); j++) p[j] = genrand_int32(d); - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_float); size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = 
clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( k = 0; k < PERF_LOOP_COUNT; k++ ) + for (k = 0; k < PERF_LOOP_COUNT; k++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(float)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", + f->name, sizeNames[j]); } } - vlog( "\n" ); + vlog("\n"); exit: RestoreFPState(&oldMode); // Release - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); clReleaseProgram(programs[k]); @@ -404,12 +477,12 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode) uint64_t i; 
uint32_t j, k; int error; - cl_program programs[ VECTOR_SIZE_COUNT ]; - cl_kernel kernels[ VECTOR_SIZE_COUNT ]; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; int ftz = f->ftz || gForceFTZ; - size_t bufferSize = (gWimpyMode)?gWimpyBufferSize:BUFFER_SIZE; + size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE; uint64_t step = getTestStep(sizeof(cl_double), bufferSize); - int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( cl_double )) + 1); + int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(cl_double)) + 1); logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); @@ -423,200 +496,231 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode) // Init the kernels BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_DoubleFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_DoubleFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) { return error; } -/* - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) - if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) ) - return error; -*/ + /* + for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + + i, programs + i) ) ) return error; + */ - for( i = 0; i < (1ULL<<32); i += step ) + for (i = 0; i < (1ULL << 32); i += step) { - //Init input array + // Init input array double *p = (double *)gIn; - if( gWimpyMode ) + if (gWimpyMode) { - for( j = 0; j < bufferSize / sizeof( cl_double ); j++ ) - p[j] = DoubleFromUInt32( (uint32_t) i + j * scale ); + for (j = 0; j < bufferSize / sizeof(cl_double); j++) + p[j] = DoubleFromUInt32((uint32_t)i + j * scale); } else { - for( j = 0; j < bufferSize / sizeof( cl_double ); j++ ) - p[j] = DoubleFromUInt32( (uint32_t) i + j ); + for (j = 0; j < 
bufferSize / sizeof(cl_double); j++) + p[j] = DoubleFromUInt32((uint32_t)i + j); } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // write garbage into output arrays - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { uint32_t pattern = 0xffffdead; memset_pattern4(gOut[j], &pattern, bufferSize); - if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) )) + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); goto exit; } } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_double); size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } - - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { 
+ LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(gQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); - //Calculate the correctly rounded reference result + // Calculate the correctly rounded reference result int *r = (int *)gOut_Ref; double *s = (double *)gIn; - for( j = 0; j < bufferSize / sizeof( cl_double ); j++ ) - r[j] = f->dfunc.i_f( s[j] ); + for (j = 0; j < bufferSize / sizeof(cl_double); j++) + r[j] = f->dfunc.i_f(s[j]); // Read the data back - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) ) + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "ReadArray failed %d\n", error ); + vlog_error("ReadArray failed %d\n", error); goto exit; } } - if( gSkipCorrectnessTesting ) - break; + if (gSkipCorrectnessTesting) break; - //Verify data + // Verify data uint32_t *t = (uint32_t *)gOut_Ref; - for( j = 0; j < bufferSize / sizeof( cl_double ); j++ ) + for (j = 0; j < bufferSize / sizeof(cl_double); j++) { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { uint32_t *q = (uint32_t *)(gOut[k]); // If we aren't getting the correctly rounded result - if( t[j] != q[j] ) + if (t[j] != q[j]) { - if( ftz && IsDoubleSubnormal(s[j])) + if (ftz && IsDoubleSubnormal(s[j])) { - unsigned int correct0 = f->dfunc.i_f( 0.0 ); - unsigned int correct1 = f->dfunc.i_f( -0.0 ); - if( q[j] == correct0 || q[j] == correct1 ) - continue; + unsigned int correct0 = f->dfunc.i_f(0.0); + unsigned int 
correct1 = f->dfunc.i_f(-0.0); + if (q[j] == correct0 || q[j] == correct1) continue; } uint32_t err = t[j] - q[j]; - if( q[j] > t[j] ) - err = q[j] - t[j]; - vlog_error( "\nERROR: %sD%s: %d ulp error at %.13la: *%d vs. %d\n", f->name, sizeNames[k], err, ((double*) gIn)[j], t[j], q[j] ); - error = -1; - goto exit; + if (q[j] > t[j]) err = q[j] - t[j]; + vlog_error( + "\nERROR: %sD%s: %d ulp error at %.13la: *%d vs. %d\n", + f->name, sizeNames[k], err, ((double *)gIn)[j], t[j], + q[j]); + error = -1; + goto exit; } } } - if( 0 == (i & 0x0fffffff) ) + if (0 == (i & 0x0fffffff)) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize); - } else - { - vlog("." ); + vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, + bufferSize); } - fflush(stdout); - + else + { + vlog("."); + } + fflush(stdout); } } - if( ! gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input array + // Init input array double *p = (double *)gIn; - for( j = 0; j < bufferSize / sizeof( cl_double ); j++ ) - p[j] = DoubleFromUInt32( genrand_int32(d) ); - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + for (j = 0; j < bufferSize / sizeof(cl_double); j++) + p[j] = DoubleFromUInt32(genrand_int32(d)); + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_double); size_t localCount = (bufferSize + vectorSize - 1) / 
vectorSize; - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( k = 0; k < PERF_LOOP_COUNT; k++ ) + for (k = 0; k < PERF_LOOP_COUNT; k++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(double)); + vlog_perf(clocksPerOp, 
LOWER_IS_BETTER, "clocks / element", "%sD%s", + f->name, sizeNames[j]); } - for( ; j < gMaxVectorSizeIndex; j++ ) - vlog( "\t -- " ); + for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); } - vlog( "\n" ); + vlog("\n"); exit: RestoreFPState(&oldMode); // Release - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); clReleaseProgram(programs[k]); @@ -624,4 +728,3 @@ exit: return error; } - diff --git a/test_conformance/math_brute_force/macro_binary.cpp b/test_conformance/math_brute_force/macro_binary.cpp index 1cde215c..52c4e96c 100644 --- a/test_conformance/math_brute_force/macro_binary.cpp +++ b/test_conformance/math_brute_force/macro_binary.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -24,7 +24,8 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata, bool relaxedMode); extern const vtbl _macro_binary = { "macro_binary", TestMacro_Int_Float_Float, TestMacro_Int_Double_Double }; -static int BuildKernel( const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p ); +static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, + cl_kernel *k, cl_program *p); static int BuildKernelDouble(const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p, bool relaxedMode); @@ -32,26 +33,42 @@ static int BuildKernelDouble(const char *name, int vectorSize, static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p, bool relaxedMode) { - const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in1, __global float", sizeNames[vectorSize], "* in2 
)\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in1[i], in2[i] );\n" - "}\n" - }; + const char *c[] = { "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global int", + sizeNames[vectorSize], + "* out, __global float", + sizeNames[vectorSize], + "* in1, __global float", + sizeNames[vectorSize], + "* in2 )\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i] );\n" + "}\n" }; - const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global float* in, __global float* in2)\n" + const char *c3[] = { + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global int* out, __global float* in, __global float* in2)\n" "{\n" " size_t i = get_global_id(0);\n" " if( i + 1 < get_global_size(0) )\n" " {\n" " float3 f0 = vload3( 0, in + 3 * i );\n" " float3 f1 = vload3( 0, in2 + 3 * i );\n" - " int3 i0 = ", name, "( f0, f1 );\n" + " int3 i0 = ", + name, + "( f0, f1 );\n" " vstore3( i0, 0, out + 3*i );\n" " }\n" " else\n" " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" " float3 f0, f1;\n" " switch( parity )\n" " {\n" @@ -64,7 +81,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n" " break;\n" " }\n" - " int3 i0 = ", name, "( f0, f1 );\n" + " int3 i0 = ", + name, + "( f0, f1 );\n" " switch( parity )\n" " {\n" " case 0:\n" @@ -80,16 +99,17 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, relaxedMode); @@ -101,27 +121,43 @@ static int BuildKernelDouble(const char *name, int vectorSize, bool relaxedMode) { const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global long", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in1, __global double", sizeNames[vectorSize], "* in2 )\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in1[i], in2[i] );\n" - "}\n" - }; + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global long", + sizeNames[vectorSize], + "* out, __global double", + sizeNames[vectorSize], + "* in1, __global double", + sizeNames[vectorSize], + "* in2 )\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i] );\n" + "}\n" }; - const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global long* out, 
__global double* in, __global double* in2)\n" + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global long* out, __global double* in, __global double* in2)\n" "{\n" " size_t i = get_global_id(0);\n" " if( i + 1 < get_global_size(0) )\n" " {\n" " double3 f0 = vload3( 0, in + 3 * i );\n" " double3 f1 = vload3( 0, in2 + 3 * i );\n" - " long3 l0 = ", name, "( f0, f1 );\n" + " long3 l0 = ", + name, + "( f0, f1 );\n" " vstore3( l0, 0, out + 3*i );\n" " }\n" " else\n" " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two " + "buffer size \n" " double3 f0, f1;\n" " switch( parity )\n" " {\n" @@ -134,7 +170,9 @@ static int BuildKernelDouble(const char *name, int vectorSize, " f1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n" " break;\n" " }\n" - " long3 l0 = ", name, "( f0, f1 );\n" + " long3 l0 = ", + name, + "( f0, f1 );\n" " switch( parity )\n" " {\n" " case 0:\n" @@ -149,17 +187,18 @@ static int BuildKernelDouble(const char *name, int vectorSize, }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, relaxedMode); @@ -167,27 +206,31 @@ static int BuildKernelDouble(const char *name, int vectorSize, typedef struct BuildKernelInfo { - cl_uint offset; // the first 
vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *nameInCode; + cl_uint offset; // the first vector size to build + cl_uint kernel_count; + cl_kernel **kernels; + cl_program *programs; + const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. -}BuildKernelInfo; +} BuildKernelInfo; -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernel(info->nameInCode, i, info->kernel_count, info->kernels[i], info->programs + i, info->relaxedMode); } -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernelDouble(info->nameInCode, i, info->kernel_count, info->kernels[i], info->programs + i, @@ -197,72 +240,165 @@ static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, vo // A table of more difficult cases to get right static const float specialValuesFloat[] = { - -NAN, -INFINITY, -FLT_MAX, MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39), MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), 
MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38), - MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), -1000.f, -100.f, -4.0f, -3.5f, - -3.0f, MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), -2.5f, MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), -2.0f, MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), -1.5f, MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), -1.0f, MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25), - MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), -0.5f, MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26), MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), -0.25f, MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27), - MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), -FLT_MIN, MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150), - MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), -0.0f, + -NAN, + -INFINITY, + -FLT_MAX, + MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), + MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), + MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39), + MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), + MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), + MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38), + MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), + MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), + 
MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), + MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), + MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), + MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), + -1000.f, + -100.f, + -4.0f, + -3.5f, + -3.0f, + MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), + -2.5f, + MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), + -2.0f, + MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), + -1.5f, + MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24), + MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), + -1.0f, + MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25), + MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), + -0.5f, + MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26), + MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), + -0.25f, + MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27), + MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), + -FLT_MIN, + MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), + MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), + MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), + MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), + MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), + MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150), + MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), + MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), + MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), + MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), + -0.0f, - +NAN, +INFINITY, +FLT_MAX, MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38), - MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), 
MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), +1000.f, +100.f, +4.0f, +3.5f, - +3.0f, MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), 2.5f, MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),+2.0f, MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), 1.5f, MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), +1.0f, MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25), - MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), +0.5f, MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), +0.25f, MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27), - MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), +FLT_MIN, MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150), - MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), +0.0f + +NAN, + +INFINITY, + +FLT_MAX, + MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), + MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), + MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), + MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), + MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), + MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38), + MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), + MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), + MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), + MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), + MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), + MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), + +1000.f, + +100.f, + +4.0f, + +3.5f, + +3.0f, + MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), + 2.5f, + MAKE_HEX_FLOAT(+0x1.7ffffep1f, 
+0x17ffffeL, -23), + +2.0f, + MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), + 1.5f, + MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), + MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), + +1.0f, + MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25), + MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), + +0.5f, + MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), + MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), + +0.25f, + MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27), + MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), + +FLT_MIN, + MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), + MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), + MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), + MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), + MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), + MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150), + MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), + MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), + MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), + MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), + +0.0f }; -static const size_t specialValuesFloatCount = sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]); +static const size_t specialValuesFloatCount = + sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]); -//Thread specific data for a worker thread +// Thread specific data for a worker thread typedef struct ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem inBuf2; // input buffer for the thread - cl_mem outBuf[ VECTOR_SIZE_COUNT ]; // output buffers for the thread - MTdata d; - cl_command_queue tQueue; // per thread command queue to improve performance -}ThreadInfo; + cl_mem inBuf; // input buffer for the thread + cl_mem inBuf2; // input buffer for the thread + cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + MTdata d; + cl_command_queue tQueue; // per thread command queue to improve performance +} ThreadInfo; 
typedef struct TestInfo { - size_t subBufferSize; // Size of the sub-buffer in elements - const Func *f; // A pointer to the function info - cl_program programs[ VECTOR_SIZE_COUNT ]; // programs for various vector sizes - cl_kernel *k[VECTOR_SIZE_COUNT ]; // arrays of thread-specific kernels for each worker thread: k[vector_size][thread_id] - ThreadInfo *tinfo; // An array of thread specific information for each worker thread - cl_uint threadCount; // Number of worker threads - cl_uint jobCount; // Number of jobs - cl_uint step; // step between each chunk and the next. - cl_uint scale; // stride between individual test values - int ftz; // non-zero if running in flush to zero mode + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes + cl_kernel + *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each + // worker thread: k[vector_size][thread_id] + ThreadInfo * + tinfo; // An array of thread specific information for each worker thread + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. 
+ cl_uint scale; // stride between individual test values + int ftz; // non-zero if running in flush to zero mode -}TestInfo; +} TestInfo; -static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p ); +static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p); int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode) { - TestInfo test_info; - cl_int error; - size_t i, j; + TestInfo test_info; + cl_int error; + size_t i, j; logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); // Init test_info - memset( &test_info, 0, sizeof( test_info ) ); + memset(&test_info, 0, sizeof(test_info)); test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); test_info.scale = getTestScale(sizeof(cl_float)); if (gWimpyMode) { - test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.subBufferSize = gWimpyBufferSize + / (sizeof(cl_float) + * RoundUpToNextPowerOfTwo(test_info.threadCount)); } - test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale; + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; if (test_info.step / test_info.subBufferSize != test_info.scale) { - //there was overflow + // there was overflow test_info.jobCount = 1; } else @@ -271,58 +407,79 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode) } test_info.f = f; - test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); - // cl_kernels aren't thread safe, so we make one for each vector size for every thread - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + // cl_kernels aren't thread safe, so we make one for each vector size 
for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { - size_t array_size = test_info.threadCount * sizeof( cl_kernel ); - test_info.k[i] = (cl_kernel*)malloc( array_size ); - if( NULL == test_info.k[i] ) + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel *)malloc(array_size); + if (NULL == test_info.k[i]) { - vlog_error( "Error: Unable to allocate storage for kernels!\n" ); + vlog_error("Error: Unable to allocate storage for kernels!\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.k[i], 0, array_size ); + memset(test_info.k[i], 0, array_size); } - test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) ); - if( NULL == test_info.tinfo ) + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) { - vlog_error( "Error: Unable to allocate storage for thread specific data.\n" ); + vlog_error( + "Error: Unable to allocate storage for thread specific data.\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) ); - for( i = 0; i < test_info.threadCount; i++ ) + memset(test_info.tinfo, 0, + test_info.threadCount * sizeof(*test_info.tinfo)); + for (i = 0; i < test_info.threadCount; i++) { - cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) }; - test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if( error || NULL == test_info.tinfo[i].inBuf) + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_float), + test_info.subBufferSize * sizeof(cl_float) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) { - vlog_error( "Error: Unable 
to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); goto exit; } - test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if( error || NULL == test_info.tinfo[i].inBuf) + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); goto exit; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if( error || NULL == test_info.tinfo[i].outBuf[j] ) + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); goto exit; } } - test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error); - if( NULL == test_info.tinfo[i].tQueue || error ) + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) { - vlog_error( "clCreateCommandQueue failed. (%d)\n", error ); + vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); goto exit; } @@ -335,393 +492,446 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode) gMinVectorSizeIndex, test_info.threadCount, test_info.k, test_info.programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_FloatFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) goto exit; } // Run the kernels - if( !gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info ); + error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info); - if( error ) - goto exit; + if (error) goto exit; - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input arrays + // Init input arrays uint32_t *p = (uint32_t *)gIn; uint32_t *p2 = (uint32_t *)gIn2; - for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ ) + for (j = 0; j < BUFFER_SIZE / sizeof(float); j++) { p[j] = genrand_int32(d); p2[j] = genrand_int32(d); } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + BUFFER_SIZE, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE, gIn2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + BUFFER_SIZE, gIn2, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < 
gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_float); size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; - if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; } - if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; } - if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto exit; } + if ((error = clSetKernelArg(test_info.k[j][0], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 2, + sizeof(gInBuffer2), &gInBuffer2))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( i = 0; i < PERF_LOOP_COUNT; i++ ) + for (i = 0; i < PERF_LOOP_COUNT; i++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], + 1, NULL, &localCount, NULL, + 0, NULL, NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - 
bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (BUFFER_SIZE / sizeof(float)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", + f->name, sizeNames[j]); } } - vlog( "\n" ); + vlog("\n"); exit: - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { clReleaseProgram(test_info.programs[i]); - if( test_info.k[i] ) + if (test_info.k[i]) { - for( j = 0; j < test_info.threadCount; j++ ) + for (j = 0; j < test_info.threadCount; j++) clReleaseKernel(test_info.k[i][j]); - free( test_info.k[i] ); + free(test_info.k[i]); } } - if( test_info.tinfo ) + if (test_info.tinfo) { - for( i = 0; i < test_info.threadCount; i++ ) + for (i = 0; i < test_info.threadCount; i++) { free_mtdata(test_info.tinfo[i].d); clReleaseMemObject(test_info.tinfo[i].inBuf); clReleaseMemObject(test_info.tinfo[i].inBuf2); - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) clReleaseMemObject(test_info.tinfo[i].outBuf[j]); clReleaseCommandQueue(test_info.tinfo[i].tQueue); } - free( test_info.tinfo ); + free(test_info.tinfo); } return error; } -static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data ) +static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data) { - const TestInfo *job = (const TestInfo *) data; - size_t buffer_elements = job->subBufferSize; - size_t buffer_size = buffer_elements * sizeof( cl_float ); - cl_uint base = job_id * (cl_uint) 
job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; - fptr func = job->f->func; - int ftz = job->ftz; - MTdata d = tinfo->d; - cl_uint j, k; - cl_int error; - const char *name = job->f->name; - cl_int *t,*r; - cl_float *s,*s2; + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_float); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = job->tinfo + thread_id; + fptr func = job->f->func; + int ftz = job->ftz; + MTdata d = tinfo->d; + cl_uint j, k; + cl_int error; + const char *name = job->f->name; + cl_int *t, *r; + cl_float *s, *s2; // start the map of the output arrays - cl_event e[ VECTOR_SIZE_COUNT ]; - cl_int *out[ VECTOR_SIZE_COUNT ]; - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + cl_event e[VECTOR_SIZE_COUNT]; + cl_int *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error); - if( error || NULL == out[j]) + out[j] = (cl_int *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, + error); return error; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); - //Init input array + // Init input array cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements; cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements; j = 0; - int totalSpecialValueCount = specialValuesFloatCount * specialValuesFloatCount; + int totalSpecialValueCount = + specialValuesFloatCount * specialValuesFloatCount; int indx = (totalSpecialValueCount - 1) / buffer_elements; - if( job_id <= (cl_uint)indx ) + if (job_id <= (cl_uint)indx) { // test edge cases float *fp = (float *)p; float *fp2 = (float *)p2; uint32_t x, y; - x = (job_id * buffer_elements) % specialValuesFloatCount; - y = (job_id * buffer_elements) / specialValuesFloatCount; + x = (job_id * buffer_elements) % specialValuesFloatCount; + y = (job_id * buffer_elements) / specialValuesFloatCount; - for( ; j < buffer_elements; j++ ) + for (; j < buffer_elements; j++) { fp[j] = specialValuesFloat[x]; fp2[j] = specialValuesFloat[y]; - if( ++x >= specialValuesFloatCount ) + if (++x >= specialValuesFloatCount) { x = 0; y++; - if( y >= specialValuesFloatCount ) - break; + if (y >= specialValuesFloatCount) break; } } } - //Init any remaining values. - for( ; j < buffer_elements; j++ ) + // Init any remaining values. + for (; j < buffer_elements; j++) { p[j] = genrand_int32(d); p2[j] = genrand_int32(d); } - if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueWriteBuffer failed! 
err: %d\n", error); goto exit; } - if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size, p2, 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); goto exit; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - //Wait for the map to finish - if( (error = clWaitForEvents(1, e + j) )) + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) { - vlog_error( "Error: clWaitForEvents failed! err: %d\n", error ); + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); goto exit; } - if( (error = clReleaseEvent( e[j] ) )) + if ((error = clReleaseEvent(e[j]))) { - vlog_error( "Error: clReleaseEvent failed! err: %d\n", error ); + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); goto exit; } - // Fill the result buffer with garbage, so that old results don't carry over + // Fill the result buffer with garbage, so that old results don't carry + // over uint32_t pattern = 0xffffdead; memset_pattern4(out[j], &pattern, buffer_size); - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) )) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueMapBuffer failed! 
err: %d\n", error); goto exit; } // run the kernel - size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; - cl_kernel kernel = job->k[j][thread_id]; //each worker thread has its own copy of the cl_kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel cl_program program = job->programs[j]; - if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; } - if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; } - if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; } - - if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL))) + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 2 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); - if( gSkipCorrectnessTesting ) - return CL_SUCCESS; + if (gSkipCorrectnessTesting) return CL_SUCCESS; - //Calculate the correctly rounded reference result - r = (cl_int *)gOut_Ref + thread_id * buffer_elements; - s = (float *)gIn + thread_id * 
buffer_elements; - s2 = (float *)gIn2 + thread_id * buffer_elements; - for( j = 0; j < buffer_elements; j++ ) - r[j] = func.i_ff( s[j], s2[j] ); + // Calculate the correctly rounded reference result + r = (cl_int *)gOut_Ref + thread_id * buffer_elements; + s = (float *)gIn + thread_id * buffer_elements; + s2 = (float *)gIn2 + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) r[j] = func.i_ff(s[j], s2[j]); - // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue. - for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ ) + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) { - out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (cl_int *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); goto exit; } } // Wait for the last buffer - out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (cl_int *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, buffer_size, + 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, error); goto exit; } - //Verify data + // Verify data t = (cl_int *)r; - for( j = 0; j < buffer_elements; j++ ) + for (j = 0; j < buffer_elements; j++) { cl_int *q = out[0]; - if( gMinVectorSizeIndex == 0 && t[j] != q[j] ) + if (gMinVectorSizeIndex == 0 && t[j] != q[j]) { - if( ftz ) + if (ftz) { - if( IsFloatSubnormal( s[j]) ) + if (IsFloatSubnormal(s[j])) { - if( IsFloatSubnormal( s2[j] ) ) + if (IsFloatSubnormal(s2[j])) { - int correct = func.i_ff( 0.0f, 0.0f ); - int correct2 = func.i_ff( 0.0f, -0.0f ); - int correct3 = func.i_ff( -0.0f, 0.0f ); - int correct4 = func.i_ff( -0.0f, -0.0f ); + int correct = func.i_ff(0.0f, 0.0f); + int correct2 = func.i_ff(0.0f, -0.0f); + int correct3 = func.i_ff(-0.0f, 0.0f); + int correct4 = func.i_ff(-0.0f, -0.0f); - if( correct == q[j] || correct2 == q[j] || correct3 == q[j] || correct4 == q[j] ) + if (correct == q[j] || correct2 == q[j] + || correct3 == q[j] || correct4 == q[j]) continue; } else { - int correct = func.i_ff( 0.0f, s2[j] ); - int correct2 = func.i_ff( -0.0f, s2[j] ); - if( correct == q[j] || correct2 == q[j] ) - continue; + int correct = func.i_ff(0.0f, s2[j]); + int correct2 = func.i_ff(-0.0f, s2[j]); + if (correct == q[j] || correct2 == q[j]) continue; } } - else if( IsFloatSubnormal( s2[j] ) ) + else if (IsFloatSubnormal(s2[j])) { - int correct = func.i_ff( s[j], 0.0f ); - int correct2 = func.i_ff( s[j], -0.0f ); - if( correct == q[j] || correct2 == q[j] ) - continue; + int correct = func.i_ff(s[j], 0.0f); + int correct2 = func.i_ff(s[j], -0.0f); + if (correct == q[j] || correct2 == q[j]) continue; } - } uint32_t err = t[j] - q[j]; - if( q[j] > t[j] ) - err = q[j] - t[j]; - vlog_error( "\nERROR: %s: %d ulp error at {%a, %a}: *0x%8.8x vs. 0x%8.8x (index: %d)\n", name, err, ((float*) s)[j], ((float*) s2)[j], t[j], q[j], j ); + if (q[j] > t[j]) err = q[j] - t[j]; + vlog_error("\nERROR: %s: %d ulp error at {%a, %a}: *0x%8.8x vs. 
" + "0x%8.8x (index: %d)\n", + name, err, ((float *)s)[j], ((float *)s2)[j], t[j], q[j], + j); error = -1; goto exit; } - for( k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++ ) + for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++) { q = out[k]; // If we aren't getting the correctly rounded result - if( -t[j] != q[j] ) + if (-t[j] != q[j]) { - if( ftz ) + if (ftz) { - if( IsFloatSubnormal( s[j]) ) + if (IsFloatSubnormal(s[j])) { - if( IsFloatSubnormal( s2[j] ) ) + if (IsFloatSubnormal(s2[j])) { - int correct = -func.i_ff( 0.0f, 0.0f ); - int correct2 = -func.i_ff( 0.0f, -0.0f ); - int correct3 = -func.i_ff( -0.0f, 0.0f ); - int correct4 = -func.i_ff( -0.0f, -0.0f ); + int correct = -func.i_ff(0.0f, 0.0f); + int correct2 = -func.i_ff(0.0f, -0.0f); + int correct3 = -func.i_ff(-0.0f, 0.0f); + int correct4 = -func.i_ff(-0.0f, -0.0f); - if( correct == q[j] || correct2 == q[j] || correct3 == q[j] || correct4 == q[j] ) + if (correct == q[j] || correct2 == q[j] + || correct3 == q[j] || correct4 == q[j]) continue; } else { - int correct = -func.i_ff( 0.0f, s2[j] ); - int correct2 = -func.i_ff( -0.0f, s2[j] ); - if( correct == q[j] || correct2 == q[j] ) - continue; + int correct = -func.i_ff(0.0f, s2[j]); + int correct2 = -func.i_ff(-0.0f, s2[j]); + if (correct == q[j] || correct2 == q[j]) continue; } } - else if( IsFloatSubnormal( s2[j] ) ) + else if (IsFloatSubnormal(s2[j])) { - int correct = -func.i_ff( s[j], 0.0f ); - int correct2 = -func.i_ff( s[j], -0.0f ); - if( correct == q[j] || correct2 == q[j] ) - continue; + int correct = -func.i_ff(s[j], 0.0f); + int correct2 = -func.i_ff(s[j], -0.0f); + if (correct == q[j] || correct2 == q[j]) continue; } - } cl_uint err = -t[j] - q[j]; - if( q[j] > -t[j] ) - err = q[j] + t[j]; - vlog_error( "\nERROR: %s%s: %d ulp error at {%a, %a}: *0x%8.8x vs. 
0x%8.8x (index: %d)\n", name, sizeNames[k], err, ((float*) s)[j], ((float*) s2)[j], -t[j], q[j], j ); + if (q[j] > -t[j]) err = q[j] + t[j]; + vlog_error("\nERROR: %s%s: %d ulp error at {%a, %a}: *0x%8.8x " + "vs. 0x%8.8x (index: %d)\n", + name, sizeNames[k], err, ((float *)s)[j], + ((float *)s2)[j], -t[j], q[j], j); error = -1; goto exit; } } } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) ) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error ); + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); return error; } } - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 3 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); - if( 0 == ( base & 0x0fffffff) ) + if (0 == (base & 0x0fffffff)) { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->threadCount); - } else - { - vlog("." 
); - } - fflush(stdout); + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); } exit: @@ -731,50 +941,146 @@ exit: // A table of more difficult cases to get right static const double specialValuesDouble[] = { - -NAN, -INFINITY, -DBL_MAX, MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11), MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10), - MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), -1000., -100., -4.0, -3.5, - -3.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), -2.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), -2.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), -1.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), -1.0, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53), - MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), -0.5, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54), MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), -0.25, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55), - MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), -DBL_MIN, MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), 
MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), -0.0, + -NAN, + -INFINITY, + -DBL_MAX, + MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), + MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11), + MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), + MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10), + MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), + MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), + MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), + MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), + -1000., + -100., + -4.0, + -3.5, + -3.0, + MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), + -2.5, + MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), + -2.0, + MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), + -1.5, + MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52), + 
MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), + -1.0, + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53), + MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), + -0.5, + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54), + MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), + -0.25, + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55), + MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), + -DBL_MIN, + MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), + MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), + MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), + MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), + MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), + -0.0, - +NAN, +INFINITY, +DBL_MAX, MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11), MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10), - MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8), MAKE_HEX_DOUBLE(+0x1.0p32, 
+0x1LL, 32), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21), MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22), +1000., +100., +4.0, +3.5, - +3.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), +2.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), +2.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), +1.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), +1.0, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53), - MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53), +0.5, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54), MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), +0.25, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55), - MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), +DBL_MIN, MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), 
MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), +0.0, + +NAN, + +INFINITY, + +DBL_MAX, + MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), + MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11), + MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), + MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10), + MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8), + MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21), + MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21), + MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22), + +1000., + +100., + +4.0, + +3.5, + +3.0, + MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), + +2.5, + MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), + +2.0, + MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), + +1.5, + MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52), + MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), + +1.0, + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53), + MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53), + +0.5, + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54), + MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), + +0.25, + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55), + MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), + +DBL_MIN, + MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), + MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, 
+0x0000000000000cLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), + +0.0, }; -static size_t specialValuesDoubleCount = sizeof( specialValuesDouble ) / sizeof( specialValuesDouble[0] ); +static size_t specialValuesDoubleCount = + sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]); -static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *p ); +static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p); int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode) { - TestInfo test_info; - cl_int error; - size_t i, j; + TestInfo test_info; + cl_int error; + size_t i, j; logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); // Init test_info - memset( &test_info, 0, sizeof( test_info ) ); + memset(&test_info, 0, sizeof(test_info)); test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); test_info.scale = getTestScale(sizeof(cl_double)); if (gWimpyMode) { - test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.subBufferSize = gWimpyBufferSize + / (sizeof(cl_double) + * RoundUpToNextPowerOfTwo(test_info.threadCount)); } - 
test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale; + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; if (test_info.step / test_info.subBufferSize != test_info.scale) { - //there was overflow + // there was overflow test_info.jobCount = 1; } else @@ -785,58 +1091,79 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode) test_info.f = f; test_info.ftz = f->ftz || gForceFTZ; - // cl_kernels aren't thread safe, so we make one for each vector size for every thread - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { - size_t array_size = test_info.threadCount * sizeof( cl_kernel ); - test_info.k[i] = (cl_kernel*)malloc( array_size ); - if( NULL == test_info.k[i] ) + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel *)malloc(array_size); + if (NULL == test_info.k[i]) { - vlog_error( "Error: Unable to allocate storage for kernels!\n" ); + vlog_error("Error: Unable to allocate storage for kernels!\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.k[i], 0, array_size ); + memset(test_info.k[i], 0, array_size); } - test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) ); - if( NULL == test_info.tinfo ) + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) { - vlog_error( "Error: Unable to allocate storage for thread specific data.\n" ); + vlog_error( + "Error: Unable to allocate storage for thread specific data.\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) ); - for( i = 0; i < test_info.threadCount; i++ ) + memset(test_info.tinfo, 0, + test_info.threadCount * sizeof(*test_info.tinfo)); + for (i = 0; i < 
test_info.threadCount; i++) { - cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) }; - test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if( error || NULL == test_info.tinfo[i].inBuf) + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_double), + test_info.subBufferSize * sizeof(cl_double) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); goto exit; } - test_info.tinfo[i].inBuf2 = clCreateSubBuffer( gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if( error || NULL == test_info.tinfo[i].inBuf) + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); goto exit; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - /* Qualcomm fix: 9461 read-write flags must be compatible with parent buffer */ - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + /* Qualcomm fix: 9461 read-write flags must be compatible with + * parent buffer */ + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], 
CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); /* Qualcomm fix: end */ - if( error || NULL == test_info.tinfo[i].outBuf[j] ) + if (error || NULL == test_info.tinfo[i].outBuf[j]) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); goto exit; } } - test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error); - if( NULL == test_info.tinfo[i].tQueue || error ) + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) { - vlog_error( "clCreateCommandQueue failed. (%d)\n", error ); + vlog_error("clCreateCommandQueue failed. (%d)\n", error); goto exit; } @@ -850,402 +1177,455 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode) gMinVectorSizeIndex, test_info.threadCount, test_info.k, test_info.programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_DoubleFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) goto exit; } - if( !gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info ); + error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info); - if( error ) - goto exit; + if (error) goto exit; - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input arrays + // Init input arrays uint64_t *p = (uint64_t *)gIn; uint64_t *p2 = (uint64_t *)gIn2; - for( j = 0; j < BUFFER_SIZE / sizeof( double ); j++ ) + for (j = 0; j < BUFFER_SIZE / sizeof(double); j++) { - p[j] = (cl_ulong) genrand_int32(d) | 
((cl_ulong) genrand_int32(d) << 32); - p2[j] = (cl_ulong) genrand_int32(d) | ((cl_ulong) genrand_int32(d) << 32); + p[j] = + (cl_ulong)genrand_int32(d) | ((cl_ulong)genrand_int32(d) << 32); + p2[j] = + (cl_ulong)genrand_int32(d) | ((cl_ulong)genrand_int32(d) << 32); } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + BUFFER_SIZE, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, BUFFER_SIZE, gIn2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + BUFFER_SIZE, gIn2, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_double); size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; - if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; } - if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; } - if( ( error = clSetKernelArg( test_info.k[j][0], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(test_info.programs[j]); goto exit; } + if ((error = clSetKernelArg(test_info.k[j][0], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), + &gInBuffer))) + { + 
LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 2, + sizeof(gInBuffer2), &gInBuffer2))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( i = 0; i < PERF_LOOP_COUNT; i++ ) + for (i = 0; i < PERF_LOOP_COUNT; i++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], + 1, NULL, &localCount, NULL, + 0, NULL, NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (BUFFER_SIZE / sizeof(double)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", + f->name, sizeNames[j]); } - for( ; j < gMaxVectorSizeIndex; j++ ) - vlog( "\t -- " ); + for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); } - vlog( "\n" ); + vlog("\n"); exit: // Release - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + for (i = 
gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { clReleaseProgram(test_info.programs[i]); - if( test_info.k[i] ) + if (test_info.k[i]) { - for( j = 0; j < test_info.threadCount; j++ ) + for (j = 0; j < test_info.threadCount; j++) clReleaseKernel(test_info.k[i][j]); - free( test_info.k[i] ); + free(test_info.k[i]); } } - if( test_info.tinfo ) + if (test_info.tinfo) { - for( i = 0; i < test_info.threadCount; i++ ) + for (i = 0; i < test_info.threadCount; i++) { free_mtdata(test_info.tinfo[i].d); clReleaseMemObject(test_info.tinfo[i].inBuf); clReleaseMemObject(test_info.tinfo[i].inBuf2); - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) clReleaseMemObject(test_info.tinfo[i].outBuf[j]); clReleaseCommandQueue(test_info.tinfo[i].tQueue); } - free( test_info.tinfo ); + free(test_info.tinfo); } return error; } -static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data ) +static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data) { - const TestInfo *job = (const TestInfo *) data; - size_t buffer_elements = job->subBufferSize; - size_t buffer_size = buffer_elements * sizeof( cl_double ); - cl_uint base = job_id * (cl_uint) job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; - dptr dfunc = job->f->dfunc; - int ftz = job->ftz; - MTdata d = tinfo->d; - cl_uint j, k; - cl_int error; - const char *name = job->f->name; - cl_long *t,*r; - cl_double *s,*s2; + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_double); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = job->tinfo + thread_id; + dptr dfunc = job->f->dfunc; + int ftz = job->ftz; + MTdata d = tinfo->d; + cl_uint j, k; + cl_int error; + const char *name = job->f->name; + cl_long *t, *r; + cl_double *s, *s2; Force64BitFPUPrecision(); // start the map of the output arrays - cl_event e[ VECTOR_SIZE_COUNT ]; 
- cl_long *out[ VECTOR_SIZE_COUNT ]; - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + cl_event e[VECTOR_SIZE_COUNT]; + cl_long *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error); - if( error || NULL == out[j]) + out[j] = (cl_long *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); return error; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); - //Init input array + // Init input array double *p = (double *)gIn + thread_id * buffer_elements; double *p2 = (double *)gIn2 + thread_id * buffer_elements; j = 0; - int totalSpecialValueCount = specialValuesDoubleCount * specialValuesDoubleCount; + int totalSpecialValueCount = + specialValuesDoubleCount * specialValuesDoubleCount; int indx = (totalSpecialValueCount - 1) / buffer_elements; - if( job_id <= (cl_uint)indx ) + if (job_id <= (cl_uint)indx) { // test edge cases uint32_t x, y; - x = (job_id * buffer_elements) % specialValuesDoubleCount; - y = (job_id * buffer_elements) / specialValuesDoubleCount; + x = (job_id * buffer_elements) % specialValuesDoubleCount; + y = (job_id * buffer_elements) / specialValuesDoubleCount; - for( ; j < buffer_elements; j++ ) + for (; j < buffer_elements; j++) { p[j] = specialValuesDouble[x]; p2[j] = specialValuesDouble[y]; - if( ++x >= specialValuesDoubleCount ) + if (++x >= specialValuesDoubleCount) { x = 0; y++; - if( y >= specialValuesDoubleCount ) - break; + if (y >= specialValuesDoubleCount) break; } } } - //Init any remaining 
values. - for( ; j < buffer_elements; j++ ) + // Init any remaining values. + for (; j < buffer_elements; j++) { - ((cl_ulong*)p)[j] = genrand_int64(d); - ((cl_ulong*)p2)[j] = genrand_int64(d); + ((cl_ulong *)p)[j] = genrand_int64(d); + ((cl_ulong *)p2)[j] = genrand_int64(d); } - if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); goto exit; } - if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size, p2, 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); goto exit; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - //Wait for the map to finish - if( (error = clWaitForEvents(1, e + j) )) + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) { - vlog_error( "Error: clWaitForEvents failed! err: %d\n", error ); + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); goto exit; } - if( (error = clReleaseEvent( e[j] ) )) + if ((error = clReleaseEvent(e[j]))) { - vlog_error( "Error: clReleaseEvent failed! err: %d\n", error ); + vlog_error("Error: clReleaseEvent failed! 
err: %d\n", error); goto exit; } - // Fill the result buffer with garbage, so that old results don't carry over + // Fill the result buffer with garbage, so that old results don't carry + // over uint32_t pattern = 0xffffdead; memset_pattern4(out[j], &pattern, buffer_size); - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) )) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); goto exit; } // run the kernel - size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; - cl_kernel kernel = job->k[j][thread_id]; //each worker thread has its own copy of the cl_kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel cl_program program = job->programs[j]; - if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; } - if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; } - if( ( error = clSetKernelArg( kernel, 2, sizeof( tinfo->inBuf2 ), &tinfo->inBuf2 ) )) { LogBuildError(program); return error; } - - if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL))) + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; 
+ } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 2 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); - if( gSkipCorrectnessTesting ) - return CL_SUCCESS; + if (gSkipCorrectnessTesting) return CL_SUCCESS; - //Calculate the correctly rounded reference result - r = (cl_long *)gOut_Ref + thread_id * buffer_elements; - s = (cl_double *)gIn + thread_id * buffer_elements; - s2 = (cl_double *)gIn2 + thread_id * buffer_elements; - for( j = 0; j < buffer_elements; j++ ) - r[j] = dfunc.i_ff( s[j], s2[j] ); + // Calculate the correctly rounded reference result + r = (cl_long *)gOut_Ref + thread_id * buffer_elements; + s = (cl_double *)gIn + thread_id * buffer_elements; + s2 = (cl_double *)gIn2 + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) r[j] = dfunc.i_ff(s[j], s2[j]); - // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue. - for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ ) + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) { - out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (cl_long *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, + error); goto exit; } } // Wait for the last buffer - out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (cl_long *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, buffer_size, + 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); goto exit; } - //Verify data + // Verify data t = (cl_long *)r; - for( j = 0; j < buffer_elements; j++ ) + for (j = 0; j < buffer_elements; j++) { - cl_long *q = (cl_long *) out[0]; + cl_long *q = (cl_long *)out[0]; // If we aren't getting the correctly rounded result - if( gMinVectorSizeIndex == 0 && t[j] != q[j] ) + if (gMinVectorSizeIndex == 0 && t[j] != q[j]) { - if( ftz ) + if (ftz) { - if( IsDoubleSubnormal( s[j]) ) + if (IsDoubleSubnormal(s[j])) { - if( IsDoubleSubnormal( s2[j] ) ) + if (IsDoubleSubnormal(s2[j])) { - int64_t correct = dfunc.i_ff( 0.0f, 0.0f ); - int64_t correct2 = dfunc.i_ff( 0.0f, -0.0f ); - int64_t correct3 = dfunc.i_ff( -0.0f, 0.0f ); - int64_t correct4 = dfunc.i_ff( -0.0f, -0.0f ); + int64_t correct = dfunc.i_ff(0.0f, 0.0f); + int64_t correct2 = dfunc.i_ff(0.0f, -0.0f); + int64_t correct3 = dfunc.i_ff(-0.0f, 0.0f); + int64_t correct4 = dfunc.i_ff(-0.0f, -0.0f); - if( correct == q[j] || correct2 == q[j] || correct3 == q[j] || correct4 == q[j] ) + if (correct == q[j] || correct2 == q[j] + || correct3 == q[j] || correct4 == q[j]) continue; } else { - int64_t correct = dfunc.i_ff( 0.0f, s2[j] ); - int64_t correct2 = dfunc.i_ff( -0.0f, s2[j] ); - if( correct == q[j] || correct2 == q[j] ) - continue; + int64_t correct = dfunc.i_ff(0.0f, s2[j]); + int64_t correct2 = dfunc.i_ff(-0.0f, s2[j]); + if (correct == q[j] || correct2 == q[j]) continue; } } - else if( IsDoubleSubnormal( s2[j] ) ) + else 
if (IsDoubleSubnormal(s2[j])) { - int64_t correct = dfunc.i_ff( s[j], 0.0f ); - int64_t correct2 = dfunc.i_ff( s[j], -0.0f ); - if( correct == q[j] || correct2 == q[j] ) - continue; + int64_t correct = dfunc.i_ff(s[j], 0.0f); + int64_t correct2 = dfunc.i_ff(s[j], -0.0f); + if (correct == q[j] || correct2 == q[j]) continue; } - } uint64_t err = t[j] - q[j]; - if( q[j] > t[j] ) - err = q[j] - t[j]; - vlog_error( "\nERROR: %s: %lld ulp error at {%.13la, %.13la}: *%lld vs. %lld (index: %d)\n", name, err, ((double*) s)[j], ((double*) s2)[j], t[j], q[j], j ); + if (q[j] > t[j]) err = q[j] - t[j]; + vlog_error("\nERROR: %s: %lld ulp error at {%.13la, %.13la}: *%lld " + "vs. %lld (index: %d)\n", + name, err, ((double *)s)[j], ((double *)s2)[j], t[j], + q[j], j); error = -1; goto exit; } - for( k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++ ) + for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++) { - q = (cl_long*) out[k]; + q = (cl_long *)out[k]; // If we aren't getting the correctly rounded result - if( -t[j] != q[j] ) + if (-t[j] != q[j]) { - if( ftz ) + if (ftz) { - if( IsDoubleSubnormal( s[j]) ) + if (IsDoubleSubnormal(s[j])) { - if( IsDoubleSubnormal( s2[j] ) ) + if (IsDoubleSubnormal(s2[j])) { - int64_t correct = -dfunc.i_ff( 0.0f, 0.0f ); - int64_t correct2 = -dfunc.i_ff( 0.0f, -0.0f ); - int64_t correct3 = -dfunc.i_ff( -0.0f, 0.0f ); - int64_t correct4 = -dfunc.i_ff( -0.0f, -0.0f ); + int64_t correct = -dfunc.i_ff(0.0f, 0.0f); + int64_t correct2 = -dfunc.i_ff(0.0f, -0.0f); + int64_t correct3 = -dfunc.i_ff(-0.0f, 0.0f); + int64_t correct4 = -dfunc.i_ff(-0.0f, -0.0f); - if( correct == q[j] || correct2 == q[j] || correct3 == q[j] || correct4 == q[j] ) + if (correct == q[j] || correct2 == q[j] + || correct3 == q[j] || correct4 == q[j]) continue; } else { - int64_t correct = -dfunc.i_ff( 0.0f, s2[j] ); - int64_t correct2 = -dfunc.i_ff( -0.0f, s2[j] ); - if( correct == q[j] || correct2 == q[j] ) - continue; + int64_t correct = 
-dfunc.i_ff(0.0f, s2[j]); + int64_t correct2 = -dfunc.i_ff(-0.0f, s2[j]); + if (correct == q[j] || correct2 == q[j]) continue; } } - else if( IsDoubleSubnormal( s2[j] ) ) + else if (IsDoubleSubnormal(s2[j])) { - int64_t correct = -dfunc.i_ff( s[j], 0.0f ); - int64_t correct2 = -dfunc.i_ff( s[j], -0.0f ); - if( correct == q[j] || correct2 == q[j] ) - continue; + int64_t correct = -dfunc.i_ff(s[j], 0.0f); + int64_t correct2 = -dfunc.i_ff(s[j], -0.0f); + if (correct == q[j] || correct2 == q[j]) continue; } - } uint64_t err = -t[j] - q[j]; - if( q[j] > -t[j] ) - err = q[j] + t[j]; - vlog_error( "\nERROR: %sD%s: %lld ulp error at {%.13la, %.13la}: *%lld vs. %lld (index: %d)\n", name, sizeNames[k], err, ((double*) s)[j], ((double*) s2)[j], -t[j], q[j], j ); + if (q[j] > -t[j]) err = q[j] + t[j]; + vlog_error("\nERROR: %sD%s: %lld ulp error at {%.13la, " + "%.13la}: *%lld vs. %lld (index: %d)\n", + name, sizeNames[k], err, ((double *)s)[j], + ((double *)s2)[j], -t[j], q[j], j); error = -1; goto exit; } } } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) ) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error ); + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); return error; } } - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 3 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); - if( 0 == ( base & 0x0fffffff) ) + if (0 == (base & 0x0fffffff)) { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->threadCount); - } else - { - vlog("." 
); - } - fflush(stdout); + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); } exit: return error; } - diff --git a/test_conformance/math_brute_force/macro_unary.cpp b/test_conformance/math_brute_force/macro_unary.cpp index 70f724ce..26a186f6 100644 --- a/test_conformance/math_brute_force/macro_unary.cpp +++ b/test_conformance/math_brute_force/macro_unary.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -33,60 +33,77 @@ static int BuildKernelDouble(const char *name, int vectorSize, static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p, bool relaxedMode) { - const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in)\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in[i] );\n" - "}\n" - }; - const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global float* in)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " float3 f0 = vload3( 0, in + 3 * i );\n" - " int3 i0 = ", name, "( f0 );\n" - " vstore3( i0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two buffer size \n" - " int3 i0;\n" - " float3 f0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (float3)( in[3*i], 0xdead, 0xdead ); \n" - " break;\n" - " case 0:\n" - " f0 = (float3)( in[3*i], in[3*i+1], 0xdead ); \n" - " break;\n" - " }\n" - " i0 = ", name, "( f0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = i0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = i0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + const char *c[] = { "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global int", + sizeNames[vectorSize], + "* out, __global float", + sizeNames[vectorSize], + "* in)\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i] );\n" + "}\n" }; + const char *c3[] = { + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global int* out, __global float* in)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " float3 f0 = vload3( 0, in + 3 * i );\n" + " int3 i0 = ", + name, + "( f0 );\n" + " vstore3( i0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " int3 i0;\n" + " float3 f0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (float3)( in[3*i], 0xdead, 0xdead ); \n" + " break;\n" + " case 0:\n" + " f0 = (float3)( in[3*i], in[3*i+1], 0xdead ); \n" + " break;\n" + " }\n" + " i0 = ", + name, + "( f0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = i0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = i0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, relaxedMode); @@ -97,62 +114,79 @@ static int BuildKernelDouble(const char *name, int vectorSize, bool relaxedMode) { const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global long", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in)\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in[i] );\n" - "}\n" - }; - - const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global long* out, __global double* in)\n" + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global long", + sizeNames[vectorSize], + "* out, __global double", + sizeNames[vectorSize], + "* in)\n" "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " double3 d0 = vload3( 0, in + 3 * i );\n" - " long3 l0 = ", name, "( d0 );\n" - " 
vstore3( l0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" - " double3 d0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " d0 = (double3)( in[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " long3 l0 = ", name, "( d0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = l0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = l0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i] );\n" + "}\n" }; + + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global long* out, __global double* in)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " double3 d0 = vload3( 0, in + 3 * i );\n" + " long3 l0 = ", + name, + "( d0 );\n" + " vstore3( l0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " double3 d0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " d0 = (double3)( in[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " long3 l0 = ", + name, + "( d0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = l0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = l0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, relaxedMode); @@ -160,80 +194,90 @@ static int BuildKernelDouble(const char *name, int vectorSize, typedef struct BuildKernelInfo { - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *nameInCode; + cl_uint offset; // the first vector size to build + cl_uint kernel_count; + cl_kernel **kernels; + cl_program *programs; + const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-}BuildKernelInfo; +} BuildKernelInfo; -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernel(info->nameInCode, i, info->kernel_count, info->kernels[i], info->programs + i, info->relaxedMode); } -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernelDouble(info->nameInCode, i, info->kernel_count, info->kernels[i], info->programs + i, info->relaxedMode); } -//Thread specific data for a worker thread +// Thread specific data for a worker thread typedef struct ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem outBuf[ VECTOR_SIZE_COUNT ]; // output buffers for the thread - cl_command_queue tQueue; // per thread command queue to improve performance -}ThreadInfo; + cl_mem inBuf; // input buffer for the thread + cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + cl_command_queue tQueue; // per thread command queue to improve performance +} ThreadInfo; typedef struct TestInfo { - size_t subBufferSize; // Size of the sub-buffer in elements - const Func *f; // A pointer to the function info - cl_program programs[ VECTOR_SIZE_COUNT ]; // programs for 
various vector sizes - cl_kernel *k[VECTOR_SIZE_COUNT ]; // arrays of thread-specific kernels for each worker thread: k[vector_size][thread_id] - ThreadInfo *tinfo; // An array of thread specific information for each worker thread - cl_uint threadCount; // Number of worker threads - cl_uint jobCount; // Number of jobs - cl_uint step; // step between each chunk and the next. - cl_uint scale; // stride between individual test values - int ftz; // non-zero if running in flush to zero mode + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes + cl_kernel + *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each + // worker thread: k[vector_size][thread_id] + ThreadInfo * + tinfo; // An array of thread specific information for each worker thread + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. 
+ cl_uint scale; // stride between individual test values + int ftz; // non-zero if running in flush to zero mode -}TestInfo; +} TestInfo; -static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p ); +static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p); int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode) { - TestInfo test_info; - cl_int error; - size_t i, j; + TestInfo test_info; + cl_int error; + size_t i, j; logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); // Init test_info - memset( &test_info, 0, sizeof( test_info ) ); + memset(&test_info, 0, sizeof(test_info)); test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); test_info.scale = getTestScale(sizeof(cl_float)); - if (gWimpyMode ) + if (gWimpyMode) { - test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.subBufferSize = gWimpyBufferSize + / (sizeof(cl_float) + * RoundUpToNextPowerOfTwo(test_info.threadCount)); } - test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale; + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; if (test_info.step / test_info.subBufferSize != test_info.scale) { - //there was overflow + // there was overflow test_info.jobCount = 1; } else @@ -242,51 +286,68 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode) } test_info.f = f; - test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); - // cl_kernels aren't thread safe, so we make one for each vector size for every thread - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); + // cl_kernels aren't thread safe, so we make one for each 
vector size for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { - size_t array_size = test_info.threadCount * sizeof( cl_kernel ); - test_info.k[i] = (cl_kernel*)malloc( array_size ); - if( NULL == test_info.k[i] ) + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel *)malloc(array_size); + if (NULL == test_info.k[i]) { - vlog_error( "Error: Unable to allocate storage for kernels!\n" ); + vlog_error("Error: Unable to allocate storage for kernels!\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.k[i], 0, array_size ); + memset(test_info.k[i], 0, array_size); } - test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) ); - if( NULL == test_info.tinfo ) + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) { - vlog_error( "Error: Unable to allocate storage for thread specific data.\n" ); + vlog_error( + "Error: Unable to allocate storage for thread specific data.\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) ); - for( i = 0; i < test_info.threadCount; i++ ) + memset(test_info.tinfo, 0, + test_info.threadCount * sizeof(*test_info.tinfo)); + for (i = 0; i < test_info.threadCount; i++) { - cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) }; - test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if( error || NULL == test_info.tinfo[i].inBuf) + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_float), + test_info.subBufferSize * sizeof(cl_float) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) { - vlog_error( 
"Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); goto exit; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if( error || NULL == test_info.tinfo[i].outBuf[j] ) + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) { - vlog_error( "Error: Unable to create sub-buffer of gOutBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); goto exit; } } - test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error); - if( NULL == test_info.tinfo[i].tQueue || error ) + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) { - vlog_error( "clCreateCommandQueue failed. (%d)\n", error ); + vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); goto exit; } } @@ -297,281 +358,315 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode) gMinVectorSizeIndex, test_info.threadCount, test_info.k, test_info.programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_FloatFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) goto exit; } - if( !gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info ); + error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info); - if( error ) - goto exit; + if (error) goto exit; - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input array + // Init input array cl_uint *p = (cl_uint *)gIn; - for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ ) + for (j = 0; j < BUFFER_SIZE / sizeof(float); j++) p[j] = genrand_int32(d); - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + BUFFER_SIZE, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - size_t vectorSize = sizeof( cl_float ) * sizeValues[j]; - size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; // BUFFER_SIZE / vectorSize rounded up - if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; } - if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer 
), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; } + size_t vectorSize = sizeof(cl_float) * sizeValues[j]; + size_t localCount = (BUFFER_SIZE + vectorSize - 1) + / vectorSize; // BUFFER_SIZE / vectorSize rounded up + if ((error = clSetKernelArg(test_info.k[j][0], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( i = 0; i < PERF_LOOP_COUNT; i++ ) + for (i = 0; i < PERF_LOOP_COUNT; i++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], + 1, NULL, &localCount, NULL, + 0, NULL, NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (BUFFER_SIZE / sizeof(float)); + 
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", + f->name, sizeNames[j]); } } - vlog( "\n" ); + vlog("\n"); exit: - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { clReleaseProgram(test_info.programs[i]); - if( test_info.k[i] ) + if (test_info.k[i]) { - for( j = 0; j < test_info.threadCount; j++ ) + for (j = 0; j < test_info.threadCount; j++) clReleaseKernel(test_info.k[i][j]); - free( test_info.k[i] ); + free(test_info.k[i]); } } - if( test_info.tinfo ) + if (test_info.tinfo) { - for( i = 0; i < test_info.threadCount; i++ ) + for (i = 0; i < test_info.threadCount; i++) { clReleaseMemObject(test_info.tinfo[i].inBuf); - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) clReleaseMemObject(test_info.tinfo[i].outBuf[j]); clReleaseCommandQueue(test_info.tinfo[i].tQueue); } - free( test_info.tinfo ); + free(test_info.tinfo); } return error; } -static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data ) +static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data) { - const TestInfo *job = (const TestInfo *) data; - size_t buffer_elements = job->subBufferSize; - size_t buffer_size = buffer_elements * sizeof( cl_float ); + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_float); cl_uint scale = job->scale; - cl_uint base = job_id * (cl_uint) job->step; + cl_uint base = job_id * (cl_uint)job->step; ThreadInfo *tinfo = job->tinfo + thread_id; - fptr func = job->f->func; - int ftz = job->ftz; + fptr func = job->f->func; + int ftz = job->ftz; cl_uint j, k; cl_int error = CL_SUCCESS; - cl_int ret = CL_SUCCESS; + cl_int ret = CL_SUCCESS; const char *name = job->f->name; int signbit_test = 0; - if(!strcmp(name, "signbit")) - signbit_test = 1; + if (!strcmp(name, "signbit")) signbit_test = 1; - 
#define ref_func(s) ( signbit_test ? func.i_f_f( s ) : func.i_f( s ) ) +#define ref_func(s) (signbit_test ? func.i_f_f(s) : func.i_f(s)) // start the map of the output arrays - cl_event e[ VECTOR_SIZE_COUNT ]; - cl_int *out[ VECTOR_SIZE_COUNT ]; - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + cl_event e[VECTOR_SIZE_COUNT]; + cl_int *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error); - if( error || NULL == out[j]) + out[j] = (cl_int *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); return error; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); // Write the new values to the input array - cl_uint *p = (cl_uint*) gIn + thread_id * buffer_elements; - for( j = 0; j < buffer_elements; j++ ) - p[j] = base + j * scale; + cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) p[j] = base + j * scale; - if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueWriteBuffer failed! 
err: %d\n", error); return error; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - //Wait for the map to finish - if( (error = clWaitForEvents(1, e + j) )) + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) { - vlog_error( "Error: clWaitForEvents failed! err: %d\n", error ); + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); return error; } - if( (error = clReleaseEvent( e[j] ) )) + if ((error = clReleaseEvent(e[j]))) { - vlog_error( "Error: clReleaseEvent failed! err: %d\n", error ); + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); return error; } - // Fill the result buffer with garbage, so that old results don't carry over + // Fill the result buffer with garbage, so that old results don't carry + // over uint32_t pattern = 0xffffdead; memset_pattern4(out[j], &pattern, buffer_size); - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) )) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueMapBuffer failed! 
err: %d\n", error); return error; } // run the kernel - size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; - cl_kernel kernel = job->k[j][thread_id]; //each worker thread has its own copy of the cl_kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel cl_program program = job->programs[j]; - if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; } - if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; } - - if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL))) + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); return error; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 2 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); - if( gSkipCorrectnessTesting ) - return CL_SUCCESS; + if (gSkipCorrectnessTesting) return CL_SUCCESS; - //Calculate the correctly rounded reference result + // Calculate the correctly rounded reference result cl_int *r = (cl_int *)gOut_Ref + thread_id * buffer_elements; float *s = (float *)p; - for( j = 0; j < buffer_elements; j++ ) - r[j] = ref_func( s[j] ); + for (j = 0; j < buffer_elements; j++) r[j] = ref_func(s[j]); - // Read the data back -- no need to wait for the first N-1 buffers. 
This is an in order queue. - for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ ) + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) { - out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (cl_int *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); return error; } } // Wait for the last buffer - out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (cl_int *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, buffer_size, + 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, error); return error; } - //Verify data + // Verify data cl_int *t = (cl_int *)r; - for( j = 0; j < buffer_elements; j++ ) + for (j = 0; j < buffer_elements; j++) { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { cl_int *q = out[0]; // If we aren't getting the correctly rounded result - if( gMinVectorSizeIndex == 0 && t[j] != q[j]) + if (gMinVectorSizeIndex == 0 && t[j] != q[j]) { // If we aren't getting the correctly rounded result - if( ftz ) + if (ftz) { - if( IsFloatSubnormal( s[j]) ) + if (IsFloatSubnormal(s[j])) { - int correct = ref_func( +0.0f ); - int correct2 = ref_func( -0.0f ); - if( correct == q[j] || correct2 == q[j] ) - continue; + int correct = ref_func(+0.0f); + int correct2 = ref_func(-0.0f); + if (correct == q[j] || correct2 == q[j]) continue; } } uint32_t err = t[j] - q[j]; - if( q[j] > t[j] ) - err = q[j] - t[j]; - vlog_error( "\nERROR: %s: %d ulp error at %a: *%d vs. %d\n", name, err, ((float*) s)[j], t[j], q[j] ); + if (q[j] > t[j]) err = q[j] - t[j]; + vlog_error("\nERROR: %s: %d ulp error at %a: *%d vs. %d\n", + name, err, ((float *)s)[j], t[j], q[j]); error = -1; goto exit; } - for( k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++ ) + for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++) { q = out[k]; // If we aren't getting the correctly rounded result - if( -t[j] != q[j] ) + if (-t[j] != q[j]) { - if( ftz ) + if (ftz) { - if( IsFloatSubnormal( s[j])) + if (IsFloatSubnormal(s[j])) { - int correct = -ref_func( +0.0f ); - int correct2 = -ref_func( -0.0f ); - if( correct == q[j] || correct2 == q[j] ) - continue; + int correct = -ref_func(+0.0f); + int correct2 = -ref_func(-0.0f); + if (correct == q[j] || correct2 == q[j]) continue; } } uint32_t err = -t[j] - q[j]; - if( q[j] > -t[j] ) - err = q[j] + t[j]; - vlog_error( "\nERROR: %s%s: %d ulp error at %a: *%d vs. 
%d\n", name, sizeNames[k], err, ((float*) s)[j], -t[j], q[j] ); - error = -1; - goto exit; + if (q[j] > -t[j]) err = q[j] + t[j]; + vlog_error( + "\nERROR: %s%s: %d ulp error at %a: *%d vs. %d\n", name, + sizeNames[k], err, ((float *)s)[j], -t[j], q[j]); + error = -1; + goto exit; } } } @@ -579,60 +674,69 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data ) exit: ret = error; - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) ) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error ); + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); return error; } } - if( (error = clFlush(tinfo->tQueue) )) + if ((error = clFlush(tinfo->tQueue))) { - vlog( "clFlush 3 failed\n" ); + vlog("clFlush 3 failed\n"); return error; } - if( 0 == ( base & 0x0fffffff) ) + if (0 == (base & 0x0fffffff)) { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->threadCount); - } else - { - vlog("." 
); - } - fflush(stdout); + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); } return ret; } -static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data ); +static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data); int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode) { - TestInfo test_info; - cl_int error; - size_t i, j; + TestInfo test_info; + cl_int error; + size_t i, j; logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); // Init test_info - memset( &test_info, 0, sizeof( test_info ) ); + memset(&test_info, 0, sizeof(test_info)); test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); test_info.scale = getTestScale(sizeof(cl_double)); - if (gWimpyMode ) + if (gWimpyMode) { - test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.subBufferSize = gWimpyBufferSize + / (sizeof(cl_double) + * RoundUpToNextPowerOfTwo(test_info.threadCount)); } - test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale; + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; if (test_info.step / test_info.subBufferSize != test_info.scale) { - //there was overflow + // there was overflow test_info.jobCount = 1; } else @@ -643,52 +747,69 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode) test_info.f = f; test_info.ftz = f->ftz || gForceFTZ; - // cl_kernels aren't thread safe, so we make one for each vector size for every thread - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + // cl_kernels aren't thread safe, so we make 
one for each vector size for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { - size_t array_size = test_info.threadCount * sizeof( cl_kernel ); - test_info.k[i] = (cl_kernel*)malloc( array_size ); - if( NULL == test_info.k[i] ) + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel *)malloc(array_size); + if (NULL == test_info.k[i]) { - vlog_error( "Error: Unable to allocate storage for kernels!\n" ); + vlog_error("Error: Unable to allocate storage for kernels!\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.k[i], 0, array_size ); + memset(test_info.k[i], 0, array_size); } - test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) ); - if( NULL == test_info.tinfo ) + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) { - vlog_error( "Error: Unable to allocate storage for thread specific data.\n" ); + vlog_error( + "Error: Unable to allocate storage for thread specific data.\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) ); - for( i = 0; i < test_info.threadCount; i++ ) + memset(test_info.tinfo, 0, + test_info.threadCount * sizeof(*test_info.tinfo)); + for (i = 0; i < test_info.threadCount; i++) { - cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) }; - test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if( error || NULL == test_info.tinfo[i].inBuf) + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_double), + test_info.subBufferSize * sizeof(cl_double) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) { 
- vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); goto exit; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - /* Qualcomm fix: 9461 read-write flags must be compatible with parent buffer */ - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + /* Qualcomm fix: 9461 read-write flags must be compatible with + * parent buffer */ + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); /* Qualcomm fix: end */ - if( error || NULL == test_info.tinfo[i].outBuf[j] ) + if (error || NULL == test_info.tinfo[i].outBuf[j]) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); goto exit; } } - test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error); - if( NULL == test_info.tinfo[i].tQueue || error ) + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) { - vlog_error( "clCreateCommandQueue failed. (%d)\n", error ); + vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); goto exit; } } @@ -699,117 +820,131 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode) gMinVectorSizeIndex, test_info.threadCount, test_info.k, test_info.programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_DoubleFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) goto exit; } - if( !gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info ); + error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info); - if( error ) - goto exit; + if (error) goto exit; - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input array + // Init input array cl_ulong *p = (cl_ulong *)gIn; - for( j = 0; j < BUFFER_SIZE / sizeof( cl_double ); j++ ) + for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++) p[j] = DoubleFromUInt32(genrand_int32(d)); - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + BUFFER_SIZE, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_double); size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; - if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; } - if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), 
&gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; } + if ((error = clSetKernelArg(test_info.k[j][0], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( i = 0; i < PERF_LOOP_COUNT; i++ ) + for (i = 0; i < PERF_LOOP_COUNT; i++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], + 1, NULL, &localCount, NULL, + 0, NULL, NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (BUFFER_SIZE / sizeof(double)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", + f->name, sizeNames[j]); } - for( ; j < gMaxVectorSizeIndex; j++ ) - vlog( "\t -- " ); + for (; j < 
gMaxVectorSizeIndex; j++) vlog("\t -- "); } - vlog( "\n" ); + vlog("\n"); exit: - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { clReleaseProgram(test_info.programs[i]); - if( test_info.k[i] ) + if (test_info.k[i]) { - for( j = 0; j < test_info.threadCount; j++ ) + for (j = 0; j < test_info.threadCount; j++) clReleaseKernel(test_info.k[i][j]); - free( test_info.k[i] ); + free(test_info.k[i]); } } - if( test_info.tinfo ) + if (test_info.tinfo) { - for( i = 0; i < test_info.threadCount; i++ ) + for (i = 0; i < test_info.threadCount; i++) { clReleaseMemObject(test_info.tinfo[i].inBuf); - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) clReleaseMemObject(test_info.tinfo[i].outBuf[j]); clReleaseCommandQueue(test_info.tinfo[i].tQueue); } - free( test_info.tinfo ); + free(test_info.tinfo); } return error; } -static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data ) +static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data) { - const TestInfo *job = (const TestInfo *) data; - size_t buffer_elements = job->subBufferSize; - size_t buffer_size = buffer_elements * sizeof( cl_double ); + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_double); cl_uint scale = job->scale; - cl_uint base = job_id * (cl_uint) job->step; + cl_uint base = job_id * (cl_uint)job->step; ThreadInfo *tinfo = job->tinfo + thread_id; - dptr dfunc = job->f->dfunc; + dptr dfunc = job->f->dfunc; cl_uint j, k; cl_int error; int ftz = job->ftz; @@ -818,189 +953,209 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data ) Force64BitFPUPrecision(); // start the map of the output arrays - cl_event e[ VECTOR_SIZE_COUNT ]; - cl_long *out[ VECTOR_SIZE_COUNT ]; - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + 
cl_event e[VECTOR_SIZE_COUNT]; + cl_long *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error); - if( error || NULL == out[j]) + out[j] = (cl_long *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); return error; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); // Write the new values to the input array - cl_double *p = (cl_double*) gIn + thread_id * buffer_elements; - for( j = 0; j < buffer_elements; j++ ) - p[j] = DoubleFromUInt32( base + j * scale); + cl_double *p = (cl_double *)gIn + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) + p[j] = DoubleFromUInt32(base + j * scale); - if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); return error; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - //Wait for the map to finish - if( (error = clWaitForEvents(1, e + j) )) + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) { - vlog_error( "Error: clWaitForEvents failed! err: %d\n", error ); + vlog_error("Error: clWaitForEvents failed! 
err: %d\n", error); return error; } - if( (error = clReleaseEvent( e[j] ) )) + if ((error = clReleaseEvent(e[j]))) { - vlog_error( "Error: clReleaseEvent failed! err: %d\n", error ); + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); return error; } - // Fill the result buffer with garbage, so that old results don't carry over + // Fill the result buffer with garbage, so that old results don't carry + // over uint32_t pattern = 0xffffdead; memset_pattern4(out[j], &pattern, buffer_size); - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) )) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); return error; } // run the kernel - size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; - cl_kernel kernel = job->k[j][thread_id]; //each worker thread has its own copy of the cl_kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel cl_program program = job->programs[j]; - if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; } - if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; } - - if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL))) + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + + if ((error = 
clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); return error; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 2 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); - if( gSkipCorrectnessTesting ) - return CL_SUCCESS; + if (gSkipCorrectnessTesting) return CL_SUCCESS; - //Calculate the correctly rounded reference result + // Calculate the correctly rounded reference result cl_long *r = (cl_long *)gOut_Ref + thread_id * buffer_elements; cl_double *s = (cl_double *)p; - for( j = 0; j < buffer_elements; j++ ) - r[j] = dfunc.i_f( s[j] ); + for (j = 0; j < buffer_elements; j++) r[j] = dfunc.i_f(s[j]); - // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue. - for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ ) + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) { - out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (cl_long *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, + error); return error; } } // Wait for the last buffer - out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (cl_long *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, buffer_size, + 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); return error; } - //Verify data + // Verify data cl_long *t = (cl_long *)r; - for( j = 0; j < buffer_elements; j++ ) + for (j = 0; j < buffer_elements; j++) { cl_long *q = out[0]; // If we aren't getting the correctly rounded result - if( gMinVectorSizeIndex == 0 && t[j] != q[j]) + if (gMinVectorSizeIndex == 0 && t[j] != q[j]) { // If we aren't getting the correctly rounded result - if( ftz ) + if (ftz) { - if( IsDoubleSubnormal( s[j]) ) + if (IsDoubleSubnormal(s[j])) { - cl_long correct = dfunc.i_f( +0.0f ); - cl_long correct2 = dfunc.i_f( -0.0f ); - if( correct == q[j] || correct2 == q[j] ) - continue; + cl_long correct = dfunc.i_f(+0.0f); + cl_long correct2 = dfunc.i_f(-0.0f); + if (correct == q[j] || correct2 == q[j]) continue; } } cl_ulong err = t[j] - q[j]; - if( q[j] > t[j] ) - err = q[j] - t[j]; - vlog_error( "\nERROR: %sD: %zd ulp error at %.13la: *%zd vs. %zd\n", name, err, ((double*) gIn)[j], t[j], q[j] ); + if (q[j] > t[j]) err = q[j] - t[j]; + vlog_error("\nERROR: %sD: %zd ulp error at %.13la: *%zd vs. 
%zd\n", + name, err, ((double *)gIn)[j], t[j], q[j]); return -1; } - for( k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++ ) + for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++) { q = out[k]; // If we aren't getting the correctly rounded result - if( -t[j] != q[j] ) + if (-t[j] != q[j]) { - if( ftz ) + if (ftz) { - if( IsDoubleSubnormal( s[j])) + if (IsDoubleSubnormal(s[j])) { - int64_t correct = -dfunc.i_f( +0.0f ); - int64_t correct2 = -dfunc.i_f( -0.0f ); - if( correct == q[j] || correct2 == q[j] ) - continue; + int64_t correct = -dfunc.i_f(+0.0f); + int64_t correct2 = -dfunc.i_f(-0.0f); + if (correct == q[j] || correct2 == q[j]) continue; } } cl_ulong err = -t[j] - q[j]; - if( q[j] > -t[j] ) - err = q[j] + t[j]; - vlog_error( "\nERROR: %sD%s: %zd ulp error at %.13la: *%zd vs. %zd\n", name, sizeNames[k], err, ((double*) gIn)[j], -t[j], q[j] ); + if (q[j] > -t[j]) err = q[j] + t[j]; + vlog_error( + "\nERROR: %sD%s: %zd ulp error at %.13la: *%zd vs. %zd\n", + name, sizeNames[k], err, ((double *)gIn)[j], -t[j], q[j]); return -1; } } - } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) ) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error ); + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); return error; } } - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 3 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); - if( 0 == ( base & 0x0fffffff) ) + if (0 == (base & 0x0fffffff)) { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->threadCount); - } else - { - vlog("." 
); - } - fflush(stdout); + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); } return CL_SUCCESS; } - - - - diff --git a/test_conformance/math_brute_force/mad.cpp b/test_conformance/math_brute_force/mad.cpp index ed1d7d53..9292649a 100644 --- a/test_conformance/math_brute_force/mad.cpp +++ b/test_conformance/math_brute_force/mad.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -31,66 +31,87 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p, bool relaxedMode) { - const char *c[] = { - "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in1, __global float", sizeNames[vectorSize], "* in2, __global float", sizeNames[vectorSize], "* in3 )\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in1[i], in2[i], in3[i] );\n" - "}\n" - }; - const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* in, __global float* in2, __global float* in3)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " float3 f0 = vload3( 0, in + 3 * i );\n" - " float3 f1 = vload3( 0, in2 + 3 * i );\n" - " float3 f2 = vload3( 0, in3 + 3 * i );\n" - " f0 = ", name, "( f0, f1, f2 );\n" - " vstore3( f0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two buffer size \n" - " float3 f0, f1, f2;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (float3)( in[3*i], NAN, NAN ); \n" - " f1 = (float3)( in2[3*i], NAN, NAN ); \n" - " f2 = (float3)( in3[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" - " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n" - " f2 = (float3)( in3[3*i], in3[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " f0 = ", name, "( f0, f1, f2 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + const char *c[] = { "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float", + sizeNames[vectorSize], + "* out, __global float", + sizeNames[vectorSize], + "* in1, __global float", + sizeNames[vectorSize], + "* in2, __global float", + sizeNames[vectorSize], + "* in3 )\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i], in3[i] );\n" + "}\n" }; + const char *c3[] = { + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float* out, __global float* in, __global float* in2, " + "__global float* in3)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " float3 f0 = vload3( 0, in + 3 * i );\n" + " float3 f1 = vload3( 0, in2 + 3 * i );\n" + " float3 f2 = vload3( 0, in3 + 3 * i );\n" + " f0 = ", + name, + "( f0, f1, f2 );\n" + " vstore3( f0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " float3 f0, f1, f2;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (float3)( in[3*i], NAN, NAN ); \n" + " f1 = (float3)( in2[3*i], NAN, NAN ); \n" + " f2 = (float3)( in3[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" + " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n" + " f2 = (float3)( in3[3*i], in3[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " f0 = ", + name, + "( f0, f1, f2 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } @@ -98,94 +119,119 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, cl_program *p, bool relaxedMode) { - const char *c[] = { - "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in1, __global double", sizeNames[vectorSize], "* in2, __global double", sizeNames[vectorSize], "* in3 )\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in1[i], in2[i], in3[i] );\n" - "}\n" - }; - const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], 
"( __global double* out, __global double* in, __global double* in2, __global double* in3)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " double3 d0 = vload3( 0, in + 3 * i );\n" - " double3 d1 = vload3( 0, in2 + 3 * i );\n" - " double3 d2 = vload3( 0, in3 + 3 * i );\n" - " d0 = ", name, "( d0, d1, d2 );\n" - " vstore3( d0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" - " double3 d0, d1, d2;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " d0 = (double3)( in[3*i], NAN, NAN ); \n" - " d1 = (double3)( in2[3*i], NAN, NAN ); \n" - " d2 = (double3)( in3[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" - " d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n" - " d2 = (double3)( in3[3*i], in3[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " d0 = ", name, "( d0, d1, d2 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = d0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = d0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double", + sizeNames[vectorSize], + "* out, __global double", + sizeNames[vectorSize], + "* in1, __global double", + sizeNames[vectorSize], + "* in2, __global double", + sizeNames[vectorSize], + "* in3 )\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i], in3[i] );\n" + "}\n" }; + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double* out, __global double* in, __global double* in2, " + "__global double* in3)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " 
{\n" + " double3 d0 = vload3( 0, in + 3 * i );\n" + " double3 d1 = vload3( 0, in2 + 3 * i );\n" + " double3 d2 = vload3( 0, in3 + 3 * i );\n" + " d0 = ", + name, + "( d0, d1, d2 );\n" + " vstore3( d0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two " + "buffer size \n" + " double3 d0, d1, d2;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " d0 = (double3)( in[3*i], NAN, NAN ); \n" + " d1 = (double3)( in2[3*i], NAN, NAN ); \n" + " d2 = (double3)( in3[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" + " d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n" + " d2 = (double3)( in3[3*i], in3[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " d0 = ", + name, + "( d0, d1, d2 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = d0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = d0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } typedef struct BuildKernelInfo { - cl_uint offset; // the first vector size to build - cl_kernel *kernels; - cl_program *programs; - const char *nameInCode; + cl_uint offset; // the first vector size to build + cl_kernel *kernels; + cl_program *programs; + const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-}BuildKernelInfo; +} BuildKernelInfo; -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernel(info->nameInCode, i, info->kernels + i, info->programs + i, info->relaxedMode); } -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernelDouble(info->nameInCode, i, info->kernels + i, info->programs + i, info->relaxedMode); @@ -199,232 +245,356 @@ int TestFunc_mad(const Func *f, MTdata d, bool relaxedMode) logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); - cl_program programs[ VECTOR_SIZE_COUNT ]; - cl_kernel kernels[ VECTOR_SIZE_COUNT ]; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError = 0.0f; -// int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); + // int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & + // gFloatCapabilities); float maxErrorVal = 0.0f; float maxErrorVal2 = 0.0f; float maxErrorVal3 = 0.0f; - size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE; + size_t bufferSize = (gWimpyMode) ? 
gWimpyBufferSize : BUFFER_SIZE; uint64_t step = getTestStep(sizeof(float), bufferSize); // Init the kernels BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_FloatFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) return error; -/* - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) - if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) ) - return error; -*/ + /* + for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, + programs + i) ) ) return error; + */ - for( i = 0; i < (1ULL<<32); i += step ) + for (i = 0; i < (1ULL << 32); i += step) { - //Init input array + // Init input array uint32_t *p = (uint32_t *)gIn; uint32_t *p2 = (uint32_t *)gIn2; uint32_t *p3 = (uint32_t *)gIn3; - for( j = 0; j < bufferSize / sizeof( float ); j++ ) + for (j = 0; j < bufferSize / sizeof(float); j++) { p[j] = genrand_int32(d); p2[j] = genrand_int32(d); p3[j] = genrand_int32(d); } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + bufferSize, gIn2, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, 
CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, + bufferSize, gIn3, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); return error; } // write garbage into output arrays - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { uint32_t pattern = 0xffffdead; memset_pattern4(gOut[j], &pattern, bufferSize); - if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) )) + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); goto exit; } } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - size_t vectorSize = sizeof( cl_float ) * sizeValues[j]; - size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; // bufferSize / vectorSize rounded up - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; } - - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + size_t vectorSize = sizeof(cl_float) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + 
/ vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2), + &gInBuffer2))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3), + &gInBuffer3))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(gQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); - //Calculate the correctly rounded reference result + // Calculate the correctly rounded reference result float *r = (float *)gOut_Ref; float *s = (float *)gIn; float *s2 = (float *)gIn2; float *s3 = (float *)gIn3; - for( j = 0; j < bufferSize / sizeof( float ); j++ ) - r[j] = (float) f->func.f_fff( s[j], s2[j], s3[j] ); + for (j = 0; j < bufferSize / sizeof(float); j++) + r[j] = (float)f->func.f_fff(s[j], s2[j], s3[j]); // Read the data back - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) ) + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "ReadArray failed %d\n", error ); + vlog_error("ReadArray failed %d\n", error); goto exit; } } - if( gSkipCorrectnessTesting ) - break; + if (gSkipCorrectnessTesting) break; - //Verify data -- Commented out on purpose. 
no verification possible. MAD is a random number generator. -/* - uint32_t *t = gOut_Ref; - for( j = 0; j < bufferSize / sizeof( float ); j++ ) - { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) - { - uint32_t *q = gOut[k]; - - // If we aren't getting the correctly rounded result - if( t[j] != q[j] ) + // Verify data -- Commented out on purpose. no verification possible. + // MAD is a random number generator. + /* + uint32_t *t = gOut_Ref; + for( j = 0; j < bufferSize / sizeof( float ); j++ ) { - float test = ((float*) q)[j]; - double correct = f->func.f_fff( s[j], s2[j], s3[j] ); - float err = Ulp_Error( test, correct ); - int fail = ! (fabsf(err) <= f->float_ulps); - - if( fail && ftz ) + for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) { - // retry per section 6.5.3.2 - if( IsFloatSubnormal(correct) ) - { // look at me, - fail = fail && ( test != 0.0f ); - if( ! fail ) - err = 0.0f; - } + uint32_t *q = gOut[k]; - // retry per section 6.5.3.3 - if( fail && IsFloatSubnormal( s[j] ) ) - { // look at me, - double correct2 = f->func.f_fff( 0.0, s2[j], s3[j] ); - double correct3 = f->func.f_fff( -0.0, s2[j], s3[j] ); - float err2 = Ulp_Error( test, correct2 ); - float err3 = Ulp_Error( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; + // If we aren't getting the correctly rounded result + if( t[j] != q[j] ) + { + float test = ((float*) q)[j]; + double correct = f->func.f_fff( s[j], s2[j], s3[j] + ); float err = Ulp_Error( test, correct ); int fail = ! (fabsf(err) <= + f->float_ulps); - // retry per section 6.5.3.4 - if( IsFloatResultSubnormal(correct2, f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) ) - { // look at me now, - fail = fail && ( test != 0.0f); - if( ! 
fail ) - err = 0.0f; - } - - //try with first two args as zero - if( IsFloatSubnormal( s2[j] ) ) - { // its fun to have fun, - correct2 = f->func.f_fff( 0.0, 0.0, s3[j] ); - correct3 = f->func.f_fff( -0.0, 0.0, s3[j] ); - double correct4 = f->func.f_fff( 0.0, -0.0, s3[j] ); - double correct5 = f->func.f_fff( -0.0, -0.0, s3[j] ); - err2 = Ulp_Error( test, correct2 ); - err3 = Ulp_Error( test, correct3 ); - float err4 = Ulp_Error( test, correct4 ); - float err5 = Ulp_Error( test, correct5 ); - fail = fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps)) && - (!(fabsf(err4) <= f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( fabsf( err4 ) < fabsf(err ) ) - err = err4; - if( fabsf( err5 ) < fabsf(err ) ) - err = err5; - - // retry per section 6.5.3.4 - if( IsFloatResultSubnormal(correct2, f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) || - IsFloatResultSubnormal(correct4, f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) ) - { - fail = fail && ( test != 0.0f); + if( fail && ftz ) + { + // retry per section 6.5.3.2 + if( IsFloatSubnormal(correct) ) + { // look at me, + fail = fail && ( test != 0.0f ); if( ! fail ) err = 0.0f; } - if( IsFloatSubnormal( s3[j] ) ) - { // but you have to know how! 
- correct2 = f->func.f_fff( 0.0, 0.0, 0.0f ); - correct3 = f->func.f_fff( -0.0, 0.0, 0.0f ); - correct4 = f->func.f_fff( 0.0, -0.0, 0.0f ); - correct5 = f->func.f_fff( -0.0, -0.0, 0.0f ); - double correct6 = f->func.f_fff( 0.0, 0.0, -0.0f ); - double correct7 = f->func.f_fff( -0.0, 0.0, -0.0f ); - double correct8 = f->func.f_fff( 0.0, -0.0, -0.0f ); - double correct9 = f->func.f_fff( -0.0, -0.0, -0.0f ); - err2 = Ulp_Error( test, correct2 ); - err3 = Ulp_Error( test, correct3 ); - err4 = Ulp_Error( test, correct4 ); - err5 = Ulp_Error( test, correct5 ); - float err6 = Ulp_Error( test, correct6 ); - float err7 = Ulp_Error( test, correct7 ); - float err8 = Ulp_Error( test, correct8 ); - float err9 = Ulp_Error( test, correct9 ); - fail = fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps)) && - (!(fabsf(err4) <= f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps)) && - (!(fabsf(err5) <= f->float_ulps)) && (!(fabsf(err6) <= f->float_ulps)) && - (!(fabsf(err7) <= f->float_ulps)) && (!(fabsf(err8) <= f->float_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) + // retry per section 6.5.3.3 + if( fail && IsFloatSubnormal( s[j] ) ) + { // look at me, + double correct2 = f->func.f_fff( 0.0, s2[j], + s3[j] ); double correct3 = f->func.f_fff( -0.0, s2[j], s3[j] ); float + err2 = Ulp_Error( test, correct2 ); float err3 = Ulp_Error( test, + correct3 ); fail = fail && ((!(fabsf(err2) <= f->float_ulps)) && + (!(fabsf(err3) <= f->float_ulps))); if( fabsf( err2 ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err = err3; - if( fabsf( err4 ) < fabsf(err ) ) - err = err4; - if( fabsf( err5 ) < fabsf(err ) ) - err = err5; - if( fabsf( err6 ) < fabsf(err ) ) - err = err6; - if( fabsf( err7 ) < fabsf(err ) ) - err = err7; - if( fabsf( err8 ) < fabsf(err ) ) - err = err8; - if( fabsf( err9 ) < fabsf(err ) ) - err = err9; // retry per section 6.5.3.4 - if( IsFloatResultSubnormal(correct2, f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps 
) || - IsFloatResultSubnormal(correct4, f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) || - IsFloatResultSubnormal( correct6, f->float_ulps ) || IsFloatResultSubnormal(correct7, f->float_ulps ) || - IsFloatResultSubnormal(correct8, f->float_ulps ) || IsFloatResultSubnormal( correct9, f->float_ulps ) ) + if( IsFloatResultSubnormal(correct2, + f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) ) + { // look at me now, + fail = fail && ( test != 0.0f); + if( ! fail ) + err = 0.0f; + } + + //try with first two args as zero + if( IsFloatSubnormal( s2[j] ) ) + { // its fun to have fun, + correct2 = f->func.f_fff( 0.0, 0.0, + s3[j] ); correct3 = f->func.f_fff( -0.0, 0.0, s3[j] ); double correct4 + = f->func.f_fff( 0.0, -0.0, s3[j] ); double correct5 = f->func.f_fff( + -0.0, -0.0, s3[j] ); err2 = Ulp_Error( test, correct2 ); err3 = + Ulp_Error( test, correct3 ); float err4 = Ulp_Error( test, correct4 ); + float err5 = Ulp_Error( test, correct5 + ); fail = fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) + <= f->float_ulps)) && + (!(fabsf(err4) <= + f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps))); if( fabsf( err2 + ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err = + err3; if( fabsf( err4 ) < fabsf(err ) ) err = err4; if( fabsf( err5 ) < + fabsf(err ) ) err = err5; + + // retry per section 6.5.3.4 + if( IsFloatResultSubnormal(correct2, + f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) || + IsFloatResultSubnormal(correct4, + f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) ) + { + fail = fail && ( test != 0.0f); + if( ! fail ) + err = 0.0f; + } + + if( IsFloatSubnormal( s3[j] ) ) + { // but you have to know how! 
+ correct2 = f->func.f_fff( 0.0, 0.0, + 0.0f ); correct3 = f->func.f_fff( -0.0, 0.0, 0.0f ); correct4 = + f->func.f_fff( 0.0, -0.0, 0.0f ); correct5 = f->func.f_fff( -0.0, -0.0, + 0.0f ); double correct6 = f->func.f_fff( 0.0, 0.0, -0.0f ); double + correct7 = f->func.f_fff( -0.0, 0.0, -0.0f ); double correct8 = + f->func.f_fff( 0.0, -0.0, -0.0f ); double correct9 = f->func.f_fff( + -0.0, -0.0, -0.0f ); err2 = Ulp_Error( test, correct2 ); err3 = + Ulp_Error( test, correct3 ); err4 = Ulp_Error( test, correct4 ); err5 + = Ulp_Error( test, correct5 ); float err6 = Ulp_Error( test, correct6 + ); float err7 = Ulp_Error( test, correct7 ); float err8 = Ulp_Error( + test, correct8 ); float err9 = Ulp_Error( test, correct9 ); fail = + fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= + f->float_ulps)) && + (!(fabsf(err4) <= + f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps)) && + (!(fabsf(err5) <= + f->float_ulps)) && (!(fabsf(err6) <= f->float_ulps)) && + (!(fabsf(err7) <= + f->float_ulps)) && (!(fabsf(err8) <= f->float_ulps))); if( fabsf( err2 + ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err = + err3; if( fabsf( err4 ) < fabsf(err ) ) err = err4; if( fabsf( err5 ) < + fabsf(err ) ) err = err5; if( fabsf( err6 ) < fabsf(err ) ) err = err6; + if( fabsf( err7 ) < fabsf(err ) ) + err = err7; + if( fabsf( err8 ) < fabsf(err ) ) + err = err8; + if( fabsf( err9 ) < fabsf(err ) ) + err = err9; + + // retry per section 6.5.3.4 + if( IsFloatResultSubnormal(correct2, + f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) || + IsFloatResultSubnormal(correct4, + f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) || + IsFloatResultSubnormal( + correct6, f->float_ulps ) || IsFloatResultSubnormal(correct7, + f->float_ulps ) || IsFloatResultSubnormal(correct8, f->float_ulps ) || + IsFloatResultSubnormal( correct9, f->float_ulps ) ) + { + fail = fail && ( test != 0.0f); + if( ! 
fail ) + err = 0.0f; + } + } + } + else if( IsFloatSubnormal( s3[j] ) ) + { + correct2 = f->func.f_fff( 0.0, s2[j], + 0.0 ); correct3 = f->func.f_fff( -0.0, s2[j], 0.0 ); double correct4 = + f->func.f_fff( 0.0, s2[j], -0.0 ); double correct5 = f->func.f_fff( + -0.0, s2[j], -0.0 ); err2 = Ulp_Error( test, correct2 ); err3 = + Ulp_Error( test, correct3 ); float err4 = Ulp_Error( test, correct4 ); + float err5 = Ulp_Error( test, correct5 + ); fail = fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) + <= f->float_ulps)) && + (!(fabsf(err4) <= + f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps))); if( fabsf( err2 + ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err = + err3; if( fabsf( err4 ) < fabsf(err ) ) err = err4; if( fabsf( err5 ) < + fabsf(err ) ) err = err5; + + // retry per section 6.5.3.4 + if( IsFloatResultSubnormal(correct2, + f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) || + IsFloatResultSubnormal(correct4, + f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) ) + { + fail = fail && ( test != 0.0f); + if( ! fail ) + err = 0.0f; + } + } + } + else if( fail && IsFloatSubnormal( s2[j] ) ) + { + double correct2 = f->func.f_fff( s[j], 0.0, + s3[j] ); double correct3 = f->func.f_fff( s[j], -0.0, s3[j] ); float + err2 = Ulp_Error( test, correct2 ); float err3 = Ulp_Error( test, + correct3 ); fail = fail && ((!(fabsf(err2) <= f->float_ulps)) && + (!(fabsf(err3) <= f->float_ulps))); if( fabsf( err2 ) < fabsf(err ) ) + err = err2; + if( fabsf( err3 ) < fabsf(err ) ) + err = err3; + + // retry per section 6.5.3.4 + if( IsFloatResultSubnormal(correct2, + f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) ) + { + fail = fail && ( test != 0.0f); + if( ! 
fail ) + err = 0.0f; + } + + //try with second two args as zero + if( IsFloatSubnormal( s3[j] ) ) + { + correct2 = f->func.f_fff( s[j], 0.0, 0.0 + ); correct3 = f->func.f_fff( s[j], -0.0, 0.0 ); double correct4 = + f->func.f_fff( s[j], 0.0, -0.0 ); double correct5 = f->func.f_fff( + s[j], -0.0, -0.0 ); err2 = Ulp_Error( test, correct2 ); err3 = + Ulp_Error( test, correct3 ); float err4 = Ulp_Error( test, correct4 ); + float err5 = Ulp_Error( test, correct5 + ); fail = fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) + <= f->float_ulps)) && + (!(fabsf(err4) <= + f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps))); if( fabsf( err2 + ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err = + err3; if( fabsf( err4 ) < fabsf(err ) ) err = err4; if( fabsf( err5 ) < + fabsf(err ) ) err = err5; + + // retry per section 6.5.3.4 + if( IsFloatResultSubnormal(correct2, + f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) || + IsFloatResultSubnormal(correct4, + f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) ) + { + fail = fail && ( test != 0.0f); + if( ! fail ) + err = 0.0f; + } + } + } + else if( fail && IsFloatSubnormal(s3[j]) ) + { + double correct2 = f->func.f_fff( s[j], + s2[j], 0.0 ); double correct3 = f->func.f_fff( s[j], s2[j], -0.0 ); + float err2 = Ulp_Error( test, correct2 ); + float err3 = Ulp_Error( test, correct3 ); + fail = fail && ((!(fabsf(err2) <= + f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps))); if( fabsf( err2 + ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err = + err3; + + // retry per section 6.5.3.4 + if( IsFloatResultSubnormal(correct2, + f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) ) { fail = fail && ( test != 0.0f); if( ! 
fail ) @@ -432,222 +602,146 @@ int TestFunc_mad(const Func *f, MTdata d, bool relaxedMode) } } } - else if( IsFloatSubnormal( s3[j] ) ) - { - correct2 = f->func.f_fff( 0.0, s2[j], 0.0 ); - correct3 = f->func.f_fff( -0.0, s2[j], 0.0 ); - double correct4 = f->func.f_fff( 0.0, s2[j], -0.0 ); - double correct5 = f->func.f_fff( -0.0, s2[j], -0.0 ); - err2 = Ulp_Error( test, correct2 ); - err3 = Ulp_Error( test, correct3 ); - float err4 = Ulp_Error( test, correct4 ); - float err5 = Ulp_Error( test, correct5 ); - fail = fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps)) && - (!(fabsf(err4) <= f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( fabsf( err4 ) < fabsf(err ) ) - err = err4; - if( fabsf( err5 ) < fabsf(err ) ) - err = err5; - // retry per section 6.5.3.4 - if( IsFloatResultSubnormal(correct2, f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) || - IsFloatResultSubnormal(correct4, f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) ) - { - fail = fail && ( test != 0.0f); - if( ! fail ) - err = 0.0f; - } + if( fabsf(err ) > maxError ) + { + maxError = fabsf(err); + maxErrorVal = s[j]; + maxErrorVal2 = s2[j]; + maxErrorVal3 = s3[j]; + } + + if( fail ) + { + vlog_error( "\nERROR: %s%s: %f ulp error at {%a, + %a, %a}: *%a vs. 
%a\n", f->name, sizeNames[k], err, s[j], s2[j], s3[j], + ((float*) gOut_Ref)[j], test ); error = -1; goto exit; } } - else if( fail && IsFloatSubnormal( s2[j] ) ) - { - double correct2 = f->func.f_fff( s[j], 0.0, s3[j] ); - double correct3 = f->func.f_fff( s[j], -0.0, s3[j] ); - float err2 = Ulp_Error( test, correct2 ); - float err3 = Ulp_Error( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - - // retry per section 6.5.3.4 - if( IsFloatResultSubnormal(correct2, f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) ) - { - fail = fail && ( test != 0.0f); - if( ! fail ) - err = 0.0f; - } - - //try with second two args as zero - if( IsFloatSubnormal( s3[j] ) ) - { - correct2 = f->func.f_fff( s[j], 0.0, 0.0 ); - correct3 = f->func.f_fff( s[j], -0.0, 0.0 ); - double correct4 = f->func.f_fff( s[j], 0.0, -0.0 ); - double correct5 = f->func.f_fff( s[j], -0.0, -0.0 ); - err2 = Ulp_Error( test, correct2 ); - err3 = Ulp_Error( test, correct3 ); - float err4 = Ulp_Error( test, correct4 ); - float err5 = Ulp_Error( test, correct5 ); - fail = fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps)) && - (!(fabsf(err4) <= f->float_ulps)) && (!(fabsf(err5) <= f->float_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( fabsf( err4 ) < fabsf(err ) ) - err = err4; - if( fabsf( err5 ) < fabsf(err ) ) - err = err5; - - // retry per section 6.5.3.4 - if( IsFloatResultSubnormal(correct2, f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) || - IsFloatResultSubnormal(correct4, f->float_ulps ) || IsFloatResultSubnormal(correct5, f->float_ulps ) ) - { - fail = fail && ( test != 0.0f); - if( ! 
fail ) - err = 0.0f; - } - } - } - else if( fail && IsFloatSubnormal(s3[j]) ) - { - double correct2 = f->func.f_fff( s[j], s2[j], 0.0 ); - double correct3 = f->func.f_fff( s[j], s2[j], -0.0 ); - float err2 = Ulp_Error( test, correct2 ); - float err3 = Ulp_Error( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= f->float_ulps)) && (!(fabsf(err3) <= f->float_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - - // retry per section 6.5.3.4 - if( IsFloatResultSubnormal(correct2, f->float_ulps ) || IsFloatResultSubnormal(correct3, f->float_ulps ) ) - { - fail = fail && ( test != 0.0f); - if( ! fail ) - err = 0.0f; - } - } - } - - if( fabsf(err ) > maxError ) - { - maxError = fabsf(err); - maxErrorVal = s[j]; - maxErrorVal2 = s2[j]; - maxErrorVal3 = s3[j]; - } - - if( fail ) - { - vlog_error( "\nERROR: %s%s: %f ulp error at {%a, %a, %a}: *%a vs. %a\n", f->name, sizeNames[k], err, s[j], s2[j], s3[j], ((float*) gOut_Ref)[j], test ); - error = -1; - goto exit; } } - } - } -*/ - if( 0 == (i & 0x0fffffff) ) + */ + if (0 == (i & 0x0fffffff)) { - vlog("." ); + vlog("."); fflush(stdout); } } - if( ! 
gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "pass" ); + vlog("pass"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input array + // Init input array uint32_t *p = (uint32_t *)gIn; uint32_t *p2 = (uint32_t *)gIn2; uint32_t *p3 = (uint32_t *)gIn3; - for( j = 0; j < bufferSize / sizeof( float ); j++ ) + for (j = 0; j < bufferSize / sizeof(float); j++) { p[j] = genrand_int32(d); p2[j] = genrand_int32(d); p3[j] = genrand_int32(d); } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + bufferSize, gIn2, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, + bufferSize, gIn3, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - size_t vectorSize = sizeof( cl_float ) * sizeValues[j]; - size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; // bufferSize / vectorSize rounded up - if( ( error = 
clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; } + size_t vectorSize = sizeof(cl_float) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2), + &gInBuffer2))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3), + &gInBuffer3))) + { + LogBuildError(programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( k = 0; k < PERF_LOOP_COUNT; k++ ) + for (k = 0; k < PERF_LOOP_COUNT; k++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double 
time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(float)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", + f->name, sizeNames[j]); } } - if( ! gSkipCorrectnessTesting ) - vlog( "\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, maxErrorVal3 ); - vlog( "\n" ); + if (!gSkipCorrectnessTesting) + vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, + maxErrorVal3); + vlog("\n"); exit: // Release - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); clReleaseProgram(programs[k]); @@ -661,14 +755,14 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode) uint64_t i; uint32_t j, k; int error; - cl_program programs[ VECTOR_SIZE_COUNT ]; - cl_kernel kernels[ VECTOR_SIZE_COUNT ]; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError = 0.0f; -// int ftz = f->ftz || gForceFTZ; + // int ftz = f->ftz || gForceFTZ; double maxErrorVal = 0.0f; double maxErrorVal2 = 0.0f; double maxErrorVal3 = 0.0f; - size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE; + size_t bufferSize = (gWimpyMode) ? 
gWimpyBufferSize : BUFFER_SIZE; logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); uint64_t step = getTestStep(sizeof(double), bufferSize); @@ -676,223 +770,363 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode) // Init the kernels BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_DoubleFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_DoubleFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) { return error; } -/* - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) - if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) ) - return error; -*/ + /* + for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + + i, programs + i) ) ) return error; + */ - for( i = 0; i < (1ULL<<32); i += step ) + for (i = 0; i < (1ULL << 32); i += step) { - //Init input array + // Init input array double *p = (double *)gIn; double *p2 = (double *)gIn2; double *p3 = (double *)gIn3; - for( j = 0; j < bufferSize / sizeof( double ); j++ ) + for (j = 0; j < bufferSize / sizeof(double); j++) { p[j] = DoubleFromUInt32(genrand_int32(d)); p2[j] = DoubleFromUInt32(genrand_int32(d)); p3[j] = DoubleFromUInt32(genrand_int32(d)); } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + bufferSize, gIn2, 0, NULL, NULL))) { - 
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, + bufferSize, gIn3, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); return error; } // write garbage into output arrays - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { uint32_t pattern = 0xffffdead; memset_pattern4(gOut[j], &pattern, bufferSize); - if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) )) + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); goto exit; } } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - size_t vectorSize = sizeof( cl_double ) * sizeValues[j]; - size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; // bufferSize / vectorSize rounded up - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; } - 
- if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + size_t vectorSize = sizeof(cl_double) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2), + &gInBuffer2))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3), + &gInBuffer3))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(gQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); - //Calculate the correctly rounded reference result + // Calculate the correctly rounded reference result double *r = (double *)gOut_Ref; double *s = (double *)gIn; double *s2 = (double *)gIn2; double *s3 = (double *)gIn3; - for( j = 0; j < bufferSize / sizeof( double ); j++ ) - r[j] = (double) f->dfunc.f_fff( s[j], s2[j], s3[j] ); + for (j = 0; j < bufferSize / sizeof(double); j++) + r[j] = (double)f->dfunc.f_fff(s[j], s2[j], s3[j]); // Read the data back - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) ) + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - 
vlog_error( "ReadArray failed %d\n", error ); + vlog_error("ReadArray failed %d\n", error); goto exit; } } - if( gSkipCorrectnessTesting ) - break; + if (gSkipCorrectnessTesting) break; - //Verify data -- Commented out on purpose. no verification possible. MAD is a random number generator. -/* - uint64_t *t = gOut_Ref; - for( j = 0; j < bufferSize / sizeof( double ); j++ ) - { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) - { - uint64_t *q = gOut[k]; - - // If we aren't getting the correctly rounded result - if( t[j] != q[j] ) + // Verify data -- Commented out on purpose. no verification possible. + // MAD is a random number generator. + /* + uint64_t *t = gOut_Ref; + for( j = 0; j < bufferSize / sizeof( double ); j++ ) { - double test = ((double*) q)[j]; - long double correct = f->dfunc.f_fff( s[j], s2[j], s3[j] ); - float err = Bruteforce_Ulp_Error_Double( test, correct ); - int fail = ! (fabsf(err) <= f->double_ulps); - - if( fail && ftz ) + for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) { - // retry per section 6.5.3.2 - if( IsDoubleResultSubnormal(correct, f->double_ulps) ) - { // look at me, - fail = fail && ( test != 0.0f ); - if( ! 
fail ) - err = 0.0f; - } + uint64_t *q = gOut[k]; - // retry per section 6.5.3.3 - if( fail && IsDoubleSubnormal( s[j] ) ) - { // look at me, - long double correct2 = f->dfunc.f_fff( 0.0, s2[j], s3[j] ); - long double correct3 = f->dfunc.f_fff( -0.0, s2[j], s3[j] ); - float err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - float err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; + // If we aren't getting the correctly rounded result + if( t[j] != q[j] ) + { + double test = ((double*) q)[j]; + long double correct = f->dfunc.f_fff( s[j], s2[j], + s3[j] ); float err = Bruteforce_Ulp_Error_Double( test, correct ); int + fail = ! (fabsf(err) <= f->double_ulps); - // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) ) - { // look at me now, - fail = fail && ( test != 0.0f); - if( ! 
fail ) - err = 0.0f; - } - - //try with first two args as zero - if( IsDoubleSubnormal( s2[j] ) ) - { // its fun to have fun, - correct2 = f->dfunc.f_fff( 0.0, 0.0, s3[j] ); - correct3 = f->dfunc.f_fff( -0.0, 0.0, s3[j] ); - long double correct4 = f->dfunc.f_fff( 0.0, -0.0, s3[j] ); - long double correct5 = f->dfunc.f_fff( -0.0, -0.0, s3[j] ); - err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - float err4 = Bruteforce_Ulp_Error_Double( test, correct4 ); - float err5 = Bruteforce_Ulp_Error_Double( test, correct5 ); - fail = fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) && - (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( fabsf( err4 ) < fabsf(err ) ) - err = err4; - if( fabsf( err5 ) < fabsf(err ) ) - err = err5; - - // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) || - IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps ) ) - { - fail = fail && ( test != 0.0f); - if( ! fail ) - err = 0.0f; + if( fail && ftz ) + { + // retry per section 6.5.3.2 + if( IsDoubleResultSubnormal(correct, + f->double_ulps) ) { // look at me, fail = fail && ( test != 0.0f ); if( + ! fail ) err = 0.0f; } - if( IsDoubleSubnormal( s3[j] ) ) - { // but you have to know how! 
- correct2 = f->dfunc.f_fff( 0.0, 0.0, 0.0f ); - correct3 = f->dfunc.f_fff( -0.0, 0.0, 0.0f ); - correct4 = f->dfunc.f_fff( 0.0, -0.0, 0.0f ); - correct5 = f->dfunc.f_fff( -0.0, -0.0, 0.0f ); - long double correct6 = f->dfunc.f_fff( 0.0, 0.0, -0.0f ); - long double correct7 = f->dfunc.f_fff( -0.0, 0.0, -0.0f ); - long double correct8 = f->dfunc.f_fff( 0.0, -0.0, -0.0f ); - long double correct9 = f->dfunc.f_fff( -0.0, -0.0, -0.0f ); - err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - err4 = Bruteforce_Ulp_Error_Double( test, correct4 ); - err5 = Bruteforce_Ulp_Error_Double( test, correct5 ); - float err6 = Bruteforce_Ulp_Error_Double( test, correct6 ); - float err7 = Bruteforce_Ulp_Error_Double( test, correct7 ); - float err8 = Bruteforce_Ulp_Error_Double( test, correct8 ); - float err9 = Bruteforce_Ulp_Error_Double( test, correct9 ); - fail = fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) && - (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)) && - (!(fabsf(err5) <= f->double_ulps)) && (!(fabsf(err6) <= f->double_ulps)) && - (!(fabsf(err7) <= f->double_ulps)) && (!(fabsf(err8) <= f->double_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( fabsf( err4 ) < fabsf(err ) ) - err = err4; - if( fabsf( err5 ) < fabsf(err ) ) - err = err5; - if( fabsf( err6 ) < fabsf(err ) ) - err = err6; - if( fabsf( err7 ) < fabsf(err ) ) - err = err7; - if( fabsf( err8 ) < fabsf(err ) ) - err = err8; - if( fabsf( err9 ) < fabsf(err ) ) - err = err9; + // retry per section 6.5.3.3 + if( fail && IsDoubleSubnormal( s[j] ) ) + { // look at me, + long double correct2 = f->dfunc.f_fff( 0.0, + s2[j], s3[j] ); long double correct3 = f->dfunc.f_fff( -0.0, s2[j], + s3[j] ); float err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); + float err3 = Bruteforce_Ulp_Error_Double( + test, correct3 ); fail = fail && 
((!(fabsf(err2) <= f->double_ulps)) + && (!(fabsf(err3) <= f->double_ulps))); if( fabsf( err2 ) < fabsf(err ) + ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) err = err3; // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) || - IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps ) || - IsDoubleResultSubnormal( correct6, f->double_ulps ) || IsDoubleResultSubnormal( correct7, f->double_ulps ) || - IsDoubleResultSubnormal( correct8, f->double_ulps ) || IsDoubleResultSubnormal( correct9, f->double_ulps ) ) + if( IsDoubleResultSubnormal( correct2, + f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) + ) { // look at me now, fail = fail && ( test != 0.0f); if( ! fail ) err + = 0.0f; + } + + //try with first two args as zero + if( IsDoubleSubnormal( s2[j] ) ) + { // its fun to have fun, + correct2 = f->dfunc.f_fff( 0.0, 0.0, + s3[j] ); correct3 = f->dfunc.f_fff( -0.0, 0.0, s3[j] ); long double + correct4 = f->dfunc.f_fff( 0.0, -0.0, s3[j] ); long double correct5 = + f->dfunc.f_fff( -0.0, -0.0, s3[j] ); err2 = + Bruteforce_Ulp_Error_Double( test, correct2 ); err3 = + Bruteforce_Ulp_Error_Double( test, correct3 ); float err4 = + Bruteforce_Ulp_Error_Double( test, correct4 ); float err5 = + Bruteforce_Ulp_Error_Double( test, correct5 ); fail = fail && + ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= + f->double_ulps)) && + (!(fabsf(err4) <= + f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps))); if( fabsf( + err2 ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) + err = err3; + if( fabsf( err4 ) < fabsf(err ) ) + err = err4; + if( fabsf( err5 ) < fabsf(err ) ) + err = err5; + + // retry per section 6.5.3.4 + if( IsDoubleResultSubnormal( correct2, + f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) + || IsDoubleResultSubnormal( correct4, f->double_ulps ) || + 
IsDoubleResultSubnormal( correct5, f->double_ulps ) ) + { + fail = fail && ( test != 0.0f); + if( ! fail ) + err = 0.0f; + } + + if( IsDoubleSubnormal( s3[j] ) ) + { // but you have to know how! + correct2 = f->dfunc.f_fff( 0.0, 0.0, + 0.0f ); correct3 = f->dfunc.f_fff( -0.0, 0.0, 0.0f ); correct4 = + f->dfunc.f_fff( 0.0, -0.0, 0.0f ); correct5 = f->dfunc.f_fff( -0.0, + -0.0, 0.0f ); long double correct6 = f->dfunc.f_fff( 0.0, 0.0, -0.0f ); + long double correct7 = + f->dfunc.f_fff( -0.0, 0.0, -0.0f ); long double correct8 = + f->dfunc.f_fff( 0.0, -0.0, -0.0f ); long double correct9 = + f->dfunc.f_fff( -0.0, -0.0, -0.0f ); err2 = + Bruteforce_Ulp_Error_Double( test, correct2 ); err3 = + Bruteforce_Ulp_Error_Double( test, correct3 ); err4 = + Bruteforce_Ulp_Error_Double( test, correct4 ); err5 = + Bruteforce_Ulp_Error_Double( test, correct5 ); float err6 = + Bruteforce_Ulp_Error_Double( test, correct6 ); float err7 = + Bruteforce_Ulp_Error_Double( test, correct7 ); float err8 = + Bruteforce_Ulp_Error_Double( test, correct8 ); float err9 = + Bruteforce_Ulp_Error_Double( test, correct9 ); fail = fail && + ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= + f->double_ulps)) && + (!(fabsf(err4) <= + f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)) && + (!(fabsf(err5) <= + f->double_ulps)) && (!(fabsf(err6) <= f->double_ulps)) && + (!(fabsf(err7) <= + f->double_ulps)) && (!(fabsf(err8) <= f->double_ulps))); if( fabsf( + err2 ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) + err = err3; + if( fabsf( err4 ) < fabsf(err ) ) + err = err4; + if( fabsf( err5 ) < fabsf(err ) ) + err = err5; + if( fabsf( err6 ) < fabsf(err ) ) + err = err6; + if( fabsf( err7 ) < fabsf(err ) ) + err = err7; + if( fabsf( err8 ) < fabsf(err ) ) + err = err8; + if( fabsf( err9 ) < fabsf(err ) ) + err = err9; + + // retry per section 6.5.3.4 + if( IsDoubleResultSubnormal( + correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, + f->double_ulps ) || 
IsDoubleResultSubnormal( correct4, f->double_ulps + ) || IsDoubleResultSubnormal( correct5, f->double_ulps ) || + IsDoubleResultSubnormal( + correct6, f->double_ulps ) || IsDoubleResultSubnormal( correct7, + f->double_ulps ) || IsDoubleResultSubnormal( correct8, f->double_ulps + ) || IsDoubleResultSubnormal( correct9, f->double_ulps ) ) + { + fail = fail && ( test != 0.0f); + if( ! fail ) + err = 0.0f; + } + } + } + else if( IsDoubleSubnormal( s3[j] ) ) + { + correct2 = f->dfunc.f_fff( 0.0, s2[j], + 0.0 ); correct3 = f->dfunc.f_fff( -0.0, s2[j], 0.0 ); long double + correct4 = f->dfunc.f_fff( 0.0, s2[j], -0.0 ); long double correct5 = + f->dfunc.f_fff( -0.0, s2[j], -0.0 ); err2 = + Bruteforce_Ulp_Error_Double( test, correct2 ); err3 = + Bruteforce_Ulp_Error_Double( test, correct3 ); float err4 = + Bruteforce_Ulp_Error_Double( test, correct4 ); float err5 = + Bruteforce_Ulp_Error_Double( test, correct5 ); fail = fail && + ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= + f->double_ulps)) && + (!(fabsf(err4) <= + f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps))); if( fabsf( + err2 ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) + err = err3; + if( fabsf( err4 ) < fabsf(err ) ) + err = err4; + if( fabsf( err5 ) < fabsf(err ) ) + err = err5; + + // retry per section 6.5.3.4 + if( IsDoubleResultSubnormal( correct2, + f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) + || IsDoubleResultSubnormal( correct4, f->double_ulps ) || + IsDoubleResultSubnormal( correct5, f->double_ulps ) ) + { + fail = fail && ( test != 0.0f); + if( ! 
fail ) + err = 0.0f; + } + } + } + else if( fail && IsDoubleSubnormal( s2[j] ) ) + { + long double correct2 = f->dfunc.f_fff( s[j], + 0.0, s3[j] ); long double correct3 = f->dfunc.f_fff( s[j], -0.0, s3[j] + ); float err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); float + err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); fail = fail && + ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= + f->double_ulps))); if( fabsf( err2 ) < fabsf(err ) ) err = err2; if( + fabsf( err3 ) < fabsf(err ) ) err = err3; + + // retry per section 6.5.3.4 + if( IsDoubleResultSubnormal( correct2, + f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps + ) ) + { + fail = fail && ( test != 0.0f); + if( ! fail ) + err = 0.0f; + } + + //try with second two args as zero + if( IsDoubleSubnormal( s3[j] ) ) + { + correct2 = f->dfunc.f_fff( s[j], 0.0, + 0.0 ); correct3 = f->dfunc.f_fff( s[j], -0.0, 0.0 ); long double + correct4 = f->dfunc.f_fff( s[j], 0.0, -0.0 ); long double correct5 = + f->dfunc.f_fff( s[j], -0.0, -0.0 ); err2 = Bruteforce_Ulp_Error_Double( + test, correct2 ); err3 = Bruteforce_Ulp_Error_Double( test, correct3 + ); float err4 = Bruteforce_Ulp_Error_Double( test, correct4 ); float + err5 = Bruteforce_Ulp_Error_Double( test, correct5 ); fail = fail && + ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= + f->double_ulps)) && + (!(fabsf(err4) <= + f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps))); if( fabsf( + err2 ) < fabsf(err ) ) err = err2; if( fabsf( err3 ) < fabsf(err ) ) + err = err3; + if( fabsf( err4 ) < fabsf(err ) ) + err = err4; + if( fabsf( err5 ) < fabsf(err ) ) + err = err5; + + // retry per section 6.5.3.4 + if( IsDoubleResultSubnormal( correct2, + f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) + || IsDoubleResultSubnormal( correct4, f->double_ulps ) || + IsDoubleResultSubnormal( correct5, f->double_ulps ) ) + { + fail = fail && ( test != 0.0f); + if( ! 
fail ) + err = 0.0f; + } + } + } + else if( fail && IsDoubleSubnormal(s3[j]) ) + { + long double correct2 = f->dfunc.f_fff( s[j], + s2[j], 0.0 ); long double correct3 = f->dfunc.f_fff( s[j], s2[j], -0.0 + ); float err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); float + err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); fail = fail && + ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= + f->double_ulps))); if( fabsf( err2 ) < fabsf(err ) ) err = err2; if( + fabsf( err3 ) < fabsf(err ) ) err = err3; + + // retry per section 6.5.3.4 + if( IsDoubleResultSubnormal( correct2, + f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) + ) { fail = fail && ( test != 0.0f); if( ! fail ) @@ -900,224 +1134,147 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode) } } } - else if( IsDoubleSubnormal( s3[j] ) ) - { - correct2 = f->dfunc.f_fff( 0.0, s2[j], 0.0 ); - correct3 = f->dfunc.f_fff( -0.0, s2[j], 0.0 ); - long double correct4 = f->dfunc.f_fff( 0.0, s2[j], -0.0 ); - long double correct5 = f->dfunc.f_fff( -0.0, s2[j], -0.0 ); - err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - float err4 = Bruteforce_Ulp_Error_Double( test, correct4 ); - float err5 = Bruteforce_Ulp_Error_Double( test, correct5 ); - fail = fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) && - (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( fabsf( err4 ) < fabsf(err ) ) - err = err4; - if( fabsf( err5 ) < fabsf(err ) ) - err = err5; - // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) || - IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps ) ) - { - fail = fail && ( test != 0.0f); - if( ! 
fail ) - err = 0.0f; - } + if( fabsf(err ) > maxError ) + { + maxError = fabsf(err); + maxErrorVal = s[j]; + maxErrorVal2 = s2[j]; + maxErrorVal3 = s3[j]; + } + + if( fail ) + { + vlog_error( "\nERROR: %sD%s: %f ulp error at + {%a, %a, %a}: *%a vs. %a\n", f->name, sizeNames[k], err, s[j], s2[j], + s3[j], ((double*) gOut_Ref)[j], test ); error = -1; goto exit; } } - else if( fail && IsDoubleSubnormal( s2[j] ) ) - { - long double correct2 = f->dfunc.f_fff( s[j], 0.0, s3[j] ); - long double correct3 = f->dfunc.f_fff( s[j], -0.0, s3[j] ); - float err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - float err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - - // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) ) - { - fail = fail && ( test != 0.0f); - if( ! 
fail ) - err = 0.0f; - } - - //try with second two args as zero - if( IsDoubleSubnormal( s3[j] ) ) - { - correct2 = f->dfunc.f_fff( s[j], 0.0, 0.0 ); - correct3 = f->dfunc.f_fff( s[j], -0.0, 0.0 ); - long double correct4 = f->dfunc.f_fff( s[j], 0.0, -0.0 ); - long double correct5 = f->dfunc.f_fff( s[j], -0.0, -0.0 ); - err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - float err4 = Bruteforce_Ulp_Error_Double( test, correct4 ); - float err5 = Bruteforce_Ulp_Error_Double( test, correct5 ); - fail = fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) && - (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( fabsf( err4 ) < fabsf(err ) ) - err = err4; - if( fabsf( err5 ) < fabsf(err ) ) - err = err5; - - // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) || - IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps ) ) - { - fail = fail && ( test != 0.0f); - if( ! fail ) - err = 0.0f; - } - } - } - else if( fail && IsDoubleSubnormal(s3[j]) ) - { - long double correct2 = f->dfunc.f_fff( s[j], s2[j], 0.0 ); - long double correct3 = f->dfunc.f_fff( s[j], s2[j], -0.0 ); - float err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - float err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - - // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) ) - { - fail = fail && ( test != 0.0f); - if( ! 
fail ) - err = 0.0f; - } - } - } - - if( fabsf(err ) > maxError ) - { - maxError = fabsf(err); - maxErrorVal = s[j]; - maxErrorVal2 = s2[j]; - maxErrorVal3 = s3[j]; - } - - if( fail ) - { - vlog_error( "\nERROR: %sD%s: %f ulp error at {%a, %a, %a}: *%a vs. %a\n", f->name, sizeNames[k], err, s[j], s2[j], s3[j], ((double*) gOut_Ref)[j], test ); - error = -1; - goto exit; } } - } - } -*/ - if( 0 == (i & 0x0fffffff) ) + */ + if (0 == (i & 0x0fffffff)) { - vlog("." ); + vlog("."); fflush(stdout); } } - if( ! gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "pass" ); + vlog("pass"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input array + // Init input array double *p = (double *)gIn; double *p2 = (double *)gIn2; double *p3 = (double *)gIn3; - for( j = 0; j < bufferSize / sizeof( double ); j++ ) + for (j = 0; j < bufferSize / sizeof(double); j++) { p[j] = DoubleFromUInt32(genrand_int32(d)); p2[j] = DoubleFromUInt32(genrand_int32(d)); p3[j] = DoubleFromUInt32(genrand_int32(d)); } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + bufferSize, gIn2, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, 
gInBuffer3, CL_FALSE, 0, + bufferSize, gIn3, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - size_t vectorSize = sizeof( cl_double ) * sizeValues[j]; - size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; // bufferSize / vectorSize rounded up - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; } + size_t vectorSize = sizeof(cl_double) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2), + &gInBuffer2))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3), + &gInBuffer3))) + { + LogBuildError(programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( k = 0; k < PERF_LOOP_COUNT; k++ ) + for (k = 0; k < PERF_LOOP_COUNT; k++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, 
NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(double)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", + f->name, sizeNames[j]); } - for( ; j < gMaxVectorSizeIndex; j++ ) - vlog( "\t -- " ); + for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); } - if( ! 
gSkipCorrectnessTesting ) - vlog( "\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, maxErrorVal3 ); - vlog( "\n" ); + if (!gSkipCorrectnessTesting) + vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, + maxErrorVal3); + vlog("\n"); exit: // Release - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); clReleaseProgram(programs[k]); @@ -1125,6 +1282,3 @@ exit: return error; } - - - diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp index d7f2ebf6..ca58f2e5 100644 --- a/test_conformance/math_brute_force/main.cpp +++ b/test_conformance/math_brute_force/main.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -27,116 +27,122 @@ #include "harness/parseParameters.h" #include "harness/typeWrappers.h" -#if defined( __APPLE__ ) - #include - #include - #include - #include -#elif defined( __linux__ ) - #include - #include - #include - #include +#if defined(__APPLE__) +#include +#include +#include +#include +#elif defined(__linux__) +#include +#include +#include +#include #endif -#if defined (__linux__) || (defined WIN32 && defined __MINGW32__) +#if defined(__linux__) || (defined WIN32 && defined __MINGW32__) #include #endif #include "harness/testHarness.h" -#define kPageSize 4096 -#define DOUBLE_REQUIRED_FEATURES ( CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM ) +#define kPageSize 4096 +#define DOUBLE_REQUIRED_FEATURES \ + (CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO \ + | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM) -const char **gTestNames = NULL; -unsigned int gTestNameCount = 0; -char appName[ MAXPATHLEN ] = ""; 
-cl_device_id gDevice = NULL; -cl_context gContext = NULL; +const char **gTestNames = NULL; +unsigned int gTestNameCount = 0; +char appName[MAXPATHLEN] = ""; +cl_device_id gDevice = NULL; +cl_context gContext = NULL; cl_command_queue gQueue = NULL; -static int32_t gStartTestNumber; -static int32_t gEndTestNumber; -int gSkipCorrectnessTesting = 0; -int gStopOnError = 0; -static bool gSkipRestOfTests; -#if defined( __APPLE__ ) -int gMeasureTimes = 1; +static int32_t gStartTestNumber; +static int32_t gEndTestNumber; +int gSkipCorrectnessTesting = 0; +int gStopOnError = 0; +static bool gSkipRestOfTests; +#if defined(__APPLE__) +int gMeasureTimes = 1; #else -int gMeasureTimes = 0; +int gMeasureTimes = 0; #endif -int gReportAverageTimes = 0; -int gForceFTZ = 0; -int gWimpyMode = 0; -int gHasDouble = 0; -int gTestFloat = 1; +int gReportAverageTimes = 0; +int gForceFTZ = 0; +int gWimpyMode = 0; +int gHasDouble = 0; +int gTestFloat = 1; // This flag should be 'ON' by default and it can be changed through the command // line arguments. static int gTestFastRelaxed = 1; -/*This flag corresponds to defining if the implementation has Derived Fast Relaxed functions. - The spec does not specify ULP for derived function. The derived functions are composed of base functions which are tested for ULP, thus when this flag is enabled, - Derived functions will not be tested for ULP, as per table 7.1 of OpenCL 2.0 spec. - Since there is no way of quering the device whether it is a derived or non-derived implementation according to OpenCL 2.0 spec then it has to be changed through a command line argument. +/*This flag corresponds to defining if the implementation has Derived Fast + Relaxed functions. The spec does not specify ULP for derived function. The + derived functions are composed of base functions which are tested for ULP, + thus when this flag is enabled, Derived functions will not be tested for ULP, + as per table 7.1 of OpenCL 2.0 spec. 
Since there is no way of quering the + device whether it is a derived or non-derived implementation according to + OpenCL 2.0 spec then it has to be changed through a command line argument. */ -int gFastRelaxedDerived = 1; -int gToggleCorrectlyRoundedDivideSqrt = 0; -int gDeviceILogb0 = 1; -int gDeviceILogbNaN = 1; -int gCheckTininessBeforeRounding = 1; -int gIsInRTZMode = 0; -uint32_t gMaxVectorSizeIndex = VECTOR_SIZE_COUNT; -uint32_t gMinVectorSizeIndex = 0; -const char *method[] = { "Best", "Average" }; -void *gIn = NULL; -void *gIn2 = NULL; -void *gIn3 = NULL; -void *gOut_Ref = NULL; -void *gOut[VECTOR_SIZE_COUNT] = {NULL, NULL, NULL, NULL, NULL, NULL }; -void *gOut_Ref2 = NULL; -void *gOut2[VECTOR_SIZE_COUNT] = {NULL, NULL, NULL, NULL, NULL, NULL }; -cl_mem gInBuffer = NULL; -cl_mem gInBuffer2 = NULL; -cl_mem gInBuffer3 = NULL; -cl_mem gOutBuffer[VECTOR_SIZE_COUNT]= {NULL, NULL, NULL, NULL, NULL, NULL }; -cl_mem gOutBuffer2[VECTOR_SIZE_COUNT]= {NULL, NULL, NULL, NULL, NULL, NULL }; -uint32_t gComputeDevices = 0; -uint32_t gSimdSize = 1; -uint32_t gDeviceFrequency = 0; -static MTdata gMTdata; +int gFastRelaxedDerived = 1; +int gToggleCorrectlyRoundedDivideSqrt = 0; +int gDeviceILogb0 = 1; +int gDeviceILogbNaN = 1; +int gCheckTininessBeforeRounding = 1; +int gIsInRTZMode = 0; +uint32_t gMaxVectorSizeIndex = VECTOR_SIZE_COUNT; +uint32_t gMinVectorSizeIndex = 0; +const char *method[] = { "Best", "Average" }; +void *gIn = NULL; +void *gIn2 = NULL; +void *gIn3 = NULL; +void *gOut_Ref = NULL; +void *gOut[VECTOR_SIZE_COUNT] = { NULL, NULL, NULL, NULL, NULL, NULL }; +void *gOut_Ref2 = NULL; +void *gOut2[VECTOR_SIZE_COUNT] = { NULL, NULL, NULL, NULL, NULL, NULL }; +cl_mem gInBuffer = NULL; +cl_mem gInBuffer2 = NULL; +cl_mem gInBuffer3 = NULL; +cl_mem gOutBuffer[VECTOR_SIZE_COUNT] = { NULL, NULL, NULL, NULL, NULL, NULL }; +cl_mem gOutBuffer2[VECTOR_SIZE_COUNT] = { NULL, NULL, NULL, NULL, NULL, NULL }; +uint32_t gComputeDevices = 0; +uint32_t gSimdSize = 1; +uint32_t 
gDeviceFrequency = 0; +static MTdata gMTdata; cl_device_fp_config gFloatCapabilities = 0; cl_device_fp_config gDoubleCapabilities = 0; -int gWimpyReductionFactor = 32; -int gWimpyBufferSize = BUFFER_SIZE; -int gVerboseBruteForce = 0; +int gWimpyReductionFactor = 32; +int gWimpyBufferSize = BUFFER_SIZE; +int gVerboseBruteForce = 0; -static int ParseArgs( int argc, const char **argv ); -static void PrintUsage( void ); -static void PrintFunctions( void ); -test_status InitCL( cl_device_id device ); -static void ReleaseCL( void ); -static int InitILogbConstants( void ); -static int IsTininessDetectedBeforeRounding( void ); -static int IsInRTZMode( void ); //expensive. Please check gIsInRTZMode global instead. +static int ParseArgs(int argc, const char **argv); +static void PrintUsage(void); +static void PrintFunctions(void); +test_status InitCL(cl_device_id device); +static void ReleaseCL(void); +static int InitILogbConstants(void); +static int IsTininessDetectedBeforeRounding(void); +static int +IsInRTZMode(void); // expensive. Please check gIsInRTZMode global instead. 
-int doTest( const char* name ) +int doTest(const char *name) { - if( gSkipRestOfTests ) + if (gSkipRestOfTests) { - vlog( "Skipping function because of an earlier error.\n" ); + vlog("Skipping function because of an earlier error.\n"); return 1; } int error = 0; - const Func* func_data = NULL; + const Func *func_data = NULL; - for( size_t i = 0; i < functionListCount; i++ ) + for (size_t i = 0; i < functionListCount; i++) { - const Func* const temp_func = functionList + i; - if( strcmp( temp_func->name, name ) == 0 ) + const Func *const temp_func = functionList + i; + if (strcmp(temp_func->name, name) == 0) { - if( i < gStartTestNumber || i > gEndTestNumber ) + if (i < gStartTestNumber || i > gEndTestNumber) { - vlog( "Skipping function #%d\n", i ); + vlog("Skipping function #%d\n", i); return 0; } @@ -145,32 +151,35 @@ int doTest( const char* name ) } } - if( func_data == NULL ) + if (func_data == NULL) { - vlog( "Function '%s' doesn't exist!\n", name ); - exit( EXIT_FAILURE ); + vlog("Function '%s' doesn't exist!\n", name); + exit(EXIT_FAILURE); } - if( func_data->func.p == NULL ) + if (func_data->func.p == NULL) { - vlog( "'%s' is missing implementation, skipping function.\n", func_data->name ); + vlog("'%s' is missing implementation, skipping function.\n", + func_data->name); return 0; } // if correctly rounded divide & sqrt are supported by the implementation // then test it; otherwise skip the test - if( strcmp( func_data->name, "sqrt_cr" ) == 0 || strcmp( func_data->name, "divide_cr" ) == 0 ) + if (strcmp(func_data->name, "sqrt_cr") == 0 + || strcmp(func_data->name, "divide_cr") == 0) { - if( ( gFloatCapabilities & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT ) == 0 ) + if ((gFloatCapabilities & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT) == 0) { - vlog( "Correctly rounded divide and sqrt are not supported, skipping function.\n" ); + vlog("Correctly rounded divide and sqrt are not supported, " + "skipping function.\n"); return 0; } } { extern int my_ilogb(double); - if( 0 
== strcmp( "ilogb", func_data->name ) ) + if (0 == strcmp("ilogb", func_data->name)) { InitILogbConstants(); } @@ -201,17 +210,17 @@ int doTest( const char* name ) } } - if( gTestFloat ) + if (gTestFloat) { gTestCount++; - vlog( "%3d: ", gTestCount ); + vlog("%3d: ", gTestCount); // Don't test with relaxed requirements. if (func_data->vtbl_ptr->TestFunc(func_data, gMTdata, false /* relaxed mode */)) { gFailCount++; error++; - if( gStopOnError ) + if (gStopOnError) { gSkipRestOfTests = true; return error; @@ -219,17 +228,18 @@ int doTest( const char* name ) } } - if( gHasDouble && NULL != func_data->vtbl_ptr->DoubleTestFunc && NULL != func_data->dfunc.p ) + if (gHasDouble && NULL != func_data->vtbl_ptr->DoubleTestFunc + && NULL != func_data->dfunc.p) { gTestCount++; - vlog( "%3d: ", gTestCount ); + vlog("%3d: ", gTestCount); // Don't test with relaxed requirements. if (func_data->vtbl_ptr->DoubleTestFunc(func_data, gMTdata, false /* relaxed mode*/)) { gFailCount++; error++; - if( gStopOnError ) + if (gStopOnError) { gSkipRestOfTests = true; return error; @@ -241,515 +251,549 @@ int doTest( const char* name ) return error; } -int test_acos( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_acos(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "acos" ); + return doTest("acos"); } -int test_acosh( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_acosh(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "acosh" ); + return doTest("acosh"); } -int test_acospi( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_acospi(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "acospi" ); + return doTest("acospi"); } -int test_asin( cl_device_id deviceID, cl_context context, 
cl_command_queue queue, int num_elements ) +int test_asin(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "asin" ); + return doTest("asin"); } -int test_asinh( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_asinh(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "asinh" ); + return doTest("asinh"); } -int test_asinpi( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_asinpi(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "asinpi" ); + return doTest("asinpi"); } -int test_atan( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_atan(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "atan" ); + return doTest("atan"); } -int test_atanh( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_atanh(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "atanh" ); + return doTest("atanh"); } -int test_atanpi( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_atanpi(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "atanpi" ); + return doTest("atanpi"); } -int test_atan2( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_atan2(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "atan2" ); + return doTest("atan2"); } -int test_atan2pi( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_atan2pi(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) 
{ - return doTest( "atan2pi" ); + return doTest("atan2pi"); } -int test_cbrt( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_cbrt(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "cbrt" ); + return doTest("cbrt"); } -int test_ceil( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_ceil(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "ceil" ); + return doTest("ceil"); } -int test_copysign( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_copysign(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "copysign" ); + return doTest("copysign"); } -int test_cos( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_cos(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "cos" ); + return doTest("cos"); } -int test_cosh( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_cosh(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "cosh" ); + return doTest("cosh"); } -int test_cospi( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_cospi(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "cospi" ); + return doTest("cospi"); } -int test_exp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_exp(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "exp" ); + return doTest("exp"); } -int test_exp2( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) 
+int test_exp2(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "exp2" ); + return doTest("exp2"); } -int test_exp10( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_exp10(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "exp10" ); + return doTest("exp10"); } -int test_expm1( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_expm1(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "expm1" ); + return doTest("expm1"); } -int test_fabs( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_fabs(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "fabs" ); + return doTest("fabs"); } -int test_fdim( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_fdim(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "fdim" ); + return doTest("fdim"); } -int test_floor( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_floor(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "floor" ); + return doTest("floor"); } -int test_fma( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_fma(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "fma" ); + return doTest("fma"); } -int test_fmax( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_fmax(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "fmax" ); + return doTest("fmax"); } -int 
test_fmin( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_fmin(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "fmin" ); + return doTest("fmin"); } -int test_fmod( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_fmod(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "fmod" ); + return doTest("fmod"); } -int test_fract( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_fract(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "fract" ); + return doTest("fract"); } -int test_frexp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_frexp(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "frexp" ); + return doTest("frexp"); } -int test_hypot( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_hypot(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "hypot" ); + return doTest("hypot"); } -int test_ilogb( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_ilogb(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "ilogb" ); + return doTest("ilogb"); } -int test_isequal( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_isequal(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "isequal" ); + return doTest("isequal"); } -int test_isfinite( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_isfinite(cl_device_id deviceID, 
cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "isfinite" ); + return doTest("isfinite"); } -int test_isgreater( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_isgreater(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "isgreater" ); + return doTest("isgreater"); } -int test_isgreaterequal( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_isgreaterequal(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "isgreaterequal" ); + return doTest("isgreaterequal"); } -int test_isinf( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_isinf(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "isinf" ); + return doTest("isinf"); } -int test_isless( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_isless(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "isless" ); + return doTest("isless"); } -int test_islessequal( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_islessequal(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "islessequal" ); + return doTest("islessequal"); } -int test_islessgreater( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_islessgreater(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "islessgreater" ); + return doTest("islessgreater"); } -int test_isnan( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_isnan(cl_device_id deviceID, cl_context context, + 
cl_command_queue queue, int num_elements) { - return doTest( "isnan" ); + return doTest("isnan"); } -int test_isnormal( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_isnormal(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "isnormal" ); + return doTest("isnormal"); } -int test_isnotequal( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_isnotequal(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "isnotequal" ); + return doTest("isnotequal"); } -int test_isordered( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_isordered(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "isordered" ); + return doTest("isordered"); } -int test_isunordered( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_isunordered(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "isunordered" ); + return doTest("isunordered"); } -int test_ldexp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_ldexp(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "ldexp" ); + return doTest("ldexp"); } -int test_lgamma( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_lgamma(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "lgamma" ); + return doTest("lgamma"); } -int test_lgamma_r( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_lgamma_r(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( 
"lgamma_r" ); + return doTest("lgamma_r"); } -int test_log( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_log(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "log" ); + return doTest("log"); } -int test_log2( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_log2(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "log2" ); + return doTest("log2"); } -int test_log10( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_log10(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "log10" ); + return doTest("log10"); } -int test_log1p( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_log1p(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "log1p" ); + return doTest("log1p"); } -int test_logb( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_logb(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "logb" ); + return doTest("logb"); } -int test_mad( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_mad(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "mad" ); + return doTest("mad"); } -int test_maxmag( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_maxmag(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "maxmag" ); + return doTest("maxmag"); } -int test_minmag( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int 
test_minmag(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "minmag" ); + return doTest("minmag"); } -int test_modf( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_modf(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "modf" ); + return doTest("modf"); } -int test_nan( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_nan(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "nan" ); + return doTest("nan"); } -int test_nextafter( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_nextafter(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "nextafter" ); + return doTest("nextafter"); } -int test_pow( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_pow(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "pow" ); + return doTest("pow"); } -int test_pown( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_pown(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "pown" ); + return doTest("pown"); } -int test_powr( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_powr(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "powr" ); + return doTest("powr"); } -int test_remainder( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_remainder(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "remainder" ); + return 
doTest("remainder"); } -int test_remquo( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_remquo(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "remquo" ); + return doTest("remquo"); } -int test_rint( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_rint(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "rint" ); + return doTest("rint"); } -int test_rootn( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_rootn(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "rootn" ); + return doTest("rootn"); } -int test_round( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_round(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "round" ); + return doTest("round"); } -int test_rsqrt( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_rsqrt(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "rsqrt" ); + return doTest("rsqrt"); } -int test_signbit( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_signbit(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "signbit" ); + return doTest("signbit"); } -int test_sin( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_sin(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "sin" ); + return doTest("sin"); } -int test_sincos( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int 
test_sincos(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "sincos" ); + return doTest("sincos"); } -int test_sinh( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_sinh(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "sinh" ); + return doTest("sinh"); } -int test_sinpi( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_sinpi(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "sinpi" ); + return doTest("sinpi"); } -int test_sqrt( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_sqrt(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "sqrt" ); + return doTest("sqrt"); } -int test_sqrt_cr( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_sqrt_cr(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "sqrt_cr" ); + return doTest("sqrt_cr"); } -int test_tan( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_tan(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "tan" ); + return doTest("tan"); } -int test_tanh( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_tanh(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "tanh" ); + return doTest("tanh"); } -int test_tanpi( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_tanpi(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "tanpi" ); + return doTest("tanpi"); } -int 
test_trunc( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_trunc(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "trunc" ); + return doTest("trunc"); } -int test_half_cos( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_half_cos(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "half_cos" ); + return doTest("half_cos"); } -int test_half_divide( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_half_divide(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "half_divide" ); + return doTest("half_divide"); } -int test_half_exp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_half_exp(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "half_exp" ); + return doTest("half_exp"); } -int test_half_exp2( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_half_exp2(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "half_exp2" ); + return doTest("half_exp2"); } -int test_half_exp10( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_half_exp10(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "half_exp10" ); + return doTest("half_exp10"); } -int test_half_log( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_half_log(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "half_log" ); + return doTest("half_log"); } -int test_half_log2( cl_device_id deviceID, cl_context 
context, cl_command_queue queue, int num_elements ) +int test_half_log2(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "half_log2" ); + return doTest("half_log2"); } -int test_half_log10( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_half_log10(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "half_log10" ); + return doTest("half_log10"); } -int test_half_powr( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_half_powr(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "half_powr" ); + return doTest("half_powr"); } -int test_half_recip( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_half_recip(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "half_recip" ); + return doTest("half_recip"); } -int test_half_rsqrt( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_half_rsqrt(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "half_rsqrt" ); + return doTest("half_rsqrt"); } -int test_half_sin( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_half_sin(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "half_sin" ); + return doTest("half_sin"); } -int test_half_sqrt( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_half_sqrt(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "half_sqrt" ); + return doTest("half_sqrt"); } -int test_half_tan( cl_device_id deviceID, cl_context context, cl_command_queue queue, 
int num_elements ) +int test_half_tan(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "half_tan" ); + return doTest("half_tan"); } -int test_add( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_add(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - return doTest( "add" ); + return doTest("add"); } -int test_subtract( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_subtract(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "subtract" ); + return doTest("subtract"); } -int test_divide( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_divide(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "divide" ); + return doTest("divide"); } -int test_divide_cr( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_divide_cr(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "divide_cr" ); + return doTest("divide_cr"); } -int test_multiply( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_multiply(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "multiply" ); + return doTest("multiply"); } -int test_assignment( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_assignment(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return doTest( "assignment" ); + return doTest("assignment"); } -int test_not( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_not(cl_device_id deviceID, cl_context context, 
cl_command_queue queue, + int num_elements) { - return doTest( "not" ); + return doTest("not"); } test_definition test_list[] = { - ADD_TEST( acos ), - ADD_TEST( acosh ), - ADD_TEST( acospi ), - ADD_TEST( asin ), - ADD_TEST( asinh ), - ADD_TEST( asinpi ), - ADD_TEST( atan ), - ADD_TEST( atanh ), - ADD_TEST( atanpi ), - ADD_TEST( atan2 ), - ADD_TEST( atan2pi ), - ADD_TEST( cbrt ), - ADD_TEST( ceil ), - ADD_TEST( copysign ), - ADD_TEST( cos ), - ADD_TEST( cosh ), - ADD_TEST( cospi ), - ADD_TEST( exp ), - ADD_TEST( exp2 ), - ADD_TEST( exp10 ), - ADD_TEST( expm1 ), - ADD_TEST( fabs ), - ADD_TEST( fdim ), - ADD_TEST( floor ), - ADD_TEST( fma ), - ADD_TEST( fmax ), - ADD_TEST( fmin ), - ADD_TEST( fmod ), - ADD_TEST( fract ), - ADD_TEST( frexp ), - ADD_TEST( hypot ), - ADD_TEST( ilogb ), - ADD_TEST( isequal ), - ADD_TEST( isfinite ), - ADD_TEST( isgreater ), - ADD_TEST( isgreaterequal ), - ADD_TEST( isinf ), - ADD_TEST( isless ), - ADD_TEST( islessequal ), - ADD_TEST( islessgreater ), - ADD_TEST( isnan ), - ADD_TEST( isnormal ), - ADD_TEST( isnotequal ), - ADD_TEST( isordered ), - ADD_TEST( isunordered ), - ADD_TEST( ldexp ), - ADD_TEST( lgamma ), - ADD_TEST( lgamma_r ), - ADD_TEST( log ), - ADD_TEST( log2 ), - ADD_TEST( log10 ), - ADD_TEST( log1p ), - ADD_TEST( logb ), - ADD_TEST( mad ), - ADD_TEST( maxmag ), - ADD_TEST( minmag ), - ADD_TEST( modf ), - ADD_TEST( nan ), - ADD_TEST( nextafter ), - ADD_TEST( pow ), - ADD_TEST( pown ), - ADD_TEST( powr ), - ADD_TEST( remainder ), - ADD_TEST( remquo ), - ADD_TEST( rint ), - ADD_TEST( rootn ), - ADD_TEST( round ), - ADD_TEST( rsqrt ), - ADD_TEST( signbit ), - ADD_TEST( sin ), - ADD_TEST( sincos ), - ADD_TEST( sinh ), - ADD_TEST( sinpi ), - ADD_TEST( sqrt ), - ADD_TEST( sqrt_cr ), - ADD_TEST( tan ), - ADD_TEST( tanh ), - ADD_TEST( tanpi ), - ADD_TEST( trunc ), - ADD_TEST( half_cos ), - ADD_TEST( half_divide ), - ADD_TEST( half_exp ), - ADD_TEST( half_exp2 ), - ADD_TEST( half_exp10 ), - ADD_TEST( half_log ), - ADD_TEST( 
half_log2 ), - ADD_TEST( half_log10 ), - ADD_TEST( half_powr ), - ADD_TEST( half_recip ), - ADD_TEST( half_rsqrt ), - ADD_TEST( half_sin ), - ADD_TEST( half_sqrt ), - ADD_TEST( half_tan ), - ADD_TEST( add ), - ADD_TEST( subtract ), - ADD_TEST( divide ), - ADD_TEST( divide_cr ), - ADD_TEST( multiply ), - ADD_TEST( assignment ), - ADD_TEST( not ), + ADD_TEST(acos), ADD_TEST(acosh), ADD_TEST(acospi), + ADD_TEST(asin), ADD_TEST(asinh), ADD_TEST(asinpi), + ADD_TEST(atan), ADD_TEST(atanh), ADD_TEST(atanpi), + ADD_TEST(atan2), ADD_TEST(atan2pi), ADD_TEST(cbrt), + ADD_TEST(ceil), ADD_TEST(copysign), ADD_TEST(cos), + ADD_TEST(cosh), ADD_TEST(cospi), ADD_TEST(exp), + ADD_TEST(exp2), ADD_TEST(exp10), ADD_TEST(expm1), + ADD_TEST(fabs), ADD_TEST(fdim), ADD_TEST(floor), + ADD_TEST(fma), ADD_TEST(fmax), ADD_TEST(fmin), + ADD_TEST(fmod), ADD_TEST(fract), ADD_TEST(frexp), + ADD_TEST(hypot), ADD_TEST(ilogb), ADD_TEST(isequal), + ADD_TEST(isfinite), ADD_TEST(isgreater), ADD_TEST(isgreaterequal), + ADD_TEST(isinf), ADD_TEST(isless), ADD_TEST(islessequal), + ADD_TEST(islessgreater), ADD_TEST(isnan), ADD_TEST(isnormal), + ADD_TEST(isnotequal), ADD_TEST(isordered), ADD_TEST(isunordered), + ADD_TEST(ldexp), ADD_TEST(lgamma), ADD_TEST(lgamma_r), + ADD_TEST(log), ADD_TEST(log2), ADD_TEST(log10), + ADD_TEST(log1p), ADD_TEST(logb), ADD_TEST(mad), + ADD_TEST(maxmag), ADD_TEST(minmag), ADD_TEST(modf), + ADD_TEST(nan), ADD_TEST(nextafter), ADD_TEST(pow), + ADD_TEST(pown), ADD_TEST(powr), ADD_TEST(remainder), + ADD_TEST(remquo), ADD_TEST(rint), ADD_TEST(rootn), + ADD_TEST(round), ADD_TEST(rsqrt), ADD_TEST(signbit), + ADD_TEST(sin), ADD_TEST(sincos), ADD_TEST(sinh), + ADD_TEST(sinpi), ADD_TEST(sqrt), ADD_TEST(sqrt_cr), + ADD_TEST(tan), ADD_TEST(tanh), ADD_TEST(tanpi), + ADD_TEST(trunc), ADD_TEST(half_cos), ADD_TEST(half_divide), + ADD_TEST(half_exp), ADD_TEST(half_exp2), ADD_TEST(half_exp10), + ADD_TEST(half_log), ADD_TEST(half_log2), ADD_TEST(half_log10), + ADD_TEST(half_powr), 
ADD_TEST(half_recip), ADD_TEST(half_rsqrt), + ADD_TEST(half_sin), ADD_TEST(half_sqrt), ADD_TEST(half_tan), + ADD_TEST(add), ADD_TEST(subtract), ADD_TEST(divide), + ADD_TEST(divide_cr), ADD_TEST(multiply), ADD_TEST(assignment), + ADD_TEST(not), }; -const int test_num = ARRAY_SIZE( test_list ); +const int test_num = ARRAY_SIZE(test_list); #pragma mark - -int main (int argc, const char * argv[]) +int main(int argc, const char *argv[]) { int error; @@ -759,60 +803,59 @@ int main (int argc, const char * argv[]) return -1; } -#if defined( __APPLE__ ) +#if defined(__APPLE__) struct timeval startTime; - gettimeofday( &startTime, NULL ); + gettimeofday(&startTime, NULL); #endif - error = ParseArgs( argc, argv ); - if( error ) - return error; + error = ParseArgs(argc, argv); + if (error) return error; // This takes a while, so prevent the machine from going to sleep. PreventSleep(); - atexit( ResumeSleep ); + atexit(ResumeSleep); - if( gSkipCorrectnessTesting ) - vlog( "*** Skipping correctness testing! ***\n\n" ); - else if( gStopOnError ) - vlog( "Stopping at first error.\n" ); + if (gSkipCorrectnessTesting) + vlog("*** Skipping correctness testing! 
***\n\n"); + else if (gStopOnError) + vlog("Stopping at first error.\n"); - if( gMeasureTimes ) + if (gMeasureTimes) { - vlog( "%s times are reported at right (cycles per element):\n", method[gReportAverageTimes] ); - vlog( "\n" ); - if( gSkipCorrectnessTesting ) - vlog( " \t "); + vlog("%s times are reported at right (cycles per element):\n", + method[gReportAverageTimes]); + vlog("\n"); + if (gSkipCorrectnessTesting) + vlog(" \t "); else - vlog( " \t "); - if( gWimpyMode ) - vlog( " " ); - for( int i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) - vlog( "\t float%s", sizeNames[i] ); + vlog(" \t "); + if (gWimpyMode) vlog(" "); + for (int i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + vlog("\t float%s", sizeNames[i]); } else { - vlog( " \t "); - if( gWimpyMode ) - vlog( " " ); + vlog(" \t "); + if (gWimpyMode) vlog(" "); } - if( ! gSkipCorrectnessTesting ) - vlog( "\t max_ulps" ); + if (!gSkipCorrectnessTesting) vlog("\t max_ulps"); - vlog( "\n-----------------------------------------------------------------------------------------------------------\n" ); + vlog("\n-------------------------------------------------------------------" + "----------------------------------------\n"); - gMTdata = init_genrand( gRandomSeed ); - if( gEndTestNumber == 0 ) + gMTdata = init_genrand(gRandomSeed); + if (gEndTestNumber == 0) { gEndTestNumber = functionListCount; } FPU_mode_type oldMode; - DisableFTZ( &oldMode ); + DisableFTZ(&oldMode); - int ret = runTestHarnessWithCheck( gTestNameCount, gTestNames, test_num, test_list, true, 0, InitCL ); + int ret = runTestHarnessWithCheck(gTestNameCount, gTestNames, test_num, + test_list, true, 0, InitCL); - RestoreFPState( &oldMode ); + RestoreFPState(&oldMode); free_mtdata(gMTdata); free(gTestNames); @@ -825,24 +868,24 @@ int main (int argc, const char * argv[]) ReleaseCL(); -#if defined( __APPLE__ ) +#if defined(__APPLE__) struct timeval endTime; - gettimeofday( &endTime, NULL ); - double time = (double) endTime.tv_sec 
- (double) startTime.tv_sec; - time += 1e-6 * ((double) endTime.tv_usec - (double) startTime.tv_usec); - vlog( "time: %f s\n", time ); + gettimeofday(&endTime, NULL); + double time = (double)endTime.tv_sec - (double)startTime.tv_sec; + time += 1e-6 * ((double)endTime.tv_usec - (double)startTime.tv_usec); + vlog("time: %f s\n", time); #endif return ret; } -static int ParseArgs( int argc, const char **argv ) +static int ParseArgs(int argc, const char **argv) { int i; - gTestNames = (const char**) calloc( argc - 1, sizeof( char*) ); - if( NULL == gTestNames ) + gTestNames = (const char **)calloc(argc - 1, sizeof(char *)); + if (NULL == gTestNames) { - vlog( "Failed to allocate memory for gTestNames array.\n" ); + vlog("Failed to allocate memory for gTestNames array.\n"); return 1; } gTestNames[0] = argv[0]; @@ -850,91 +893,64 @@ static int ParseArgs( int argc, const char **argv ) int singleThreaded = 0; { // Extract the app name - strncpy( appName, argv[0], MAXPATHLEN ); + strncpy(appName, argv[0], MAXPATHLEN); -#if defined( __APPLE__ ) +#if defined(__APPLE__) char baseName[MAXPATHLEN]; char *base = NULL; - strncpy( baseName, argv[0], MAXPATHLEN ); - base = basename( baseName ); - if( NULL != base ) + strncpy(baseName, argv[0], MAXPATHLEN); + base = basename(baseName); + if (NULL != base) { - strncpy( appName, base, sizeof( appName ) ); - appName[ sizeof( appName ) -1 ] = '\0'; + strncpy(appName, base, sizeof(appName)); + appName[sizeof(appName) - 1] = '\0'; } #endif } - vlog( "\n%s\t", appName ); - for( i = 1; i < argc; i++ ) + vlog("\n%s\t", appName); + for (i = 1; i < argc; i++) { const char *arg = argv[i]; - if( NULL == arg ) - break; + if (NULL == arg) break; - vlog( "\t%s", arg ); + vlog("\t%s", arg); int optionFound = 0; - if( arg[0] == '-' ) + if (arg[0] == '-') { - while( arg[1] != '\0' ) + while (arg[1] != '\0') { arg++; optionFound = 1; - switch( *arg ) + switch (*arg) { - case 'a': - gReportAverageTimes ^= 1; - break; + case 'a': gReportAverageTimes ^= 1; 
break; - case 'c': - gToggleCorrectlyRoundedDivideSqrt ^= 1; - break; + case 'c': gToggleCorrectlyRoundedDivideSqrt ^= 1; break; - case 'd': - gHasDouble ^= 1; - break; + case 'd': gHasDouble ^= 1; break; - case 'e': - gFastRelaxedDerived ^= 1; - break; + case 'e': gFastRelaxedDerived ^= 1; break; - case 'f': - gTestFloat ^= 1; - break; + case 'f': gTestFloat ^= 1; break; - case 'h': - PrintUsage(); - return -1; + case 'h': PrintUsage(); return -1; - case 'p': - PrintFunctions(); - return -1; + case 'p': PrintFunctions(); return -1; - case 'l': - gSkipCorrectnessTesting ^= 1; - break; + case 'l': gSkipCorrectnessTesting ^= 1; break; - case 'm': - singleThreaded ^= 1; - break; + case 'm': singleThreaded ^= 1; break; - case 'r': - gTestFastRelaxed ^= 1; - break; + case 'r': gTestFastRelaxed ^= 1; break; - case 's': - gStopOnError ^= 1; - break; + case 's': gStopOnError ^= 1; break; - case 't': - gMeasureTimes ^= 1; - break; + case 't': gMeasureTimes ^= 1; break; - case 'v': - gVerboseBruteForce ^= 1; - break; + case 'v': gVerboseBruteForce ^= 1; break; - case 'w': // wimpy mode + case 'w': // wimpy mode gWimpyMode ^= 1; break; @@ -942,12 +958,10 @@ static int ParseArgs( int argc, const char **argv ) parseWimpyReductionFactor(arg, gWimpyReductionFactor); break; - case 'z': - gForceFTZ ^= 1; - break; + case 'z': gForceFTZ ^= 1; break; case '1': - if( arg[1] == '6' ) + if (arg[1] == '6') { gMinVectorSizeIndex = 5; gMaxVectorSizeIndex = gMinVectorSizeIndex + 1; @@ -960,52 +974,52 @@ static int ParseArgs( int argc, const char **argv ) } break; case '2': - gMinVectorSizeIndex = 1; - gMaxVectorSizeIndex = gMinVectorSizeIndex + 1; - break; + gMinVectorSizeIndex = 1; + gMaxVectorSizeIndex = gMinVectorSizeIndex + 1; + break; case '3': - gMinVectorSizeIndex = 2; - gMaxVectorSizeIndex = gMinVectorSizeIndex + 1; - break; + gMinVectorSizeIndex = 2; + gMaxVectorSizeIndex = gMinVectorSizeIndex + 1; + break; case '4': - gMinVectorSizeIndex = 3; - gMaxVectorSizeIndex = 
gMinVectorSizeIndex + 1; - break; + gMinVectorSizeIndex = 3; + gMaxVectorSizeIndex = gMinVectorSizeIndex + 1; + break; case '8': - gMinVectorSizeIndex = 4; - gMaxVectorSizeIndex = gMinVectorSizeIndex + 1; - break; + gMinVectorSizeIndex = 4; + gMaxVectorSizeIndex = gMinVectorSizeIndex + 1; + break; break; default: - vlog( " <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg ); + vlog(" <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg); PrintUsage(); return -1; } } } - if( ! optionFound ) + if (!optionFound) { char *t = NULL; - long number = strtol( arg, &t, 0 ); - if( t != arg ) + long number = strtol(arg, &t, 0); + if (t != arg) { - if( 0 == gStartTestNumber ) - gStartTestNumber = (int32_t) number; + if (0 == gStartTestNumber) + gStartTestNumber = (int32_t)number; else - gEndTestNumber = gStartTestNumber + (int32_t) number; + gEndTestNumber = gStartTestNumber + (int32_t)number; } else { // Make sure this is a valid name unsigned int k; - for (k=0; kname) == 0) { - gTestNames[ gTestNameCount ] = arg; + gTestNames[gTestNameCount] = arg; gTestNameCount++; break; } @@ -1021,118 +1035,141 @@ static int ParseArgs( int argc, const char **argv ) } // Check for the wimpy mode environment variable - if (getenv("CL_WIMPY_MODE")) { - vlog( "\n" ); - vlog( "*** Detected CL_WIMPY_MODE env ***\n" ); - gWimpyMode = 1; + if (getenv("CL_WIMPY_MODE")) + { + vlog("\n"); + vlog("*** Detected CL_WIMPY_MODE env ***\n"); + gWimpyMode = 1; } - vlog( "\nTest binary built %s %s\n", __DATE__, __TIME__ ); + vlog("\nTest binary built %s %s\n", __DATE__, __TIME__); PrintArch(); - if( gWimpyMode ) + if (gWimpyMode) { - vlog( "\n" ); - vlog( "*** WARNING: Testing in Wimpy mode! ***\n" ); - vlog( "*** Wimpy mode is not sufficient to verify correctness. ***\n" ); - vlog( "*** Wimpy Reduction Factor: %-27u ***\n\n", gWimpyReductionFactor ); + vlog("\n"); + vlog("*** WARNING: Testing in Wimpy mode! ***\n"); + vlog("*** Wimpy mode is not sufficient to verify correctness. 
***\n"); + vlog("*** Wimpy Reduction Factor: %-27u ***\n\n", + gWimpyReductionFactor); } - if( singleThreaded ) - SetThreadCount(1); + if (singleThreaded) SetThreadCount(1); return 0; } -static void PrintFunctions ( void ) +static void PrintFunctions(void) { - vlog( "\nMath function names:\n" ); - for( int i = 0; i < functionListCount; i++ ) - { - vlog( "\t%s\n", functionList[ i ].name ); - } + vlog("\nMath function names:\n"); + for (int i = 0; i < functionListCount; i++) + { + vlog("\t%s\n", functionList[i].name); + } } -static void PrintUsage( void ) +static void PrintUsage(void) { - vlog( "%s [-acglstz]: \n", appName ); - vlog( "\toptions:\n" ); - vlog( "\t\t-a\tReport average times instead of best times\n" ); - vlog( "\t\t-c\tToggle test fp correctly rounded divide and sqrt (Default: off)\n"); - vlog( "\t\t-d\tToggle double precision testing. (Default: on iff khr_fp_64 on)\n" ); - vlog( "\t\t-f\tToggle float precision testing. (Default: on)\n" ); - vlog( "\t\t-r\tToggle fast relaxed math precision testing. (Default: on)\n" ); - vlog( "\t\t-e\tToggle test as derived implementations for fast relaxed math precision. (Default: on)\n" ); - vlog( "\t\t-h\tPrint this message and quit\n" ); - vlog( "\t\t-p\tPrint all math function names and quit\n" ); - vlog( "\t\t-l\tlink check only (make sure functions are present, skip accuracy checks.)\n" ); - vlog( "\t\t-m\tToggle run multi-threaded. (Default: on) )\n" ); - vlog( "\t\t-s\tStop on error\n" ); - vlog( "\t\t-t\tToggle timing (on by default)\n" ); - vlog( "\t\t-w\tToggle Wimpy Mode, * Not a valid test * \n"); - vlog( "\t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is 1-10, default factor(%u)\n",gWimpyReductionFactor ); - vlog( "\t\t-z\tToggle FTZ mode (Section 6.5.3) for all functions. (Set by device capabilities by default.)\n" ); - vlog( "\t\t-v\tToggle Verbosity (Default: off)\n "); - vlog( "\t\t-#\tTest only vector sizes #, e.g. 
\"-1\" tests scalar only, \"-16\" tests 16-wide vectors only.\n" ); - vlog( "\n\tYou may also pass a number instead of a function name.\n" ); - vlog( "\tThis causes the first N tests to be skipped. The tests are numbered.\n" ); - vlog( "\tIf you pass a second number, that is the number tests to run after the first one.\n" ); - vlog( "\tA name list may be used in conjunction with a number range. In that case,\n" ); - vlog( "\tonly the named cases in the number range will run.\n" ); - vlog( "\tYou may also choose to pass no arguments, in which case all tests will be run.\n" ); - vlog( "\tYou may pass CL_DEVICE_TYPE_CPU/GPU/ACCELERATOR to select the device.\n" ); - vlog( "\n" ); + vlog("%s [-acglstz]: \n", appName); + vlog("\toptions:\n"); + vlog("\t\t-a\tReport average times instead of best times\n"); + vlog("\t\t-c\tToggle test fp correctly rounded divide and sqrt (Default: " + "off)\n"); + vlog("\t\t-d\tToggle double precision testing. (Default: on iff khr_fp_64 " + "on)\n"); + vlog("\t\t-f\tToggle float precision testing. (Default: on)\n"); + vlog("\t\t-r\tToggle fast relaxed math precision testing. (Default: on)\n"); + vlog("\t\t-e\tToggle test as derived implementations for fast relaxed math " + "precision. (Default: on)\n"); + vlog("\t\t-h\tPrint this message and quit\n"); + vlog("\t\t-p\tPrint all math function names and quit\n"); + vlog("\t\t-l\tlink check only (make sure functions are present, skip " + "accuracy checks.)\n"); + vlog("\t\t-m\tToggle run multi-threaded. (Default: on) )\n"); + vlog("\t\t-s\tStop on error\n"); + vlog("\t\t-t\tToggle timing (on by default)\n"); + vlog("\t\t-w\tToggle Wimpy Mode, * Not a valid test * \n"); + vlog("\t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is " + "1-10, default factor(%u)\n", + gWimpyReductionFactor); + vlog("\t\t-z\tToggle FTZ mode (Section 6.5.3) for all functions. 
(Set by " + "device capabilities by default.)\n"); + vlog("\t\t-v\tToggle Verbosity (Default: off)\n "); + vlog("\t\t-#\tTest only vector sizes #, e.g. \"-1\" tests scalar only, " + "\"-16\" tests 16-wide vectors only.\n"); + vlog("\n\tYou may also pass a number instead of a function name.\n"); + vlog("\tThis causes the first N tests to be skipped. The tests are " + "numbered.\n"); + vlog("\tIf you pass a second number, that is the number tests to run after " + "the first one.\n"); + vlog("\tA name list may be used in conjunction with a number range. In " + "that case,\n"); + vlog("\tonly the named cases in the number range will run.\n"); + vlog("\tYou may also choose to pass no arguments, in which case all tests " + "will be run.\n"); + vlog("\tYou may pass CL_DEVICE_TYPE_CPU/GPU/ACCELERATOR to select the " + "device.\n"); + vlog("\n"); } -static void CL_CALLBACK bruteforce_notify_callback(const char *errinfo, const void *private_info, size_t cb, void *user_data) +static void CL_CALLBACK bruteforce_notify_callback(const char *errinfo, + const void *private_info, + size_t cb, void *user_data) { - vlog( "%s (%p, %zd, %p)\n", errinfo, private_info, cb, user_data ); + vlog("%s (%p, %zd, %p)\n", errinfo, private_info, cb, user_data); } -test_status InitCL( cl_device_id device ) +test_status InitCL(cl_device_id device) { int error; uint32_t i; - size_t configSize = sizeof( gComputeDevices ); + size_t configSize = sizeof(gComputeDevices); cl_device_type device_type; - error = clGetDeviceInfo( device, CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL ); - if( error ) + error = clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(device_type), + &device_type, NULL); + if (error) { - print_error( error, "Unable to get device type" ); + print_error(error, "Unable to get device type"); return TEST_FAIL; } gDevice = device; - if( (error = clGetDeviceInfo( gDevice, CL_DEVICE_MAX_COMPUTE_UNITS, configSize, &gComputeDevices, NULL )) ) + if ((error = clGetDeviceInfo(gDevice, 
CL_DEVICE_MAX_COMPUTE_UNITS, + configSize, &gComputeDevices, NULL))) gComputeDevices = 1; // Check extensions - if(is_extension_available(gDevice, "cl_khr_fp64")) + if (is_extension_available(gDevice, "cl_khr_fp64")) { gHasDouble ^= 1; -#if defined( CL_DEVICE_DOUBLE_FP_CONFIG ) - if( (error = clGetDeviceInfo(gDevice, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(gDoubleCapabilities), &gDoubleCapabilities, NULL))) +#if defined(CL_DEVICE_DOUBLE_FP_CONFIG) + if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_DOUBLE_FP_CONFIG, + sizeof(gDoubleCapabilities), + &gDoubleCapabilities, NULL))) { - vlog_error( "ERROR: Unable to get device CL_DEVICE_DOUBLE_FP_CONFIG. (%d)\n", error ); + vlog_error("ERROR: Unable to get device " + "CL_DEVICE_DOUBLE_FP_CONFIG. (%d)\n", + error); return TEST_FAIL; } - if( DOUBLE_REQUIRED_FEATURES != (gDoubleCapabilities & DOUBLE_REQUIRED_FEATURES) ) + if (DOUBLE_REQUIRED_FEATURES + != (gDoubleCapabilities & DOUBLE_REQUIRED_FEATURES)) { std::string list; if (0 == (gDoubleCapabilities & CL_FP_FMA)) list += "CL_FP_FMA, "; - if( 0 == (gDoubleCapabilities & CL_FP_ROUND_TO_NEAREST) ) + if (0 == (gDoubleCapabilities & CL_FP_ROUND_TO_NEAREST)) list += "CL_FP_ROUND_TO_NEAREST, "; - if( 0 == (gDoubleCapabilities & CL_FP_ROUND_TO_ZERO) ) + if (0 == (gDoubleCapabilities & CL_FP_ROUND_TO_ZERO)) list += "CL_FP_ROUND_TO_ZERO, "; - if( 0 == (gDoubleCapabilities & CL_FP_ROUND_TO_INF) ) + if (0 == (gDoubleCapabilities & CL_FP_ROUND_TO_INF)) list += "CL_FP_ROUND_TO_INF, "; - if( 0 == (gDoubleCapabilities & CL_FP_INF_NAN) ) + if (0 == (gDoubleCapabilities & CL_FP_INF_NAN)) list += "CL_FP_INF_NAN, "; - if( 0 == (gDoubleCapabilities & CL_FP_DENORM) ) + if (0 == (gDoubleCapabilities & CL_FP_DENORM)) list += "CL_FP_DENORM, "; vlog_error("ERROR: required double features are missing: %s\n", list.c_str()); @@ -1140,100 +1177,104 @@ test_status InitCL( cl_device_id device ) return TEST_FAIL; } #else - vlog_error( "FAIL: device says it supports cl_khr_fp64 but CL_DEVICE_DOUBLE_FP_CONFIG 
is not in the headers!\n" ); + vlog_error("FAIL: device says it supports cl_khr_fp64 but " + "CL_DEVICE_DOUBLE_FP_CONFIG is not in the headers!\n"); return TEST_FAIL; #endif } - configSize = sizeof( gDeviceFrequency ); - if( (error = clGetDeviceInfo( gDevice, CL_DEVICE_MAX_CLOCK_FREQUENCY, configSize, &gDeviceFrequency, NULL )) ) + configSize = sizeof(gDeviceFrequency); + if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_MAX_CLOCK_FREQUENCY, + configSize, &gDeviceFrequency, NULL))) gDeviceFrequency = 0; - if( (error = clGetDeviceInfo(gDevice, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(gFloatCapabilities), &gFloatCapabilities, NULL))) + if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_SINGLE_FP_CONFIG, + sizeof(gFloatCapabilities), + &gFloatCapabilities, NULL))) { - vlog_error( "ERROR: Unable to get device CL_DEVICE_SINGLE_FP_CONFIG. (%d)\n", error ); + vlog_error( + "ERROR: Unable to get device CL_DEVICE_SINGLE_FP_CONFIG. (%d)\n", + error); return TEST_FAIL; } - gContext = clCreateContext( NULL, 1, &gDevice, bruteforce_notify_callback, NULL, &error ); - if( NULL == gContext || error ) + gContext = clCreateContext(NULL, 1, &gDevice, bruteforce_notify_callback, + NULL, &error); + if (NULL == gContext || error) { - vlog_error( "clCreateContext failed. (%d) \n", error ); + vlog_error("clCreateContext failed. (%d) \n", error); return TEST_FAIL; } gQueue = clCreateCommandQueue(gContext, gDevice, 0, &error); - if( NULL == gQueue || error ) + if (NULL == gQueue || error) { - vlog_error( "clCreateCommandQueue failed. (%d)\n", error ); + vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); return TEST_FAIL; } -#if defined( __APPLE__ ) +#if defined(__APPLE__) // FIXME: use clProtectedArray #endif - //Allocate buffers + // Allocate buffers cl_uint min_alignment = 0; - error = clGetDeviceInfo (gDevice, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), (void*)&min_alignment, NULL); + error = clGetDeviceInfo(gDevice, CL_DEVICE_MEM_BASE_ADDR_ALIGN, + sizeof(cl_uint), (void *)&min_alignment, NULL); if (CL_SUCCESS != error) { - vlog_error( "clGetDeviceInfo failed. (%d)\n", error ); + vlog_error("clGetDeviceInfo failed. (%d)\n", error); return TEST_FAIL; } - min_alignment >>= 3; // convert bits to bytes + min_alignment >>= 3; // convert bits to bytes - gIn = align_malloc( BUFFER_SIZE, min_alignment ); - if( NULL == gIn ) - return TEST_FAIL; - gIn2 = align_malloc( BUFFER_SIZE, min_alignment ); - if( NULL == gIn2 ) - return TEST_FAIL; - gIn3 = align_malloc( BUFFER_SIZE, min_alignment ); - if( NULL == gIn3 ) - return TEST_FAIL; - gOut_Ref = align_malloc( BUFFER_SIZE, min_alignment ); - if( NULL == gOut_Ref ) - return TEST_FAIL; - gOut_Ref2 = align_malloc( BUFFER_SIZE, min_alignment ); - if( NULL == gOut_Ref2 ) - return TEST_FAIL; + gIn = align_malloc(BUFFER_SIZE, min_alignment); + if (NULL == gIn) return TEST_FAIL; + gIn2 = align_malloc(BUFFER_SIZE, min_alignment); + if (NULL == gIn2) return TEST_FAIL; + gIn3 = align_malloc(BUFFER_SIZE, min_alignment); + if (NULL == gIn3) return TEST_FAIL; + gOut_Ref = align_malloc(BUFFER_SIZE, min_alignment); + if (NULL == gOut_Ref) return TEST_FAIL; + gOut_Ref2 = align_malloc(BUFFER_SIZE, min_alignment); + if (NULL == gOut_Ref2) return TEST_FAIL; - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { - gOut[i] = align_malloc( BUFFER_SIZE, min_alignment ); - if( NULL == gOut[i] ) - return TEST_FAIL; - gOut2[i] = align_malloc( BUFFER_SIZE, min_alignment ); - if( NULL == gOut2[i] ) - return TEST_FAIL; + gOut[i] = align_malloc(BUFFER_SIZE, 
min_alignment); + if (NULL == gOut[i]) return TEST_FAIL; + gOut2[i] = align_malloc(BUFFER_SIZE, min_alignment); + if (NULL == gOut2[i]) return TEST_FAIL; } cl_mem_flags device_flags = CL_MEM_READ_ONLY; // save a copy on the host device to make this go faster - if( CL_DEVICE_TYPE_CPU == device_type ) + if (CL_DEVICE_TYPE_CPU == device_type) device_flags |= CL_MEM_USE_HOST_PTR; - else - device_flags |= CL_MEM_COPY_HOST_PTR; + else + device_flags |= CL_MEM_COPY_HOST_PTR; // setup input buffers - gInBuffer = clCreateBuffer(gContext, device_flags, BUFFER_SIZE, gIn, &error); - if( gInBuffer == NULL || error ) + gInBuffer = + clCreateBuffer(gContext, device_flags, BUFFER_SIZE, gIn, &error); + if (gInBuffer == NULL || error) { - vlog_error( "clCreateBuffer1 failed for input (%d)\n", error ); + vlog_error("clCreateBuffer1 failed for input (%d)\n", error); return TEST_FAIL; } - gInBuffer2 = clCreateBuffer( gContext, device_flags, BUFFER_SIZE, gIn2, &error ); - if( gInBuffer2 == NULL || error ) + gInBuffer2 = + clCreateBuffer(gContext, device_flags, BUFFER_SIZE, gIn2, &error); + if (gInBuffer2 == NULL || error) { - vlog_error( "clCreateArray2 failed for input (%d)\n" , error ); + vlog_error("clCreateArray2 failed for input (%d)\n", error); return TEST_FAIL; } - gInBuffer3 = clCreateBuffer( gContext, device_flags, BUFFER_SIZE, gIn3, &error ); - if( gInBuffer3 == NULL || error) + gInBuffer3 = + clCreateBuffer(gContext, device_flags, BUFFER_SIZE, gIn3, &error); + if (gInBuffer3 == NULL || error) { - vlog_error( "clCreateArray3 failed for input (%d)\n", error ); + vlog_error("clCreateArray3 failed for input (%d)\n", error); return TEST_FAIL; } @@ -1241,38 +1282,40 @@ test_status InitCL( cl_device_id device ) // setup output buffers device_flags = CL_MEM_READ_WRITE; // save a copy on the host device to make this go faster - if( CL_DEVICE_TYPE_CPU == device_type ) + if (CL_DEVICE_TYPE_CPU == device_type) device_flags |= CL_MEM_USE_HOST_PTR; - else - device_flags |= 
CL_MEM_COPY_HOST_PTR; - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + else + device_flags |= CL_MEM_COPY_HOST_PTR; + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { - gOutBuffer[i] = clCreateBuffer( gContext, device_flags, BUFFER_SIZE, gOut[i], &error ); - if( gOutBuffer[i] == NULL || error ) + gOutBuffer[i] = clCreateBuffer(gContext, device_flags, BUFFER_SIZE, + gOut[i], &error); + if (gOutBuffer[i] == NULL || error) { - vlog_error( "clCreateArray failed for output (%d)\n", error ); + vlog_error("clCreateArray failed for output (%d)\n", error); return TEST_FAIL; } - gOutBuffer2[i] = clCreateBuffer( gContext, device_flags, BUFFER_SIZE, gOut2[i], &error ); - if( gOutBuffer2[i] == NULL || error) + gOutBuffer2[i] = clCreateBuffer(gContext, device_flags, BUFFER_SIZE, + gOut2[i], &error); + if (gOutBuffer2[i] == NULL || error) { - vlog_error( "clCreateArray2 failed for output (%d)\n", error ); + vlog_error("clCreateArray2 failed for output (%d)\n", error); return TEST_FAIL; } } // we are embedded, check current rounding mode - if( gIsEmbedded ) + if (gIsEmbedded) { gIsInRTZMode = IsInRTZMode(); } - //Check tininess detection + // Check tininess detection IsTininessDetectedBeforeRounding(); cl_platform_id platform; int err = clGetPlatformIDs(1, &platform, NULL); - if( err ) + if (err) { print_error(err, "clGetPlatformIDs failed"); return TEST_FAIL; @@ -1280,78 +1323,97 @@ test_status InitCL( cl_device_id device ) char c[1024]; static const char *no_yes[] = { "NO", "YES" }; - vlog( "\nCompute Device info:\n" ); + vlog("\nCompute Device info:\n"); clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(c), &c, NULL); - vlog( "\tPlatform Version: %s\n", c ); + vlog("\tPlatform Version: %s\n", c); clGetDeviceInfo(gDevice, CL_DEVICE_NAME, sizeof(c), &c, NULL); - vlog( "\tDevice Name: %s\n", c ); + vlog("\tDevice Name: %s\n", c); clGetDeviceInfo(gDevice, CL_DEVICE_VENDOR, sizeof(c), &c, NULL); - vlog( "\tVendor: %s\n", c ); + vlog("\tVendor: 
%s\n", c); clGetDeviceInfo(gDevice, CL_DEVICE_VERSION, sizeof(c), &c, NULL); - vlog( "\tDevice Version: %s\n", c ); + vlog("\tDevice Version: %s\n", c); clGetDeviceInfo(gDevice, CL_DEVICE_OPENCL_C_VERSION, sizeof(c), &c, NULL); - vlog( "\tCL C Version: %s\n", c ); + vlog("\tCL C Version: %s\n", c); clGetDeviceInfo(gDevice, CL_DRIVER_VERSION, sizeof(c), &c, NULL); - vlog( "\tDriver Version: %s\n", c ); - vlog( "\tDevice Frequency: %d MHz\n", gDeviceFrequency ); - vlog( "\tSubnormal values supported for floats? %s\n", no_yes[0 != (CL_FP_DENORM & gFloatCapabilities)] ); - vlog( "\tCorrectly rounded divide and sqrt supported for floats? %s\n", no_yes[0 != (CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT & gFloatCapabilities)] ); - if( gToggleCorrectlyRoundedDivideSqrt ) + vlog("\tDriver Version: %s\n", c); + vlog("\tDevice Frequency: %d MHz\n", gDeviceFrequency); + vlog("\tSubnormal values supported for floats? %s\n", + no_yes[0 != (CL_FP_DENORM & gFloatCapabilities)]); + vlog("\tCorrectly rounded divide and sqrt supported for floats? %s\n", + no_yes[0 + != (CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT & gFloatCapabilities)]); + if (gToggleCorrectlyRoundedDivideSqrt) { gFloatCapabilities ^= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT; } - vlog( "\tTesting with correctly rounded float divide and sqrt? %s\n", no_yes[0 != (CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT & gFloatCapabilities)] ); - vlog( "\tTesting with FTZ mode ON for floats? %s\n", no_yes[0 != gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities)] ); - vlog( "\tTesting single precision? %s\n", no_yes[0 != gTestFloat] ); - vlog( "\tTesting fast relaxed math? %s\n", no_yes[0 != gTestFastRelaxed] ); - if(gTestFastRelaxed) + vlog("\tTesting with correctly rounded float divide and sqrt? %s\n", + no_yes[0 + != (CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT & gFloatCapabilities)]); + vlog("\tTesting with FTZ mode ON for floats? %s\n", + no_yes[0 != gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities)]); + vlog("\tTesting single precision? 
%s\n", no_yes[0 != gTestFloat]); + vlog("\tTesting fast relaxed math? %s\n", no_yes[0 != gTestFastRelaxed]); + if (gTestFastRelaxed) { - vlog( "\tFast relaxed math has derived implementations? %s\n", no_yes[0 != gFastRelaxedDerived] ); + vlog("\tFast relaxed math has derived implementations? %s\n", + no_yes[0 != gFastRelaxedDerived]); } - vlog( "\tTesting double precision? %s\n", no_yes[0 != gHasDouble] ); - if( sizeof( long double) == sizeof( double ) && gHasDouble ) + vlog("\tTesting double precision? %s\n", no_yes[0 != gHasDouble]); + if (sizeof(long double) == sizeof(double) && gHasDouble) { - vlog( "\n\t\tWARNING: Host system long double does not have better precision than double!\n" ); - vlog( "\t\t All double results that do not match the reference result have their reported\n" ); - vlog( "\t\t error inflated by 0.5 ulps to account for the fact that this system\n" ); - vlog( "\t\t can not accurately represent the right result to an accuracy closer\n" ); - vlog( "\t\t than half an ulp. See comments in Bruteforce_Ulp_Error_Double() for more details.\n\n" ); + vlog("\n\t\tWARNING: Host system long double does not have better " + "precision than double!\n"); + vlog("\t\t All double results that do not match the reference " + "result have their reported\n"); + vlog("\t\t error inflated by 0.5 ulps to account for the fact " + "that this system\n"); + vlog("\t\t can not accurately represent the right result to an " + "accuracy closer\n"); + vlog("\t\t than half an ulp. See comments in " + "Bruteforce_Ulp_Error_Double() for more details.\n\n"); } - vlog( "\tIs Embedded? %s\n", no_yes[0 != gIsEmbedded] ); - if( gIsEmbedded ) - vlog( "\tRunning in RTZ mode? %s\n", no_yes[0 != gIsInRTZMode] ); - vlog( "\tTininess is detected before rounding? 
%s\n", no_yes[0 != gCheckTininessBeforeRounding] ); - vlog( "\tWorker threads: %d\n", GetThreadCount() ); - vlog( "\tTesting vector sizes:" ); - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) - vlog( "\t%d", sizeValues[i] ); + vlog("\tIs Embedded? %s\n", no_yes[0 != gIsEmbedded]); + if (gIsEmbedded) + vlog("\tRunning in RTZ mode? %s\n", no_yes[0 != gIsInRTZMode]); + vlog("\tTininess is detected before rounding? %s\n", + no_yes[0 != gCheckTininessBeforeRounding]); + vlog("\tWorker threads: %d\n", GetThreadCount()); + vlog("\tTesting vector sizes:"); + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + vlog("\t%d", sizeValues[i]); vlog("\n"); vlog("\tVerbose? %s\n", no_yes[0 != gVerboseBruteForce]); - vlog( "\n\n" ); + vlog("\n\n"); - // Check to see if we are using single threaded mode on other than a 1.0 device - if (getenv( "CL_TEST_SINGLE_THREADED" )) { + // Check to see if we are using single threaded mode on other than a 1.0 + // device + if (getenv("CL_TEST_SINGLE_THREADED")) + { - char device_version[1024] = { 0 }; - clGetDeviceInfo( gDevice, CL_DEVICE_VERSION, sizeof(device_version), device_version, NULL ); + char device_version[1024] = { 0 }; + clGetDeviceInfo(gDevice, CL_DEVICE_VERSION, sizeof(device_version), + device_version, NULL); - if (strcmp("OpenCL 1.0 ",device_version)) { - vlog("ERROR: CL_TEST_SINGLE_THREADED is set in the environment. Running single threaded.\n"); - } + if (strcmp("OpenCL 1.0 ", device_version)) + { + vlog("ERROR: CL_TEST_SINGLE_THREADED is set in the environment. 
" + "Running single threaded.\n"); + } } return TEST_PASS; } -static void ReleaseCL( void ) +static void ReleaseCL(void) { uint32_t i; clReleaseMemObject(gInBuffer); clReleaseMemObject(gInBuffer2); clReleaseMemObject(gInBuffer3); - for ( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { clReleaseMemObject(gOutBuffer[i]); clReleaseMemObject(gOutBuffer2[i]); } @@ -1364,25 +1426,27 @@ static void ReleaseCL( void ) align_free(gOut_Ref); align_free(gOut_Ref2); - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { align_free(gOut[i]); align_free(gOut2[i]); } } -void _LogBuildError( cl_program p, int line, const char *file ) +void _LogBuildError(cl_program p, int line, const char *file) { char the_log[2048] = ""; - vlog_error( "%s:%d: Build Log:\n", file, line ); - if( 0 == clGetProgramBuildInfo(p, gDevice, CL_PROGRAM_BUILD_LOG, sizeof(the_log), the_log, NULL) ) - vlog_error( "%s", the_log ); + vlog_error("%s:%d: Build Log:\n", file, line); + if (0 + == clGetProgramBuildInfo(p, gDevice, CL_PROGRAM_BUILD_LOG, + sizeof(the_log), the_log, NULL)) + vlog_error("%s", the_log); else - vlog_error( "*** Error getting build log for program %p\n", p ); + vlog_error("*** Error getting build log for program %p\n", p); } -int InitILogbConstants( void ) +int InitILogbConstants(void) { int error; const char *kernelSource = @@ -1408,7 +1472,9 @@ int InitILogbConstants( void ) clSetKernelArg(kernel, 0, sizeof(gOutBuffer[gMinVectorSizeIndex]), &gOutBuffer[gMinVectorSizeIndex]))) { - vlog_error( "Error: Unable to set kernel arg to get FP_ILOGB0 and FP_ILOGBNAN for the device. Err = %d", error ); + vlog_error("Error: Unable to set kernel arg to get FP_ILOGB0 and " + "FP_ILOGBNAN for the device. 
Err = %d", + error); return error; } @@ -1416,14 +1482,23 @@ int InitILogbConstants( void ) if ((error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &dim, NULL, 0, NULL, NULL))) { - vlog_error( "Error: Unable to execute kernel to get FP_ILOGB0 and FP_ILOGBNAN for the device. Err = %d", error ); + vlog_error("Error: Unable to execute kernel to get FP_ILOGB0 and " + "FP_ILOGBNAN for the device. Err = %d", + error); return error; } - struct{ cl_int ilogb0, ilogbnan; }data; - if(( error = clEnqueueReadBuffer( gQueue, gOutBuffer[gMinVectorSizeIndex], CL_TRUE, 0, sizeof( data ), &data, 0, NULL, NULL))) + struct { - vlog_error( "Error: unable to read FP_ILOGB0 and FP_ILOGBNAN from the device. Err = %d", error ); + cl_int ilogb0, ilogbnan; + } data; + if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer[gMinVectorSizeIndex], + CL_TRUE, 0, sizeof(data), &data, 0, NULL, + NULL))) + { + vlog_error("Error: unable to read FP_ILOGB0 and FP_ILOGBNAN from the " + "device. Err = %d", + error); return error; } @@ -1433,7 +1508,7 @@ int InitILogbConstants( void ) return 0; } -int IsTininessDetectedBeforeRounding( void ) +int IsTininessDetectedBeforeRounding(void) { int error; const char *kernelSource = @@ -1449,7 +1524,8 @@ int IsTininessDetectedBeforeRounding( void ) error = create_single_kernel_helper(gContext, &query, &kernel, 1, &kernelSource, "IsTininessDetectedBeforeRounding"); - if (error != CL_SUCCESS) { + if (error != CL_SUCCESS) + { vlog_error("Error: Unable to create kernel to detect how tininess is " "detected for the device. (%d)", error); @@ -1460,7 +1536,9 @@ int IsTininessDetectedBeforeRounding( void ) clSetKernelArg(kernel, 0, sizeof(gOutBuffer[gMinVectorSizeIndex]), &gOutBuffer[gMinVectorSizeIndex]))) { - vlog_error( "Error: Unable to set kernel arg to detect how tininess is detected for the device. Err = %d", error ); + vlog_error("Error: Unable to set kernel arg to detect how tininess is " + "detected for the device. 
Err = %d", + error); return error; } @@ -1468,14 +1546,23 @@ int IsTininessDetectedBeforeRounding( void ) if ((error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &dim, NULL, 0, NULL, NULL))) { - vlog_error( "Error: Unable to execute kernel to detect how tininess is detected for the device. Err = %d", error ); + vlog_error("Error: Unable to execute kernel to detect how tininess is " + "detected for the device. Err = %d", + error); return error; } - struct{ cl_uint f; }data; - if(( error = clEnqueueReadBuffer( gQueue, gOutBuffer[gMinVectorSizeIndex], CL_TRUE, 0, sizeof( data ), &data, 0, NULL, NULL))) + struct { - vlog_error( "Error: unable to read result from tininess test from the device. Err = %d", error ); + cl_uint f; + } data; + if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer[gMinVectorSizeIndex], + CL_TRUE, 0, sizeof(data), &data, 0, NULL, + NULL))) + { + vlog_error("Error: unable to read result from tininess test from the " + "device. Err = %d", + error); return error; } @@ -1491,14 +1578,14 @@ int MakeKernel(const char **c, cl_uint count, const char *name, cl_kernel *k, int error = 0; char options[200] = ""; - if( gForceFTZ ) + if (gForceFTZ) { - strcat(options," -cl-denorms-are-zero"); + strcat(options, " -cl-denorms-are-zero"); } if (relaxedMode) { - strcat(options, " -cl-fast-relaxed-math"); + strcat(options, " -cl-fast-relaxed-math"); } error = @@ -1522,39 +1609,41 @@ int MakeKernels(const char **c, cl_uint count, const char *name, if (gForceFTZ) { - strcat(options," -cl-denorms-are-zero "); + strcat(options, " -cl-denorms-are-zero "); } - if( gFloatCapabilities & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT ) + if (gFloatCapabilities & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT) { - strcat(options," -cl-fp32-correctly-rounded-divide-sqrt "); + strcat(options, " -cl-fp32-correctly-rounded-divide-sqrt "); } if (relaxedMode) { - strcat(options, " -cl-fast-relaxed-math"); + strcat(options, " -cl-fast-relaxed-math"); } - error = 
create_single_kernel_helper(gContext, p, NULL, count, c, NULL, options); - if ( error != CL_SUCCESS ) + error = + create_single_kernel_helper(gContext, p, NULL, count, c, NULL, options); + if (error != CL_SUCCESS) { - vlog_error( "\t\tFAILED -- Failed to create program. (%d)\n", error ); + vlog_error("\t\tFAILED -- Failed to create program. (%d)\n", error); return error; } - memset( k, 0, kernel_count * sizeof( *k) ); - for( i = 0; i< kernel_count; i++ ) + memset(k, 0, kernel_count * sizeof(*k)); + for (i = 0; i < kernel_count; i++) { - k[i] = clCreateKernel( *p, name, &error ); - if( NULL == k[i]|| error ) + k[i] = clCreateKernel(*p, name, &error); + if (NULL == k[i] || error) { - char buffer[2048] = ""; + char buffer[2048] = ""; vlog_error("\t\tFAILED -- clCreateKernel() failed: (%d)\n", error); - clGetProgramBuildInfo(*p, gDevice, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL); + clGetProgramBuildInfo(*p, gDevice, CL_PROGRAM_BUILD_LOG, + sizeof(buffer), buffer, NULL); vlog_error("Log: %s\n", buffer); - clReleaseProgram( *p ); + clReleaseProgram(*p); return error; } } @@ -1563,7 +1652,7 @@ int MakeKernels(const char **c, cl_uint count, const char *name, } -static int IsInRTZMode( void ) +static int IsInRTZMode(void) { int error; const char *kernelSource = @@ -1578,7 +1667,8 @@ static int IsInRTZMode( void ) clKernelWrapper kernel; error = create_single_kernel_helper(gContext, &query, &kernel, 1, &kernelSource, "GetRoundingMode"); - if (error != CL_SUCCESS) { + if (error != CL_SUCCESS) + { vlog_error("Error: Unable to create kernel to detect RTZ mode for the " "device. (%d)", error); @@ -1589,7 +1679,9 @@ static int IsInRTZMode( void ) clSetKernelArg(kernel, 0, sizeof(gOutBuffer[gMinVectorSizeIndex]), &gOutBuffer[gMinVectorSizeIndex]))) { - vlog_error( "Error: Unable to set kernel arg to detect RTZ mode for the device. Err = %d", error ); + vlog_error("Error: Unable to set kernel arg to detect RTZ mode for the " + "device. 
Err = %d", + error); return error; } @@ -1597,14 +1689,23 @@ static int IsInRTZMode( void ) if ((error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &dim, NULL, 0, NULL, NULL))) { - vlog_error( "Error: Unable to execute kernel to detect RTZ mode for the device. Err = %d", error ); + vlog_error("Error: Unable to execute kernel to detect RTZ mode for the " + "device. Err = %d", + error); return error; } - struct{ cl_int isRTZ; }data; - if(( error = clEnqueueReadBuffer( gQueue, gOutBuffer[gMinVectorSizeIndex], CL_TRUE, 0, sizeof( data ), &data, 0, NULL, NULL))) + struct { - vlog_error( "Error: unable to read RTZ mode data from the device. Err = %d", error ); + cl_int isRTZ; + } data; + if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer[gMinVectorSizeIndex], + CL_TRUE, 0, sizeof(data), &data, 0, NULL, + NULL))) + { + vlog_error( + "Error: unable to read RTZ mode data from the device. Err = %d", + error); return error; } @@ -1613,46 +1714,54 @@ static int IsInRTZMode( void ) #pragma mark - -const char *sizeNames[ VECTOR_SIZE_COUNT] = { "", "2", "3", "4", "8", "16" }; -const int sizeValues[ VECTOR_SIZE_COUNT] = { 1, 2, 3, 4, 8, 16 }; +const char *sizeNames[VECTOR_SIZE_COUNT] = { "", "2", "3", "4", "8", "16" }; +const int sizeValues[VECTOR_SIZE_COUNT] = { 1, 2, 3, 4, 8, 16 }; -// TODO: There is another version of Ulp_Error_Double defined in test_common/harness/errorHelpers.c -float Bruteforce_Ulp_Error_Double( double test, long double reference ) +// TODO: There is another version of Ulp_Error_Double defined in +// test_common/harness/errorHelpers.c +float Bruteforce_Ulp_Error_Double(double test, long double reference) { -//Check for Non-power-of-two and NaN + // Check for Non-power-of-two and NaN - // Note: This function presumes that someone has already tested whether the result is correctly, - // rounded before calling this function. 
That test: - // - // if( (float) reference == test ) - // return 0.0f; - // - // would ensure that cases like fabs(reference) > FLT_MAX are weeded out before we get here. - // Otherwise, we'll return inf ulp error here, for what are otherwise correctly rounded - // results. + // Note: This function presumes that someone has already tested whether the + // result is correctly, rounded before calling this function. That test: + // + // if( (float) reference == test ) + // return 0.0f; + // + // would ensure that cases like fabs(reference) > FLT_MAX are weeded out + // before we get here. Otherwise, we'll return inf ulp error here, for what + // are otherwise correctly rounded results. - // Deal with long double = double - // On most systems long double is a higher precision type than double. They provide either - // a 80-bit or greater floating point type, or they provide a head-tail double double format. - // That is sufficient to represent the accuracy of a floating point result to many more bits - // than double and we can calculate sub-ulp errors. This is the standard system for which this - // test suite is designed. - // - // On some systems double and long double are the same thing. Then we run into a problem, - // because our representation of the infinitely precise result (passed in as reference above) - // can be off by as much as a half double precision ulp itself. In this case, we inflate the - // reported error by half an ulp to take this into account. A more correct and permanent fix - // would be to undertake refactoring the reference code to return results in this format: - // - // typedef struct DoubleReference - // { // true value = correctlyRoundedResult + ulps * ulp(correctlyRoundedResult) (infinitely precise) - // double correctlyRoundedResult; // as best we can - // double ulps; // plus a fractional amount to account for the difference - // }DoubleReference; // between infinitely precise result and correctlyRoundedResult, in units of ulps. 
- // - // This would provide a useful higher-than-double precision format for everyone that we can use, - // and would solve a few problems with representing absolute errors below DBL_MIN and over DBL_MAX for systems - // that use a head to tail double double for long double. + // Deal with long double = double + // On most systems long double is a higher precision type than double. They + // provide either a 80-bit or greater floating point type, or they provide a + // head-tail double double format. That is sufficient to represent the + // accuracy of a floating point result to many more bits than double and we + // can calculate sub-ulp errors. This is the standard system for which this + // test suite is designed. + // + // On some systems double and long double are the same thing. Then we run + // into a problem, because our representation of the infinitely precise + // result (passed in as reference above) can be off by as much as a half + // double precision ulp itself. In this case, we inflate the reported error + // by half an ulp to take this into account. A more correct and permanent + // fix would be to undertake refactoring the reference code to return + // results in this format: + // + // typedef struct DoubleReference + // { // true value = correctlyRoundedResult + ulps * + // ulp(correctlyRoundedResult) (infinitely precise) + // double correctlyRoundedResult; // as best we can + // double ulps; // plus a fractional amount to + // account for the difference + // }DoubleReference; // between infinitely + // precise result and correctlyRoundedResult, in units of ulps. + // + // This would provide a useful higher-than-double precision format for + // everyone that we can use, and would solve a few problems with + // representing absolute errors below DBL_MIN and over DBL_MAX for systems + // that use a head to tail double double for long double. 
int x; long double testVal = test; @@ -1660,119 +1769,118 @@ float Bruteforce_Ulp_Error_Double( double test, long double reference ) // First, handle special reference values if (isinf(reference)) { - if (reference == testVal) - return 0.0f; + if (reference == testVal) return 0.0f; - return INFINITY; + return INFINITY; } if (isnan(reference)) { - if (isnan(testVal)) - return 0.0f; + if (isnan(testVal)) return 0.0f; - return INFINITY; + return INFINITY; } - if ( 0.0L != reference && 0.5L != frexpl(reference, &x) ) + if (0.0L != reference && 0.5L != frexpl(reference, &x)) { // Non-zero and Non-power of two - // allow correctly rounded results to pass through unmolested. (We might add error to it below.) - // There is something of a performance optimization here. - if( testVal == reference ) - return 0.0f; + // allow correctly rounded results to pass through unmolested. (We might + // add error to it below.) There is something of a performance + // optimization here. + if (testVal == reference) return 0.0f; // The unbiased exponent of the ulp unit place - int ulp_exp = DBL_MANT_DIG - 1 - MAX( ilogbl( reference), DBL_MIN_EXP-1 ); + int ulp_exp = + DBL_MANT_DIG - 1 - MAX(ilogbl(reference), DBL_MIN_EXP - 1); // Scale the exponent of the error - float result = (float) scalbnl( testVal - reference, ulp_exp ); + float result = (float)scalbnl(testVal - reference, ulp_exp); - // account for rounding error in reference result on systems that do not have a higher precision floating point type (see above) - if( sizeof(long double) == sizeof( double ) ) - result += copysignf( 0.5f, result); + // account for rounding error in reference result on systems that do not + // have a higher precision floating point type (see above) + if (sizeof(long double) == sizeof(double)) + result += copysignf(0.5f, result); return result; } // reference is a normal power of two or a zero // The unbiased exponent of the ulp unit place - int ulp_exp = DBL_MANT_DIG - 1 - MAX( ilogbl( reference) - 1, 
DBL_MIN_EXP-1 ); + int ulp_exp = + DBL_MANT_DIG - 1 - MAX(ilogbl(reference) - 1, DBL_MIN_EXP - 1); - // allow correctly rounded results to pass through unmolested. (We might add error to it below.) - // There is something of a performance optimization here too. - if( testVal == reference ) - return 0.0f; + // allow correctly rounded results to pass through unmolested. (We might add + // error to it below.) There is something of a performance optimization here + // too. + if (testVal == reference) return 0.0f; // Scale the exponent of the error - float result = (float) scalbnl( testVal - reference, ulp_exp ); + float result = (float)scalbnl(testVal - reference, ulp_exp); - // account for rounding error in reference result on systems that do not have a higher precision floating point type (see above) - if( sizeof(long double) == sizeof( double ) ) - result += copysignf( 0.5f, result); + // account for rounding error in reference result on systems that do not + // have a higher precision floating point type (see above) + if (sizeof(long double) == sizeof(double)) + result += copysignf(0.5f, result); return result; } -float Abs_Error( float test, double reference ) +float Abs_Error(float test, double reference) { - if( isnan(test) && isnan(reference) ) - return 0.0f; - return fabs((float)(reference-(double)test)); + if (isnan(test) && isnan(reference)) return 0.0f; + return fabs((float)(reference - (double)test)); } -#if defined( __APPLE__ ) - #include +#if defined(__APPLE__) +#include #endif -uint64_t GetTime( void ) +uint64_t GetTime(void) { -#if defined( __APPLE__ ) +#if defined(__APPLE__) return mach_absolute_time(); #elif defined(_WIN32) && defined(_MSC_VER) - return ReadTime(); + return ReadTime(); #else - //mach_absolute_time is a high precision timer with precision < 1 microsecond. - #warning need accurate clock here. Times are invalid. +// mach_absolute_time is a high precision timer with precision < 1 microsecond. +#warning need accurate clock here. 
Times are invalid. return 0; #endif } -#if defined(_WIN32) && defined (_MSC_VER) +#if defined(_WIN32) && defined(_MSC_VER) /* function is defined in "compat.h" */ #else -double SubtractTime( uint64_t endTime, uint64_t startTime ) +double SubtractTime(uint64_t endTime, uint64_t startTime) { uint64_t diff = endTime - startTime; static double conversion = 0.0; - if( 0.0 == conversion ) + if (0.0 == conversion) { -#if defined( __APPLE__ ) - mach_timebase_info_data_t info = {0,0}; - kern_return_t err = mach_timebase_info( &info ); - if( 0 == err ) - conversion = 1e-9 * (double) info.numer / (double) info.denom; +#if defined(__APPLE__) + mach_timebase_info_data_t info = { 0, 0 }; + kern_return_t err = mach_timebase_info(&info); + if (0 == err) + conversion = 1e-9 * (double)info.numer / (double)info.denom; #else - // This function consumes output from GetTime() above, and converts the time to secionds. - #warning need accurate ticks to seconds conversion factor here. Times are invalid. +// This function consumes output from GetTime() above, and converts the time to +// secionds. +#warning need accurate ticks to seconds conversion factor here. Times are invalid. #endif } // strictly speaking we should also be subtracting out timer latency here - return conversion * (double) diff; + return conversion * (double)diff; } #endif -cl_uint RoundUpToNextPowerOfTwo( cl_uint x ) +cl_uint RoundUpToNextPowerOfTwo(cl_uint x) { - if( 0 == (x & (x-1))) - return x; + if (0 == (x & (x - 1))) return x; - while( x & (x-1) ) - x &= x-1; + while (x & (x - 1)) x &= x - 1; - return x+x; + return x + x; } - diff --git a/test_conformance/math_brute_force/reference_math.cpp b/test_conformance/math_brute_force/reference_math.cpp index 01c99c14..1a5a6690 100644 --- a/test_conformance/math_brute_force/reference_math.cpp +++ b/test_conformance/math_brute_force/reference_math.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. 
-// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -23,46 +23,47 @@ #include "Utility.h" -#if defined( __SSE__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64))) - #include +#if defined(__SSE__) \ + || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) +#include #endif -#if defined( __SSE2__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64))) - #include +#if defined(__SSE2__) \ + || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) +#include #endif #ifndef M_PI_4 - #define M_PI_4 (M_PI/4) +#define M_PI_4 (M_PI / 4) #endif -#define EVALUATE( x ) x -#define CONCATENATE(x, y) x ## EVALUATE(y) +#define EVALUATE(x) x +#define CONCATENATE(x, y) x##EVALUATE(y) #pragma STDC FP_CONTRACT OFF static void __log2_ep(double *hi, double *lo, double x); -typedef union -{ +typedef union { uint64_t i; double d; -}uint64d_t; +} uint64d_t; static const uint64d_t _CL_NAN = { 0x7ff8000000000000ULL }; #define cl_make_nan() _CL_NAN.d -static double reduce1( double x ); -static double reduce1( double x ) +static double reduce1(double x); +static double reduce1(double x) { - if( fabs(x) >= HEX_DBL( +, 1, 0, +, 53 ) ) + if (fabs(x) >= HEX_DBL(+, 1, 0, +, 53)) { - if( fabs(x) == INFINITY ) - return cl_make_nan(); + if (fabs(x) == INFINITY) return cl_make_nan(); - return 0.0; //we patch up the sign for sinPi and cosPi later, since they need different signs + return 0.0; // we patch up the sign for sinPi and cosPi later, since + // they need different signs } // Find the nearest multiple of 2 - const double r = copysign( HEX_DBL( +, 1, 0, +, 53 ), x ); + const double r = copysign(HEX_DBL(+, 1, 0, +, 53), x); double z = x + r; z -= r; @@ -79,7 +80,8 @@ static double reduceHalf( double x ) if( fabs(x) == INFINITY ) return cl_make_nan(); - return 0.0; //we patch up the sign for sinPi and cosPi later, since 
they need different signs + return 0.0; //we patch up the sign for sinPi and cosPi later, since they +need different signs } // Find the nearest multiple of 1 @@ -92,362 +94,384 @@ static double reduceHalf( double x ) } */ -double reference_acospi( double x) { return reference_acos( x ) / M_PI; } -double reference_asinpi( double x) { return reference_asin( x ) / M_PI; } -double reference_atanpi( double x) { return reference_atan( x ) / M_PI; } -double reference_atan2pi( double y, double x ) { return reference_atan2( y, x) / M_PI; } -double reference_cospi( double x) +double reference_acospi(double x) { return reference_acos(x) / M_PI; } +double reference_asinpi(double x) { return reference_asin(x) / M_PI; } +double reference_atanpi(double x) { return reference_atan(x) / M_PI; } +double reference_atan2pi(double y, double x) { - if( reference_fabs(x) >= HEX_DBL( +, 1, 0, +, 52 ) ) + return reference_atan2(y, x) / M_PI; +} +double reference_cospi(double x) +{ + if (reference_fabs(x) >= HEX_DBL(+, 1, 0, +, 52)) { - if( reference_fabs(x) == INFINITY ) - return cl_make_nan(); + if (reference_fabs(x) == INFINITY) return cl_make_nan(); - //Note this probably fails for odd values between 0x1.0p52 and 0x1.0p53. - //However, when starting with single precision inputs, there will be no odd values. + // Note this probably fails for odd values between 0x1.0p52 and + // 0x1.0p53. However, when starting with single precision inputs, there + // will be no odd values. 
return 1.0; } - x = reduce1(x+0.5); + x = reduce1(x + 0.5); // reduce to [-0.5, 0.5] - if( x < -0.5 ) + if (x < -0.5) x = -1 - x; - else if ( x > 0.5 ) + else if (x > 0.5) x = 1 - x; // cosPi zeros are all +0 - if( x == 0.0 ) - return 0.0; + if (x == 0.0) return 0.0; - return reference_sin( x * M_PI ); + return reference_sin(x * M_PI); } double reference_relaxed_cospi(double x) { return reference_cospi(x); } -double reference_relaxed_divide( double x, double y ) { return (float)(((float) x ) / ( (float) y )); } +double reference_relaxed_divide(double x, double y) +{ + return (float)(((float)x) / ((float)y)); +} -double reference_divide( double x, double y ) { return x / y; } +double reference_divide(double x, double y) { return x / y; } // Add a + b. If the result modulo overflowed, write 1 to *carry, otherwise 0 -static inline cl_ulong add_carry( cl_ulong a, cl_ulong b, cl_ulong *carry ) +static inline cl_ulong add_carry(cl_ulong a, cl_ulong b, cl_ulong *carry) { cl_ulong result = a + b; *carry = result < a; return result; } -// Subtract a - b. If the result modulo overflowed, write 1 to *carry, otherwise 0 -static inline cl_ulong sub_carry( cl_ulong a, cl_ulong b, cl_ulong *carry ) +// Subtract a - b. 
If the result modulo overflowed, write 1 to *carry, otherwise +// 0 +static inline cl_ulong sub_carry(cl_ulong a, cl_ulong b, cl_ulong *carry) { cl_ulong result = a - b; *carry = result > a; return result; } -static float fallback_frexpf( float x, int *iptr ) +static float fallback_frexpf(float x, int *iptr) { cl_uint u, v; float fu, fv; - memcpy( &u, &x, sizeof(u)); + memcpy(&u, &x, sizeof(u)); - cl_uint exponent = u & 0x7f800000U; + cl_uint exponent = u & 0x7f800000U; cl_uint mantissa = u & ~0x7f800000U; // add 1 to the exponent exponent += 0x00800000U; - if( (cl_int) exponent < (cl_int) 0x01000000 ) + if ((cl_int)exponent < (cl_int)0x01000000) { // subnormal, NaN, Inf mantissa |= 0x3f000000U; v = mantissa & 0xff800000U; u = mantissa; - memcpy( &fv, &v, sizeof(v)); - memcpy( &fu, &u, sizeof(u)); + memcpy(&fv, &v, sizeof(v)); + memcpy(&fu, &u, sizeof(u)); fu -= fv; - memcpy( &v, &fv, sizeof(v)); - memcpy( &u, &fu, sizeof(u)); + memcpy(&v, &fv, sizeof(v)); + memcpy(&u, &fu, sizeof(u)); - exponent = u & 0x7f800000U; + exponent = u & 0x7f800000U; mantissa = u & ~0x7f800000U; - *iptr = (exponent >> 23) + (-126 + 1 -126); + *iptr = (exponent >> 23) + (-126 + 1 - 126); u = mantissa | 0x3f000000U; - memcpy( &fu, &u, sizeof(u)); + memcpy(&fu, &u, sizeof(u)); return fu; } *iptr = (exponent >> 23) - 127; u = mantissa | 0x3f000000U; - memcpy( &fu, &u, sizeof(u)); + memcpy(&fu, &u, sizeof(u)); return fu; } -static inline int extractf( float, cl_uint * ); -static inline int extractf( float x, cl_uint *mant ) +static inline int extractf(float, cl_uint *); +static inline int extractf(float x, cl_uint *mant) { - static float (*frexppf)(float, int*) = NULL; + static float (*frexppf)(float, int *) = NULL; int e; // verify that frexp works properly - if( NULL == frexppf ) + if (NULL == frexppf) { - if( 0.5f == frexpf( HEX_FLT( +, 1, 0, -, 130 ), &e ) && e == -129 ) + if (0.5f == frexpf(HEX_FLT(+, 1, 0, -, 130), &e) && e == -129) frexppf = frexpf; else frexppf = fallback_frexpf; } - 
*mant = (cl_uint) (HEX_FLT( +, 1, 0, +, 32 ) * fabsf( frexppf( x, &e ))); + *mant = (cl_uint)(HEX_FLT(+, 1, 0, +, 32) * fabsf(frexppf(x, &e))); return e - 1; } -// Shift right by shift bits. Any bits lost on the right side are bitwise OR'd together and ORd into the LSB of the result -static inline void shift_right_sticky_64( cl_ulong *p, int shift ); -static inline void shift_right_sticky_64( cl_ulong *p, int shift ) +// Shift right by shift bits. Any bits lost on the right side are bitwise OR'd +// together and ORd into the LSB of the result +static inline void shift_right_sticky_64(cl_ulong *p, int shift); +static inline void shift_right_sticky_64(cl_ulong *p, int shift) { cl_ulong sticky = 0; cl_ulong r = *p; // C doesn't handle shifts greater than the size of the variable dependably - if( shift >= 64 ) + if (shift >= 64) { sticky |= (0 != r); r = 0; } else { - sticky |= (0 != (r << (64-shift))); + sticky |= (0 != (r << (64 - shift))); r >>= shift; } *p = r | sticky; } -// Add two 64 bit mantissas. Bits that are below the LSB of the result are OR'd into the LSB of the result -static inline void add64( cl_ulong *p, cl_ulong c, int *exponent ); -static inline void add64( cl_ulong *p, cl_ulong c, int *exponent ) +// Add two 64 bit mantissas. Bits that are below the LSB of the result are OR'd +// into the LSB of the result +static inline void add64(cl_ulong *p, cl_ulong c, int *exponent); +static inline void add64(cl_ulong *p, cl_ulong c, int *exponent) { cl_ulong carry; c = add_carry(c, *p, &carry); - if( carry ) + if (carry) { - carry = c & 1; // set aside sticky bit - c >>= 1; // right shift to deal with overflow - c |= carry | 0x8000000000000000ULL; // or in carry bit, and sticky bit. 
The latter is to prevent rounding from believing we are exact half way case - *exponent = *exponent + 1; // adjust exponent + carry = c & 1; // set aside sticky bit + c >>= 1; // right shift to deal with overflow + c |= carry + | 0x8000000000000000ULL; // or in carry bit, and sticky bit. The + // latter is to prevent rounding from + // believing we are exact half way case + *exponent = *exponent + 1; // adjust exponent } *p = c; } // IEEE-754 round to nearest, ties to even rounding -static float round_to_nearest_even_float( cl_ulong p, int exponent ); -static float round_to_nearest_even_float( cl_ulong p, int exponent ) +static float round_to_nearest_even_float(cl_ulong p, int exponent); +static float round_to_nearest_even_float(cl_ulong p, int exponent) { - union{ cl_uint u; cl_float d;} u; + union { + cl_uint u; + cl_float d; + } u; // If mantissa is zero, return 0.0f if (p == 0) return 0.0f; // edges - if( exponent > 127 ) + if (exponent > 127) { - volatile float r = exponent * CL_FLT_MAX; // signal overflow + volatile float r = exponent * CL_FLT_MAX; // signal overflow // attempt to fool the compiler into not optimizing the above line away - if( r > CL_FLT_MAX ) - return INFINITY; + if (r > CL_FLT_MAX) return INFINITY; return r; } - if( exponent == -150 && p > 0x8000000000000000ULL) - return HEX_FLT( +, 1, 0, -, 149 ); - if( exponent <= -150 ) return 0.0f; + if (exponent == -150 && p > 0x8000000000000000ULL) + return HEX_FLT(+, 1, 0, -, 149); + if (exponent <= -150) return 0.0f; - //Figure out which bits go where + // Figure out which bits go where int shift = 8 + 32; - if( exponent < -126 ) + if (exponent < -126) { - shift -= 126 + exponent; // subnormal: shift is not 52 - exponent = -127; // set exponent to 0 + shift -= 126 + exponent; // subnormal: shift is not 52 + exponent = -127; // set exponent to 0 } else - p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove it. + p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. 
Remove + // it. // Assemble the double (round toward zero) - u.u = (cl_uint)(p >> shift) | ((cl_uint) (exponent + 127) << 23); + u.u = (cl_uint)(p >> shift) | ((cl_uint)(exponent + 127) << 23); // put a representation of the residual bits into hi - p <<= (64-shift); + p <<= (64 - shift); - //round to nearest, ties to even based on the unused portion of p - if( p < 0x8000000000000000ULL ) return u.d; - if( p == 0x8000000000000000ULL ) u.u += u.u & 1U; - else u.u++; + // round to nearest, ties to even based on the unused portion of p + if (p < 0x8000000000000000ULL) return u.d; + if (p == 0x8000000000000000ULL) + u.u += u.u & 1U; + else + u.u++; return u.d; } -static float round_to_nearest_even_float_ftz( cl_ulong p, int exponent ); -static float round_to_nearest_even_float_ftz( cl_ulong p, int exponent ) +static float round_to_nearest_even_float_ftz(cl_ulong p, int exponent); +static float round_to_nearest_even_float_ftz(cl_ulong p, int exponent) { extern int gCheckTininessBeforeRounding; - union{ cl_uint u; cl_float d;} u; + union { + cl_uint u; + cl_float d; + } u; int shift = 8 + 32; // If mantissa is zero, return 0.0f if (p == 0) return 0.0f; // edges - if( exponent > 127 ) + if (exponent > 127) { - volatile float r = exponent * CL_FLT_MAX; // signal overflow + volatile float r = exponent * CL_FLT_MAX; // signal overflow // attempt to fool the compiler into not optimizing the above line away - if( r > CL_FLT_MAX ) - return INFINITY; + if (r > CL_FLT_MAX) return INFINITY; return r; } // Deal with FTZ for gCheckTininessBeforeRounding - if( exponent < (gCheckTininessBeforeRounding - 127) ) - return 0.0f; + if (exponent < (gCheckTininessBeforeRounding - 127)) return 0.0f; - if( exponent == -127 ) // only happens for machines that check tininess after rounding - p = (p&1) | (p>>1); + if (exponent + == -127) // only happens for machines that check tininess after rounding + p = (p & 1) | (p >> 1); else - p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. 
Remove it. + p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove + // it. cl_ulong q = p; // Assemble the double (round toward zero) - u.u = (cl_uint)(q >> shift) | ((cl_uint) (exponent + 127) << 23); + u.u = (cl_uint)(q >> shift) | ((cl_uint)(exponent + 127) << 23); // put a representation of the residual bits into hi - q <<= (64-shift); + q <<= (64 - shift); - //round to nearest, ties to even based on the unused portion of p - if( q > 0x8000000000000000ULL ) + // round to nearest, ties to even based on the unused portion of p + if (q > 0x8000000000000000ULL) u.u++; - else if( q == 0x8000000000000000ULL ) + else if (q == 0x8000000000000000ULL) u.u += u.u & 1U; // Deal with FTZ for ! gCheckTininessBeforeRounding - if( 0 == (u.u & 0x7f800000U ) ) - return 0.0f; + if (0 == (u.u & 0x7f800000U)) return 0.0f; return u.d; } // IEEE-754 round toward zero. -static float round_toward_zero_float( cl_ulong p, int exponent ); -static float round_toward_zero_float( cl_ulong p, int exponent ) +static float round_toward_zero_float(cl_ulong p, int exponent); +static float round_toward_zero_float(cl_ulong p, int exponent) { - union{ cl_uint u; cl_float d;} u; + union { + cl_uint u; + cl_float d; + } u; // If mantissa is zero, return 0.0f if (p == 0) return 0.0f; // edges - if( exponent > 127 ) + if (exponent > 127) { - volatile float r = exponent * CL_FLT_MAX; // signal overflow + volatile float r = exponent * CL_FLT_MAX; // signal overflow // attempt to fool the compiler into not optimizing the above line away - if( r > CL_FLT_MAX ) - return CL_FLT_MAX; + if (r > CL_FLT_MAX) return CL_FLT_MAX; return r; } - if( exponent <= -149 ) - return 0.0f; + if (exponent <= -149) return 0.0f; - //Figure out which bits go where + // Figure out which bits go where int shift = 8 + 32; - if( exponent < -126 ) + if (exponent < -126) { - shift -= 126 + exponent; // subnormal: shift is not 52 - exponent = -127; // set exponent to 0 + shift -= 126 + exponent; // subnormal: shift is 
not 52 + exponent = -127; // set exponent to 0 } else - p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove it. + p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove + // it. // Assemble the double (round toward zero) - u.u = (cl_uint)(p >> shift) | ((cl_uint) (exponent + 127) << 23); + u.u = (cl_uint)(p >> shift) | ((cl_uint)(exponent + 127) << 23); return u.d; } -static float round_toward_zero_float_ftz( cl_ulong p, int exponent ); -static float round_toward_zero_float_ftz( cl_ulong p, int exponent ) +static float round_toward_zero_float_ftz(cl_ulong p, int exponent); +static float round_toward_zero_float_ftz(cl_ulong p, int exponent) { extern int gCheckTininessBeforeRounding; - union{ cl_uint u; cl_float d;} u; + union { + cl_uint u; + cl_float d; + } u; int shift = 8 + 32; // If mantissa is zero, return 0.0f if (p == 0) return 0.0f; // edges - if( exponent > 127 ) + if (exponent > 127) { - volatile float r = exponent * CL_FLT_MAX; // signal overflow + volatile float r = exponent * CL_FLT_MAX; // signal overflow // attempt to fool the compiler into not optimizing the above line away - if( r > CL_FLT_MAX ) - return CL_FLT_MAX; + if (r > CL_FLT_MAX) return CL_FLT_MAX; return r; } // Deal with FTZ for gCheckTininessBeforeRounding - if( exponent < -126 ) - return 0.0f; + if (exponent < -126) return 0.0f; - cl_ulong q = p &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove it. + cl_ulong q = p &= + 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove it. // Assemble the double (round toward zero) - u.u = (cl_uint)(q >> shift) | ((cl_uint) (exponent + 127) << 23); + u.u = (cl_uint)(q >> shift) | ((cl_uint)(exponent + 127) << 23); // put a representation of the residual bits into hi - q <<= (64-shift); + q <<= (64 - shift); return u.d; } // Subtract two significands. 
-static inline void sub64( cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC ); -static inline void sub64( cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC ) +static inline void sub64(cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC); +static inline void sub64(cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC) { cl_ulong carry; - p = sub_carry( *c, p, &carry ); + p = sub_carry(*c, p, &carry); - if( carry ) + if (carry) { *signC ^= 0x80000000U; p = -p; } // normalize - if( p ) + if (p) { int shift = 32; cl_ulong test = 1ULL << 32; - while( 0 == (p & 0x8000000000000000ULL)) + while (0 == (p & 0x8000000000000000ULL)) { - if( p < test ) + if (p < test) { p <<= shift; *expC = *expC - shift; @@ -460,49 +484,60 @@ static inline void sub64( cl_ulong *c, cl_ulong p, cl_uint *signC, int *expC ) { // zero result. *expC = -200; - *signC = 0; // IEEE rules say a - a = +0 for all rounding modes except -inf + *signC = + 0; // IEEE rules say a - a = +0 for all rounding modes except -inf } *c = p; } -float reference_fma( float a, float b, float c, int shouldFlush ) +float reference_fma(float a, float b, float c, int shouldFlush) { static const cl_uint kMSB = 0x80000000U; // Make bits accessible - union{ cl_uint u; cl_float d; } ua; ua.d = a; - union{ cl_uint u; cl_float d; } ub; ub.d = b; - union{ cl_uint u; cl_float d; } uc; uc.d = c; + union { + cl_uint u; + cl_float d; + } ua; + ua.d = a; + union { + cl_uint u; + cl_float d; + } ub; + ub.d = b; + union { + cl_uint u; + cl_float d; + } uc; + uc.d = c; // deal with Nans, infinities and zeros - if( isnan( a ) || isnan( b ) || isnan(c) || - isinf( a ) || isinf( b ) || isinf(c) || - 0 == ( ua.u & ~kMSB) || // a == 0, defeat host FTZ behavior - 0 == ( ub.u & ~kMSB) || // b == 0, defeat host FTZ behavior - 0 == ( uc.u & ~kMSB) ) // c == 0, defeat host FTZ behavior + if (isnan(a) || isnan(b) || isnan(c) || isinf(a) || isinf(b) || isinf(c) + || 0 == (ua.u & ~kMSB) || // a == 0, defeat host FTZ behavior + 0 == (ub.u & ~kMSB) || // b 
== 0, defeat host FTZ behavior + 0 == (uc.u & ~kMSB)) // c == 0, defeat host FTZ behavior { FPU_mode_type oldMode; RoundingMode oldRoundMode = kRoundToNearestEven; - if( isinf( c ) && !isinf(a) && !isinf(b) ) - return (c + a) + b; + if (isinf(c) && !isinf(a) && !isinf(b)) return (c + a) + b; - if (gIsInRTZMode) - oldRoundMode = set_round(kRoundTowardZero, kfloat); + if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat); - memset( &oldMode, 0, sizeof( oldMode ) ); - if( shouldFlush ) - ForceFTZ( &oldMode ); + memset(&oldMode, 0, sizeof(oldMode)); + if (shouldFlush) ForceFTZ(&oldMode); - a = (float) reference_multiply( a, b ); // some risk that the compiler will insert a non-compliant fma here on some platforms. - a = (float) reference_add( a, c ); // We use STDC FP_CONTRACT OFF above to attempt to defeat that. + a = (float)reference_multiply( + a, b); // some risk that the compiler will insert a non-compliant + // fma here on some platforms. + a = (float)reference_add( + a, + c); // We use STDC FP_CONTRACT OFF above to attempt to defeat that. 
- if( shouldFlush ) - RestoreFPState( &oldMode ); + if (shouldFlush) RestoreFPState(&oldMode); - if( gIsInRTZMode ) - set_round(oldRoundMode, kfloat); + if (gIsInRTZMode) set_round(oldRoundMode, kfloat); return a; } @@ -510,67 +545,70 @@ float reference_fma( float a, float b, float c, int shouldFlush ) // exponent is a standard unbiased signed integer // mantissa is a cl_uint, with leading non-zero bit positioned at the MSB cl_uint mantA, mantB, mantC; - int expA = extractf( a, &mantA ); - int expB = extractf( b, &mantB ); - int expC = extractf( c, &mantC ); - cl_uint signC = uc.u & kMSB; // We'll need the sign bit of C later to decide if we are adding or subtracting + int expA = extractf(a, &mantA); + int expB = extractf(b, &mantB); + int expC = extractf(c, &mantC); + cl_uint signC = uc.u & kMSB; // We'll need the sign bit of C later to decide + // if we are adding or subtracting -// exact product of A and B + // exact product of A and B int exponent = expA + expB; cl_uint sign = (ua.u ^ ub.u) & kMSB; - cl_ulong product = (cl_ulong) mantA * (cl_ulong) mantB; + cl_ulong product = (cl_ulong)mantA * (cl_ulong)mantB; // renormalize -- 1.m * 1.n yields a number between 1.0 and 3.99999.. - // The MSB might not be set. If so, fix that. Otherwise, reflect the fact that we got another power of two from the multiplication - if( 0 == (0x8000000000000000ULL & product) ) + // The MSB might not be set. If so, fix that. Otherwise, reflect the fact + // that we got another power of two from the multiplication + if (0 == (0x8000000000000000ULL & product)) product <<= 1; else - exponent++; // 2**31 * 2**31 gives 2**62. If the MSB was set, then our exponent increased. + exponent++; // 2**31 * 2**31 gives 2**62. If the MSB was set, then our + // exponent increased. 
-//infinite precision add - cl_ulong addend = (cl_ulong) mantC << 32; - if( exponent >= expC ) + // infinite precision add + cl_ulong addend = (cl_ulong)mantC << 32; + if (exponent >= expC) { // Shift C relative to the product so that their exponents match - if( exponent > expC ) - shift_right_sticky_64( &addend, exponent - expC ); + if (exponent > expC) shift_right_sticky_64(&addend, exponent - expC); // Add - if( sign ^ signC ) - sub64( &product, addend, &sign, &exponent ); + if (sign ^ signC) + sub64(&product, addend, &sign, &exponent); else - add64( &product, addend, &exponent ); + add64(&product, addend, &exponent); } else { // Shift the product relative to C so that their exponents match - shift_right_sticky_64( &product, expC - exponent ); + shift_right_sticky_64(&product, expC - exponent); // add - if( sign ^ signC ) - sub64( &addend, product, &signC, &expC ); + if (sign ^ signC) + sub64(&addend, product, &signC, &expC); else - add64( &addend, product, &expC ); + add64(&addend, product, &expC); product = addend; exponent = expC; sign = signC; } - // round to IEEE result -- we do not do flushing to zero here. That part is handled manually in ternary.c. + // round to IEEE result -- we do not do flushing to zero here. That part is + // handled manually in ternary.c. 
if (gIsInRTZMode) { - if( shouldFlush ) - ua.d = round_toward_zero_float_ftz( product, exponent); + if (shouldFlush) + ua.d = round_toward_zero_float_ftz(product, exponent); else - ua.d = round_toward_zero_float( product, exponent); + ua.d = round_toward_zero_float(product, exponent); } else { - if( shouldFlush ) - ua.d = round_to_nearest_even_float_ftz( product, exponent); + if (shouldFlush) + ua.d = round_to_nearest_even_float_ftz(product, exponent); else - ua.d = round_to_nearest_even_float( product, exponent); + ua.d = round_to_nearest_even_float(product, exponent); } // Set the sign @@ -579,35 +617,36 @@ float reference_fma( float a, float b, float c, int shouldFlush ) return ua.d; } -double reference_relaxed_exp10( double x) +double reference_relaxed_exp10(double x) { return reference_exp10(x); } + +double reference_exp10(double x) { - return reference_exp10(x); + return reference_exp2(x * HEX_DBL(+, 1, a934f0979a371, +, 1)); } -double reference_exp10( double x) { return reference_exp2( x * HEX_DBL( +, 1, a934f0979a371, +, 1 ) ); } - -int reference_ilogb( double x ) +int reference_ilogb(double x) { extern int gDeviceILogb0, gDeviceILogbNaN; - union { cl_double f; cl_ulong u;} u; + union { + cl_double f; + cl_ulong u; + } u; - u.f = (float) x; - cl_int exponent = (cl_int) (u.u >> 52) & 0x7ff; - if( exponent == 0x7ff ) + u.f = (float)x; + cl_int exponent = (cl_int)(u.u >> 52) & 0x7ff; + if (exponent == 0x7ff) { - if( u.u & 0x000fffffffffffffULL ) - return gDeviceILogbNaN; + if (u.u & 0x000fffffffffffffULL) return gDeviceILogbNaN; return CL_INT_MAX; } - if( exponent == 0 ) - { // deal with denormals - u.f = x * HEX_DBL( +, 1, 0, +, 64 ); - exponent = (cl_int) (u.u >> 52) & 0x7ff; - if( exponent == 0 ) - return gDeviceILogb0; + if (exponent == 0) + { // deal with denormals + u.f = x * HEX_DBL(+, 1, 0, +, 64); + exponent = (cl_int)(u.u >> 52) & 0x7ff; + if (exponent == 0) return gDeviceILogb0; return exponent - (1023 + 64); } @@ -615,220 +654,208 @@ int 
reference_ilogb( double x ) return exponent - 1023; } -double reference_nan( cl_uint x ) +double reference_nan(cl_uint x) { - union{ cl_uint u; cl_float f; }u; + union { + cl_uint u; + cl_float f; + } u; u.u = x | 0x7fc00000U; - return (double) u.f; + return (double)u.f; } -double reference_maxmag( double x, double y ) +double reference_maxmag(double x, double y) { double fabsx = fabs(x); double fabsy = fabs(y); - if( fabsx < fabsy ) - return y; + if (fabsx < fabsy) return y; - if( fabsy < fabsx ) - return x; + if (fabsy < fabsx) return x; - return reference_fmax( x, y ); + return reference_fmax(x, y); } -double reference_minmag( double x, double y ) +double reference_minmag(double x, double y) { double fabsx = fabs(x); double fabsy = fabs(y); - if( fabsx > fabsy ) - return y; + if (fabsx > fabsy) return y; - if( fabsy > fabsx ) - return x; + if (fabsy > fabsx) return x; - return reference_fmin( x, y ); + return reference_fmin(x, y); } -//double my_nextafter( double x, double y ){ return (double) nextafterf( (float) x, (float) y ); } +// double my_nextafter( double x, double y ){ return (double) nextafterf( +// (float) x, (float) y ); } -double reference_relaxed_mad( double a, double b, double c) +double reference_relaxed_mad(double a, double b, double c) { - return ((float) a )* ((float) b) + (float) c; + return ((float)a) * ((float)b) + (float)c; } -double reference_mad( double a, double b, double c ) -{ - return a * b + c; -} +double reference_mad(double a, double b, double c) { return a * b + c; } -double reference_recip( double x) { return 1.0 / x; } -double reference_rootn( double x, int i ) +double reference_recip(double x) { return 1.0 / x; } +double reference_rootn(double x, int i) { - //rootn ( x, 0 ) returns a NaN. - if( 0 == i ) - return cl_make_nan(); + // rootn ( x, 0 ) returns a NaN. + if (0 == i) return cl_make_nan(); - //rootn ( x, n ) returns a NaN for x < 0 and n is even. 
- if( x < 0 && 0 == (i&1) ) - return cl_make_nan(); + // rootn ( x, n ) returns a NaN for x < 0 and n is even. + if (x < 0 && 0 == (i & 1)) return cl_make_nan(); - if( x == 0.0 ) + if (x == 0.0) { - switch( i & 0x80000001 ) + switch (i & 0x80000001) { - //rootn ( +-0, n ) is +0 for even n > 0. - case 0: - return 0.0f; + // rootn ( +-0, n ) is +0 for even n > 0. + case 0: return 0.0f; - //rootn ( +-0, n ) is +-0 for odd n > 0. - case 1: - return x; + // rootn ( +-0, n ) is +-0 for odd n > 0. + case 1: return x; - //rootn ( +-0, n ) is +inf for even n < 0. - case 0x80000000: - return INFINITY; + // rootn ( +-0, n ) is +inf for even n < 0. + case 0x80000000: return INFINITY; - //rootn ( +-0, n ) is +-inf for odd n < 0. - case 0x80000001: - return copysign(INFINITY, x); + // rootn ( +-0, n ) is +-inf for odd n < 0. + case 0x80000001: return copysign(INFINITY, x); } } double sign = x; x = reference_fabs(x); - x = reference_exp2( reference_log2(x) / (double) i ); - return reference_copysignd( x, sign ); + x = reference_exp2(reference_log2(x) / (double)i); + return reference_copysignd(x, sign); } -double reference_rsqrt( double x) { return 1.0 / reference_sqrt(x); } -//double reference_sincos( double x, double *c ){ *c = cos(x); return sin(x); } -double reference_sinpi( double x) +double reference_rsqrt(double x) { return 1.0 / reference_sqrt(x); } +// double reference_sincos( double x, double *c ){ *c = cos(x); return sin(x); } +double reference_sinpi(double x) { double r = reduce1(x); // reduce to [-0.5, 0.5] - if( r < -0.5 ) + if (r < -0.5) r = -1 - r; - else if ( r > 0.5 ) + else if (r > 0.5) r = 1 - r; // sinPi zeros have the same sign as x - if( r == 0.0 ) - return reference_copysignd(0.0, x); + if (r == 0.0) return reference_copysignd(0.0, x); - return reference_sin( r * M_PI ); + return reference_sin(r * M_PI); } double reference_relaxed_sinpi(double x) { return reference_sinpi(x); } -double reference_tanpi( double x) +double reference_tanpi(double x) { // set 
aside the sign (allows us to preserve sign of -0) - double sign = reference_copysignd( 1.0, x); + double sign = reference_copysignd(1.0, x); double z = reference_fabs(x); // if big and even -- caution: only works if x only has single precision - if( z >= HEX_DBL( +, 1, 0, +, 24 ) ) + if (z >= HEX_DBL(+, 1, 0, +, 24)) { - if( z == INFINITY ) - return x - x; // nan + if (z == INFINITY) return x - x; // nan - return reference_copysignd( 0.0, x); // tanpi ( n ) is copysign( 0.0, n) for even integers n. + return reference_copysignd( + 0.0, x); // tanpi ( n ) is copysign( 0.0, n) for even integers n. } // reduce to the range [ -0.5, 0.5 ] - double nearest = reference_rint( z ); // round to nearest even places n + 0.5 values in the right place for us - int i = (int) nearest; // test above against 0x1.0p24 avoids overflow here + double nearest = reference_rint(z); // round to nearest even places n + 0.5 + // values in the right place for us + int i = (int)nearest; // test above against 0x1.0p24 avoids overflow here z -= nearest; - //correction for odd integer x for the right sign of zero - if( (i&1) && z == 0.0 ) - sign = -sign; + // correction for odd integer x for the right sign of zero + if ((i & 1) && z == 0.0) sign = -sign; // track changes to the sign - sign *= reference_copysignd(1.0, z); // really should just be an xor - z = reference_fabs(z); // remove the sign again + sign *= reference_copysignd(1.0, z); // really should just be an xor + z = reference_fabs(z); // remove the sign again // reduce once more - // If we don't do this, rounding error in z * M_PI will cause us not to return infinities properly - if( z > 0.25 ) + // If we don't do this, rounding error in z * M_PI will cause us not to + // return infinities properly + if (z > 0.25) { z = 0.5 - z; - return sign / reference_tan( z * M_PI ); // use system tan to get the right result + return sign + / reference_tan(z * M_PI); // use system tan to get the right result } // - return sign * reference_tan( z * 
M_PI ); // use system tan to get the right result + return sign + * reference_tan(z * M_PI); // use system tan to get the right result } -double reference_pown( double x, int i) { return reference_pow( x, (double) i ); } -double reference_powr( double x, double y ) +double reference_pown(double x, int i) { return reference_pow(x, (double)i); } +double reference_powr(double x, double y) { - //powr ( x, y ) returns NaN for x < 0. - if( x < 0.0 ) - return cl_make_nan(); + // powr ( x, y ) returns NaN for x < 0. + if (x < 0.0) return cl_make_nan(); - //powr ( x, NaN ) returns the NaN for x >= 0. - //powr ( NaN, y ) returns the NaN. - if( isnan(x) || isnan(y) ) - return x + y; // Note: behavior different here than for pow(1,NaN), pow(NaN, 0) + // powr ( x, NaN ) returns the NaN for x >= 0. + // powr ( NaN, y ) returns the NaN. + if (isnan(x) || isnan(y)) + return x + y; // Note: behavior different here than for pow(1,NaN), + // pow(NaN, 0) - if( x == 1.0 ) + if (x == 1.0) { - //powr ( +1, +-inf ) returns NaN. - if( reference_fabs(y) == INFINITY ) - return cl_make_nan(); + // powr ( +1, +-inf ) returns NaN. + if (reference_fabs(y) == INFINITY) return cl_make_nan(); - //powr ( +1, y ) is 1 for finite y. (NaN handled above) + // powr ( +1, y ) is 1 for finite y. (NaN handled above) return 1.0; } - if( y == 0.0 ) + if (y == 0.0) { - //powr ( +inf, +-0 ) returns NaN. - //powr ( +-0, +-0 ) returns NaN. - if( x == 0.0 || x == INFINITY ) - return cl_make_nan(); + // powr ( +inf, +-0 ) returns NaN. + // powr ( +-0, +-0 ) returns NaN. + if (x == 0.0 || x == INFINITY) return cl_make_nan(); - //powr ( x, +-0 ) is 1 for finite x > 0. (x <= 0, NaN, INF already handled above) + // powr ( x, +-0 ) is 1 for finite x > 0. (x <= 0, NaN, INF already + // handled above) return 1.0; } - if( x == 0.0 ) + if (x == 0.0) { - //powr ( +-0, -inf) is +inf. - //powr ( +-0, y ) is +inf for finite y < 0. - if( y < 0.0 ) - return INFINITY; + // powr ( +-0, -inf) is +inf. 
+ // powr ( +-0, y ) is +inf for finite y < 0. + if (y < 0.0) return INFINITY; - //powr ( +-0, y ) is +0 for y > 0. (NaN, y==0 handled above) + // powr ( +-0, y ) is +0 for y > 0. (NaN, y==0 handled above) return 0.0; } // x = +inf - if( isinf(x) ) + if (isinf(x)) { - if( y < 0 ) - return 0; + if (y < 0) return 0; return INFINITY; } double fabsx = reference_fabs(x); double fabsy = reference_fabs(y); - //y = +-inf cases - if( isinf(fabsy) ) + // y = +-inf cases + if (isinf(fabsy)) { - if( y < 0 ) + if (y < 0) { - if( fabsx < 1 ) - return INFINITY; + if (fabsx < 1) return INFINITY; return 0; } - if( fabsx < 1 ) - return 0; + if (fabsx < 1) return 0; return INFINITY; } @@ -840,169 +867,212 @@ double reference_powr( double x, double y ) return result; } -double reference_fract( double x, double *ip ) +double reference_fract(double x, double *ip) { - if(isnan(x)) { + if (isnan(x)) + { *ip = cl_make_nan(); return cl_make_nan(); } float i; - float f = modff((float) x, &i ); - if( f < 0.0 ) + float f = modff((float)x, &i); + if (f < 0.0) { f = 1.0f + f; i -= 1.0f; - if( f == 1.0f ) - f = HEX_FLT( +, 1, fffffe, -, 1 ); + if (f == 1.0f) f = HEX_FLT(+, 1, fffffe, -, 1); } *ip = i; return f; } -//double my_fdim( double x, double y){ return fdimf( (float) x, (float) y ); } -double reference_add( double x, double y ) +// double my_fdim( double x, double y){ return fdimf( (float) x, (float) y ); } +double reference_add(double x, double y) { - volatile float a = (float) x; - volatile float b = (float) y; + volatile float a = (float)x; + volatile float b = (float)y; -#if defined( __SSE__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64))) +#if defined(__SSE__) \ + || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) // defeat x87 - __m128 va = _mm_set_ss( (float) a ); - __m128 vb = _mm_set_ss( (float) b ); - va = _mm_add_ss( va, vb ); - _mm_store_ss( (float*) &a, va ); + __m128 va = _mm_set_ss((float)a); + __m128 vb = _mm_set_ss((float)b); + va = 
_mm_add_ss(va, vb); + _mm_store_ss((float *)&a, va); #elif defined(__PPC__) - // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes denorm's to zero. - // As such, the reference add with FTZ must be emulated in sw. - if (fpu_control & _FPU_MASK_NI) { - union{ cl_uint u; cl_float d; } ua; ua.d = a; - union{ cl_uint u; cl_float d; } ub; ub.d = b; - cl_uint mantA, mantB; - cl_ulong addendA, addendB, sum; - int expA = extractf( a, &mantA ); - int expB = extractf( b, &mantB ); - cl_uint signA = ua.u & 0x80000000U; - cl_uint signB = ub.u & 0x80000000U; + // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes + // denorm's to zero. As such, the reference add with FTZ must be emulated in + // sw. + if (fpu_control & _FPU_MASK_NI) + { + union { + cl_uint u; + cl_float d; + } ua; + ua.d = a; + union { + cl_uint u; + cl_float d; + } ub; + ub.d = b; + cl_uint mantA, mantB; + cl_ulong addendA, addendB, sum; + int expA = extractf(a, &mantA); + int expB = extractf(b, &mantB); + cl_uint signA = ua.u & 0x80000000U; + cl_uint signB = ub.u & 0x80000000U; - // Force matching exponents if an operand is 0 - if (a == 0.0f) { - expA = expB; - } else if (b == 0.0f) { - expB = expA; - } + // Force matching exponents if an operand is 0 + if (a == 0.0f) + { + expA = expB; + } + else if (b == 0.0f) + { + expB = expA; + } - addendA = (cl_ulong)mantA << 32; - addendB = (cl_ulong)mantB << 32; + addendA = (cl_ulong)mantA << 32; + addendB = (cl_ulong)mantB << 32; - if (expA >= expB) { - // Shift B relative to the A so that their exponents match - if( expA > expB ) - shift_right_sticky_64( &addendB, expA - expB ); + if (expA >= expB) + { + // Shift B relative to the A so that their exponents match + if (expA > expB) shift_right_sticky_64(&addendB, expA - expB); - // add - if( signA ^ signB ) - sub64( &addendA, addendB, &signA, &expA ); + // add + if (signA ^ signB) + sub64(&addendA, addendB, &signA, &expA); + else + add64(&addendA, addendB, &expA); + } else 
- add64( &addendA, addendB, &expA ); - } else { - // Shift the A relative to B so that their exponents match - shift_right_sticky_64( &addendA, expB - expA ); + { + // Shift the A relative to B so that their exponents match + shift_right_sticky_64(&addendA, expB - expA); - // add - if( signA ^ signB ) - sub64( &addendB, addendA, &signB, &expB ); + // add + if (signA ^ signB) + sub64(&addendB, addendA, &signB, &expB); + else + add64(&addendB, addendA, &expB); + + addendA = addendB; + expA = expB; + signA = signB; + } + + // round to IEEE result + if (gIsInRTZMode) + { + ua.d = round_toward_zero_float_ftz(addendA, expA); + } else - add64( &addendB, addendA, &expB ); - - addendA = addendB; - expA = expB; - signA = signB; - } - - // round to IEEE result - if (gIsInRTZMode) { - ua.d = round_toward_zero_float_ftz( addendA, expA ); - } else { - ua.d = round_to_nearest_even_float_ftz( addendA, expA ); - } - // Set the sign - ua.u |= signA; - a = ua.d; - } else { - a += b; + { + ua.d = round_to_nearest_even_float_ftz(addendA, expA); + } + // Set the sign + ua.u |= signA; + a = ua.d; + } + else + { + a += b; } #else a += b; #endif - return (double) a; - } + return (double)a; +} -double reference_subtract( double x, double y ) +double reference_subtract(double x, double y) { - volatile float a = (float) x; - volatile float b = (float) y; -#if defined( __SSE__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64))) + volatile float a = (float)x; + volatile float b = (float)y; +#if defined(__SSE__) \ + || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) // defeat x87 - __m128 va = _mm_set_ss( (float) a ); - __m128 vb = _mm_set_ss( (float) b ); - va = _mm_sub_ss( va, vb ); - _mm_store_ss( (float*) &a, va ); + __m128 va = _mm_set_ss((float)a); + __m128 vb = _mm_set_ss((float)b); + va = _mm_sub_ss(va, vb); + _mm_store_ss((float *)&a, va); #else a -= b; #endif return a; } -//double reference_divide( double x, double y ){ return (float) x / (float) y; } 
-double reference_multiply( double x, double y) +// double reference_divide( double x, double y ){ return (float) x / (float) y; +// } +double reference_multiply(double x, double y) { - volatile float a = (float) x; - volatile float b = (float) y; -#if defined( __SSE__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64))) + volatile float a = (float)x; + volatile float b = (float)y; +#if defined(__SSE__) \ + || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) // defeat x87 - __m128 va = _mm_set_ss( (float) a ); - __m128 vb = _mm_set_ss( (float) b ); - va = _mm_mul_ss( va, vb ); - _mm_store_ss( (float*) &a, va ); + __m128 va = _mm_set_ss((float)a); + __m128 vb = _mm_set_ss((float)b); + va = _mm_mul_ss(va, vb); + _mm_store_ss((float *)&a, va); #elif defined(__PPC__) - // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes denorm's to zero. - // As such, the reference multiply with FTZ must be emulated in sw. - if (fpu_control & _FPU_MASK_NI) { - // extract exponent and mantissa - // exponent is a standard unbiased signed integer - // mantissa is a cl_uint, with leading non-zero bit positioned at the MSB - union{ cl_uint u; cl_float d; } ua; ua.d = a; - union{ cl_uint u; cl_float d; } ub; ub.d = b; - cl_uint mantA, mantB; - int expA = extractf( a, &mantA ); - int expB = extractf( b, &mantB ); + // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes + // denorm's to zero. As such, the reference multiply with FTZ must be + // emulated in sw. 
+ if (fpu_control & _FPU_MASK_NI) + { + // extract exponent and mantissa + // exponent is a standard unbiased signed integer + // mantissa is a cl_uint, with leading non-zero bit positioned at the + // MSB + union { + cl_uint u; + cl_float d; + } ua; + ua.d = a; + union { + cl_uint u; + cl_float d; + } ub; + ub.d = b; + cl_uint mantA, mantB; + int expA = extractf(a, &mantA); + int expB = extractf(b, &mantB); - // exact product of A and B - int exponent = expA + expB; - cl_uint sign = (ua.u ^ ub.u) & 0x80000000U; - cl_ulong product = (cl_ulong) mantA * (cl_ulong) mantB; + // exact product of A and B + int exponent = expA + expB; + cl_uint sign = (ua.u ^ ub.u) & 0x80000000U; + cl_ulong product = (cl_ulong)mantA * (cl_ulong)mantB; - // renormalize -- 1.m * 1.n yields a number between 1.0 and 3.99999.. - // The MSB might not be set. If so, fix that. Otherwise, reflect the fact that we got another power of two from the multiplication - if( 0 == (0x8000000000000000ULL & product) ) - product <<= 1; - else - exponent++; // 2**31 * 2**31 gives 2**62. If the MSB was set, then our exponent increased. + // renormalize -- 1.m * 1.n yields a number between 1.0 and 3.99999.. + // The MSB might not be set. If so, fix that. Otherwise, reflect the + // fact that we got another power of two from the multiplication + if (0 == (0x8000000000000000ULL & product)) + product <<= 1; + else + exponent++; // 2**31 * 2**31 gives 2**62. If the MSB was set, then + // our exponent increased. - // round to IEEE result -- we do not do flushing to zero here. That part is handled manually in ternary.c. - if (gIsInRTZMode) { - ua.d = round_toward_zero_float_ftz( product, exponent); - } else { - ua.d = round_to_nearest_even_float_ftz( product, exponent); - } - // Set the sign - ua.u |= sign; - a = ua.d; - } else { - a *= b; + // round to IEEE result -- we do not do flushing to zero here. That part + // is handled manually in ternary.c. 
+ if (gIsInRTZMode) + { + ua.d = round_toward_zero_float_ftz(product, exponent); + } + else + { + ua.d = round_to_nearest_even_float_ftz(product, exponent); + } + // Set the sign + ua.u |= sign; + a = ua.d; + } + else + { + a *= b; } #else a *= b; @@ -1022,7 +1092,7 @@ double reference_multiply( double x, double y) return (double) remquof( (float) x, (float) y, iptr ); }*/ -double reference_lgamma_r( double x, int *signp ) +double reference_lgamma_r(double x, int *signp) { // This is not currently tested *signp = 0; @@ -1030,81 +1100,93 @@ double reference_lgamma_r( double x, int *signp ) } -int reference_isequal( double x, double y ){ return x == y; } -int reference_isfinite( double x ){ return 0 != isfinite(x); } -int reference_isgreater( double x, double y ){ return x > y; } -int reference_isgreaterequal( double x, double y ){ return x >= y; } -int reference_isinf( double x ){ return 0 != isinf(x); } -int reference_isless( double x, double y ){ return x < y; } -int reference_islessequal( double x, double y ){ return x <= y; } -int reference_islessgreater( double x, double y ){ return 0 != islessgreater( x, y ); } -int reference_isnan( double x ){ return 0 != isnan( x ); } -int reference_isnormal( double x ){ return 0 != isnormal( (float) x ); } -int reference_isnotequal( double x, double y ){ return x != y; } -int reference_isordered( double x, double y){ return x == x && y == y; } -int reference_isunordered( double x, double y ){ return isnan(x) || isnan( y ); } -int reference_signbit( float x ){ return 0 != signbit( x ); } +int reference_isequal(double x, double y) { return x == y; } +int reference_isfinite(double x) { return 0 != isfinite(x); } +int reference_isgreater(double x, double y) { return x > y; } +int reference_isgreaterequal(double x, double y) { return x >= y; } +int reference_isinf(double x) { return 0 != isinf(x); } +int reference_isless(double x, double y) { return x < y; } +int reference_islessequal(double x, double y) { return x <= y; } +int 
reference_islessgreater(double x, double y) +{ + return 0 != islessgreater(x, y); +} +int reference_isnan(double x) { return 0 != isnan(x); } +int reference_isnormal(double x) { return 0 != isnormal((float)x); } +int reference_isnotequal(double x, double y) { return x != y; } +int reference_isordered(double x, double y) { return x == x && y == y; } +int reference_isunordered(double x, double y) { return isnan(x) || isnan(y); } +int reference_signbit(float x) { return 0 != signbit(x); } #if 1 // defined( _MSC_VER ) -//Missing functions for win32 +// Missing functions for win32 -float reference_copysign( float x, float y ) +float reference_copysign(float x, float y) { - union { float f; cl_uint u;} ux, uy; - ux.f = x; uy.f = y; + union { + float f; + cl_uint u; + } ux, uy; + ux.f = x; + uy.f = y; ux.u &= 0x7fffffffU; ux.u |= uy.u & 0x80000000U; return ux.f; } -double reference_copysignd( double x, double y ) +double reference_copysignd(double x, double y) { - union { double f; cl_ulong u;} ux, uy; - ux.f = x; uy.f = y; + union { + double f; + cl_ulong u; + } ux, uy; + ux.f = x; + uy.f = y; ux.u &= 0x7fffffffffffffffULL; ux.u |= uy.u & 0x8000000000000000ULL; return ux.f; } -double reference_round( double x ) +double reference_round(double x) { double absx = reference_fabs(x); - if( absx < 0.5 ) - return reference_copysignd( 0.0, x ); + if (absx < 0.5) return reference_copysignd(0.0, x); - if( absx < HEX_DBL( +, 1, 0, +, 53 ) ) - x = reference_trunc( x + reference_copysignd( 0.5, x ) ); + if (absx < HEX_DBL(+, 1, 0, +, 53)) + x = reference_trunc(x + reference_copysignd(0.5, x)); return x; } -double reference_trunc( double x ) +double reference_trunc(double x) { - if( fabs(x) < HEX_DBL( +, 1, 0, +, 53 ) ) + if (fabs(x) < HEX_DBL(+, 1, 0, +, 53)) { - cl_long l = (cl_long) x; + cl_long l = (cl_long)x; - return reference_copysignd( (double) l, x ); + return reference_copysignd((double)l, x); } return x; } #ifndef FP_ILOGB0 - #define FP_ILOGB0 INT_MIN +#define FP_ILOGB0 
INT_MIN #endif #ifndef FP_ILOGBNAN - #define FP_ILOGBNAN INT_MAX +#define FP_ILOGBNAN INT_MAX #endif - -double reference_cbrt(double x){ return reference_copysignd( reference_pow( reference_fabs(x), 1.0/3.0 ), x ); } +double reference_cbrt(double x) +{ + return reference_copysignd(reference_pow(reference_fabs(x), 1.0 / 3.0), x); +} /* double reference_scalbn(double x, int i) @@ -1122,174 +1204,188 @@ double reference_scalbn(double x, int i) } */ -double reference_rint( double x ) +double reference_rint(double x) { - if( reference_fabs(x) < HEX_DBL( +, 1, 0, +, 52 ) ) + if (reference_fabs(x) < HEX_DBL(+, 1, 0, +, 52)) { - double magic = reference_copysignd( HEX_DBL( +, 1, 0, +, 52 ), x ); + double magic = reference_copysignd(HEX_DBL(+, 1, 0, +, 52), x); double rounded = (x + magic) - magic; - x = reference_copysignd( rounded, x ); + x = reference_copysignd(rounded, x); } return x; } -double reference_acosh( double x ) +double reference_acosh(double x) { // not full precision. Sufficient precision to cover float - if( isnan(x) ) - return x + x; + if (isnan(x)) return x + x; - if( x < 1.0 ) - return cl_make_nan(); + if (x < 1.0) return cl_make_nan(); - return reference_log( x + reference_sqrt(x + 1) * reference_sqrt(x-1) ); + return reference_log(x + reference_sqrt(x + 1) * reference_sqrt(x - 1)); } -double reference_asinh( double x ) +double reference_asinh(double x) { -/* - * ==================================================== - * This function is from fdlibm: http://www.netlib.org - * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. - * - * Developed at SunSoft, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. 
- * ==================================================== - */ - if( isnan(x) || isinf(x) ) - return x + x; + /* + * ==================================================== + * This function is from fdlibm: http://www.netlib.org + * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunSoft, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + */ + if (isnan(x) || isinf(x)) return x + x; double absx = reference_fabs(x); - if( absx < HEX_DBL( +, 1, 0, -, 28 ) ) - return x; + if (absx < HEX_DBL(+, 1, 0, -, 28)) return x; double sign = reference_copysignd(1.0, x); - if( absx > HEX_DBL( +, 1, 0, +, 28 ) ) - return sign * (reference_log( absx ) + 0.693147180559945309417232121458176568); // log(2) + if (absx > HEX_DBL(+, 1, 0, +, 28)) + return sign + * (reference_log(absx) + + 0.693147180559945309417232121458176568); // log(2) - if( absx > 2.0 ) - return sign * reference_log( 2.0 * absx + 1.0 / (reference_sqrt( x * x + 1.0 ) + absx)); + if (absx > 2.0) + return sign + * reference_log(2.0 * absx + + 1.0 / (reference_sqrt(x * x + 1.0) + absx)); - return sign * reference_log1p( absx + x*x / (1.0 + reference_sqrt(1.0 + x*x))); + return sign + * reference_log1p(absx + x * x / (1.0 + reference_sqrt(1.0 + x * x))); } -double reference_atanh( double x ) +double reference_atanh(double x) { -/* - * ==================================================== - * This function is from fdlibm: http://www.netlib.org - * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. - * - * Developed at SunSoft, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. 
- * ==================================================== - */ - if( isnan(x) ) - return x + x; + /* + * ==================================================== + * This function is from fdlibm: http://www.netlib.org + * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunSoft, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + */ + if (isnan(x)) return x + x; - double signed_half = reference_copysignd( 0.5, x ); + double signed_half = reference_copysignd(0.5, x); x = reference_fabs(x); - if( x > 1.0 ) - return cl_make_nan(); + if (x > 1.0) return cl_make_nan(); - if( x < 0.5 ) - return signed_half * reference_log1p( 2.0 * ( x + x*x / (1-x) ) ); + if (x < 0.5) + return signed_half * reference_log1p(2.0 * (x + x * x / (1 - x))); - return signed_half * reference_log1p(2.0 * x / (1-x)); + return signed_half * reference_log1p(2.0 * x / (1 - x)); } double reference_relaxed_atan(double x) { return reference_atan(x); } -double reference_relaxed_exp2( double x ) -{ - return reference_exp2(x); -} +double reference_relaxed_exp2(double x) { return reference_exp2(x); } -double reference_exp2( double x ) -{ // Note: only suitable for verifying single precision. Doesn't have range of a full double exp2 implementation. - if( x == 0.0 ) - return 1.0; +double reference_exp2(double x) +{ // Note: only suitable for verifying single precision. Doesn't have range of a + // full double exp2 implementation. 
+ if (x == 0.0) return 1.0; // separate x into fractional and integer parts - double i = reference_rint( x ); // round to nearest integer + double i = reference_rint(x); // round to nearest integer - if( i < -150 ) - return 0.0; + if (i < -150) return 0.0; - if( i > 129 ) - return INFINITY; + if (i > 129) return INFINITY; - double f = x - i; // -0.5 <= f <= 0.5 + double f = x - i; // -0.5 <= f <= 0.5 // find exp2(f) // calculate as p(f) = (exp2(f)-1)/f // exp2(f) = f * p(f) + 1 // p(f) is a minimax polynomial with error within 0x1.c1fd80f0d1ab7p-50 - double p = 0.693147180560184539289 + - (0.240226506955902863183 + - (0.055504108656833424373 + - (0.009618129212846484796 + - (0.001333355902958566035 + - (0.000154034191902497930 + - (0.000015252317761038105 + - (0.000001326283129417092 + 0.000000102593187638680 * f)*f)*f)*f)*f)*f)*f)*f; + double p = 0.693147180560184539289 + + (0.240226506955902863183 + + (0.055504108656833424373 + + (0.009618129212846484796 + + (0.001333355902958566035 + + (0.000154034191902497930 + + (0.000015252317761038105 + + (0.000001326283129417092 + + 0.000000102593187638680 * f) + * f) + * f) + * f) + * f) + * f) + * f) + * f; f *= p; f += 1.0; // scale by 2 ** i - union{ cl_ulong u; double d; } u; - int exponent = (int) i + 1023; - u.u = (cl_ulong) exponent << 52; + union { + cl_ulong u; + double d; + } u; + int exponent = (int)i + 1023; + u.u = (cl_ulong)exponent << 52; return f * u.d; } -double reference_expm1( double x ) -{ // Note: only suitable for verifying single precision. Doesn't have range of a full double expm1 implementation. It is only accurate to 47 bits or less. +double reference_expm1(double x) +{ // Note: only suitable for verifying single precision. Doesn't have range of a + // full double expm1 implementation. It is only accurate to 47 bits or less. // early out for small numbers and NaNs - if( ! 
(reference_fabs(x) > HEX_DBL( +, 1, 0, -, 24 )) ) - return x; + if (!(reference_fabs(x) > HEX_DBL(+, 1, 0, -, 24))) return x; // early out for large negative numbers - if( x < -130.0 ) - return -1.0; + if (x < -130.0) return -1.0; // early out for large positive numbers - if( x > 100.0 ) - return INFINITY; + if (x > 100.0) return INFINITY; // separate x into fractional and integer parts - double i = reference_rint( x ); // round to nearest integer - double f = x - i; // -0.5 <= f <= 0.5 + double i = reference_rint(x); // round to nearest integer + double f = x - i; // -0.5 <= f <= 0.5 // reduce f to the range -0.0625 .. f.. 0.0625 - int index = (int) (f * 16.0) + 8; // 0...16 + int index = (int)(f * 16.0) + 8; // 0...16 - static const double reduction[17] = { -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625, - 0.0, - +0.0625, +0.125, +0.1875, +0.25, +0.3125, +0.375, +0.4375, +0.5 }; + static const double reduction[17] = { -0.5, -0.4375, -0.375, -0.3125, + -0.25, -0.1875, -0.125, -0.0625, + 0.0, +0.0625, +0.125, +0.1875, + +0.25, +0.3125, +0.375, +0.4375, + +0.5 }; // exponentials[i] = expm1(reduction[i]) - static const double exponentials[17] = { HEX_DBL( -, 1, 92e9a0720d3ec, -, 2 ), HEX_DBL( -, 1, 6adb1cd9205ee, -, 2 ), - HEX_DBL( -, 1, 40373d42ce2e3, -, 2 ), HEX_DBL( -, 1, 12d35a41ba104, -, 2 ), - HEX_DBL( -, 1, c5041854df7d4, -, 3 ), HEX_DBL( -, 1, 5e25fb4fde211, -, 3 ), - HEX_DBL( -, 1, e14aed893eef4, -, 4 ), HEX_DBL( -, 1, f0540438fd5c3, -, 5 ), - HEX_DBL( +, 0, 0, +, 0 ), - HEX_DBL( +, 1, 082b577d34ed8, -, 4 ), HEX_DBL( +, 1, 10b022db7ae68, -, 3 ), - HEX_DBL( +, 1, a65c0b85ac1a9, -, 3 ), HEX_DBL( +, 1, 22d78f0fa061a, -, 2 ), - HEX_DBL( +, 1, 77a45d8117fd5, -, 2 ), HEX_DBL( +, 1, d1e944f6fbdaa, -, 2 ), - HEX_DBL( +, 1, 190048ef6002, -, 1 ), HEX_DBL( +, 1, 4c2531c3c0d38, -, 1 ), - }; + static const double exponentials[17] = { + HEX_DBL(-, 1, 92e9a0720d3ec, -, 2), + HEX_DBL(-, 1, 6adb1cd9205ee, -, 2), + HEX_DBL(-, 1, 40373d42ce2e3, -, 2), + 
HEX_DBL(-, 1, 12d35a41ba104, -, 2), + HEX_DBL(-, 1, c5041854df7d4, -, 3), + HEX_DBL(-, 1, 5e25fb4fde211, -, 3), + HEX_DBL(-, 1, e14aed893eef4, -, 4), + HEX_DBL(-, 1, f0540438fd5c3, -, 5), + HEX_DBL(+, 0, 0, +, 0), + HEX_DBL(+, 1, 082b577d34ed8, -, 4), + HEX_DBL(+, 1, 10b022db7ae68, -, 3), + HEX_DBL(+, 1, a65c0b85ac1a9, -, 3), + HEX_DBL(+, 1, 22d78f0fa061a, -, 2), + HEX_DBL(+, 1, 77a45d8117fd5, -, 2), + HEX_DBL(+, 1, d1e944f6fbdaa, -, 2), + HEX_DBL(+, 1, 190048ef6002, -, 1), + HEX_DBL(+, 1, 4c2531c3c0d38, -, 1), + }; f -= reduction[index]; @@ -1297,223 +1393,368 @@ double reference_expm1( double x ) // find expm1(f) // calculate as p(f) = (exp(f)-1)/f // expm1(f) = f * p(f) - // p(f) is a minimax polynomial with error within 0x1.1d7693618d001p-48 over the range +- 0.0625 - double p = 0.999999999999998001599 + - (0.499999999999839628284 + - (0.166666666672817459505 + - (0.041666666612283048687 + - (0.008333330214567431435 + - (0.001389005319303770070 + 0.000198833381525156667 * f)*f)*f)*f)*f)*f; + // p(f) is a minimax polynomial with error within 0x1.1d7693618d001p-48 over + // the range +- 0.0625 + double p = 0.999999999999998001599 + + (0.499999999999839628284 + + (0.166666666672817459505 + + (0.041666666612283048687 + + (0.008333330214567431435 + + (0.001389005319303770070 + 0.000198833381525156667 * f) + * f) + * f) + * f) + * f) + * f; f *= p; // expm1( reduced f ) // expm1(f) = (exmp1( reduced_f) + 1.0) * ( exponentials[index] + 1 ) - 1 - // = exmp1( reduced_f) * exponentials[index] + exmp1( reduced_f) + exponentials[index] + 1 -1 - // = exmp1( reduced_f) * exponentials[index] + exmp1( reduced_f) + exponentials[index] - f += exponentials[index] + f * exponentials[index]; + // = exmp1( reduced_f) * exponentials[index] + exmp1( reduced_f) + + // exponentials[index] + 1 -1 = exmp1( reduced_f) * + // exponentials[index] + exmp1( reduced_f) + exponentials[index] + f += exponentials[index] + f * exponentials[index]; // scale by e ** i - int exponent = (int) i; - if( 
0 == exponent ) - return f; // precise answer for x near 1 + int exponent = (int)i; + if (0 == exponent) return f; // precise answer for x near 1 // table of e**(i-150) - static const double exp_table[128+150+1] = - { - HEX_DBL( +, 1, 82e16284f5ec5, -, 217 ), HEX_DBL( +, 1, 06e9996332ba1, -, 215 ), - HEX_DBL( +, 1, 6555cb289e44b, -, 214 ), HEX_DBL( +, 1, e5ab364643354, -, 213 ), - HEX_DBL( +, 1, 4a0bd18e64df7, -, 211 ), HEX_DBL( +, 1, c094499cc578e, -, 210 ), - HEX_DBL( +, 1, 30d759323998c, -, 208 ), HEX_DBL( +, 1, 9e5278ab1d4cf, -, 207 ), - HEX_DBL( +, 1, 198fa3f30be25, -, 205 ), HEX_DBL( +, 1, 7eae636d6144e, -, 204 ), - HEX_DBL( +, 1, 040f1036f4863, -, 202 ), HEX_DBL( +, 1, 6174e477a895f, -, 201 ), - HEX_DBL( +, 1, e065b82dd95a, -, 200 ), HEX_DBL( +, 1, 4676be491d129, -, 198 ), - HEX_DBL( +, 1, bbb5da5f7c823, -, 197 ), HEX_DBL( +, 1, 2d884eef5fdcb, -, 195 ), - HEX_DBL( +, 1, 99d3397ab8371, -, 194 ), HEX_DBL( +, 1, 1681497ed15b3, -, 192 ), - HEX_DBL( +, 1, 7a870f597fdbd, -, 191 ), HEX_DBL( +, 1, 013c74edba307, -, 189 ), - HEX_DBL( +, 1, 5d9ec4ada7938, -, 188 ), HEX_DBL( +, 1, db2edfd20fa7c, -, 187 ), - HEX_DBL( +, 1, 42eb9f39afb0b, -, 185 ), HEX_DBL( +, 1, b6e4f282b43f4, -, 184 ), - HEX_DBL( +, 1, 2a42764857b19, -, 182 ), HEX_DBL( +, 1, 9560792d19314, -, 181 ), - HEX_DBL( +, 1, 137b6ce8e052c, -, 179 ), HEX_DBL( +, 1, 766b45dd84f18, -, 178 ), - HEX_DBL( +, 1, fce362fe6e7d, -, 177 ), HEX_DBL( +, 1, 59d34dd8a5473, -, 175 ), - HEX_DBL( +, 1, d606847fc727a, -, 174 ), HEX_DBL( +, 1, 3f6a58b795de3, -, 172 ), - HEX_DBL( +, 1, b2216c6efdac1, -, 171 ), HEX_DBL( +, 1, 2705b5b153fb8, -, 169 ), - HEX_DBL( +, 1, 90fa1509bd50d, -, 168 ), HEX_DBL( +, 1, 107df698da211, -, 166 ), - HEX_DBL( +, 1, 725ae6e7b9d35, -, 165 ), HEX_DBL( +, 1, f75d6040aeff6, -, 164 ), - HEX_DBL( +, 1, 56126259e093c, -, 162 ), HEX_DBL( +, 1, d0ec7df4f7bd4, -, 161 ), - HEX_DBL( +, 1, 3bf2cf6722e46, -, 159 ), HEX_DBL( +, 1, ad6b22f55db42, -, 158 ), - HEX_DBL( +, 1, 23d1f3e5834a, -, 156 ), HEX_DBL( +, 1, 
8c9feab89b876, -, 155 ), - HEX_DBL( +, 1, 0d88cf37f00dd, -, 153 ), HEX_DBL( +, 1, 6e55d2bf838a7, -, 152 ), - HEX_DBL( +, 1, f1e6b68529e33, -, 151 ), HEX_DBL( +, 1, 525be4e4e601d, -, 149 ), - HEX_DBL( +, 1, cbe0a45f75eb1, -, 148 ), HEX_DBL( +, 1, 3884e838aea68, -, 146 ), - HEX_DBL( +, 1, a8c1f14e2af5d, -, 145 ), HEX_DBL( +, 1, 20a717e64a9bd, -, 143 ), - HEX_DBL( +, 1, 8851d84118908, -, 142 ), HEX_DBL( +, 1, 0a9bdfb02d24, -, 140 ), - HEX_DBL( +, 1, 6a5bea046b42e, -, 139 ), HEX_DBL( +, 1, ec7f3b269efa8, -, 138 ), - HEX_DBL( +, 1, 4eafb87eab0f2, -, 136 ), HEX_DBL( +, 1, c6e2d05bbc, -, 135 ), - HEX_DBL( +, 1, 35208867c2683, -, 133 ), HEX_DBL( +, 1, a425b317eeacd, -, 132 ), - HEX_DBL( +, 1, 1d8508fa8246a, -, 130 ), HEX_DBL( +, 1, 840fbc08fdc8a, -, 129 ), - HEX_DBL( +, 1, 07b7112bc1ffe, -, 127 ), HEX_DBL( +, 1, 666d0dad2961d, -, 126 ), - HEX_DBL( +, 1, e726c3f64d0fe, -, 125 ), HEX_DBL( +, 1, 4b0dc07cabf98, -, 123 ), - HEX_DBL( +, 1, c1f2daf3b6a46, -, 122 ), HEX_DBL( +, 1, 31c5957a47de2, -, 120 ), - HEX_DBL( +, 1, 9f96445648b9f, -, 119 ), HEX_DBL( +, 1, 1a6baeadb4fd1, -, 117 ), - HEX_DBL( +, 1, 7fd974d372e45, -, 116 ), HEX_DBL( +, 1, 04da4d1452919, -, 114 ), - HEX_DBL( +, 1, 62891f06b345, -, 113 ), HEX_DBL( +, 1, e1dd273aa8a4a, -, 112 ), - HEX_DBL( +, 1, 4775e0840bfdd, -, 110 ), HEX_DBL( +, 1, bd109d9d94bda, -, 109 ), - HEX_DBL( +, 1, 2e73f53fba844, -, 107 ), HEX_DBL( +, 1, 9b138170d6bfe, -, 106 ), - HEX_DBL( +, 1, 175af0cf60ec5, -, 104 ), HEX_DBL( +, 1, 7baee1bffa80b, -, 103 ), - HEX_DBL( +, 1, 02057d1245ceb, -, 101 ), HEX_DBL( +, 1, 5eafffb34ba31, -, 100 ), - HEX_DBL( +, 1, dca23bae16424, -, 99 ), HEX_DBL( +, 1, 43e7fc88b8056, -, 97 ), - HEX_DBL( +, 1, b83bf23a9a9eb, -, 96 ), HEX_DBL( +, 1, 2b2b8dd05b318, -, 94 ), - HEX_DBL( +, 1, 969d47321e4cc, -, 93 ), HEX_DBL( +, 1, 1452b7723aed2, -, 91 ), - HEX_DBL( +, 1, 778fe2497184c, -, 90 ), HEX_DBL( +, 1, fe7116182e9cc, -, 89 ), - HEX_DBL( +, 1, 5ae191a99585a, -, 87 ), HEX_DBL( +, 1, d775d87da854d, -, 86 ), - HEX_DBL( +, 1, 
4063f8cc8bb98, -, 84 ), HEX_DBL( +, 1, b374b315f87c1, -, 83 ), - HEX_DBL( +, 1, 27ec458c65e3c, -, 81 ), HEX_DBL( +, 1, 923372c67a074, -, 80 ), - HEX_DBL( +, 1, 1152eaeb73c08, -, 78 ), HEX_DBL( +, 1, 737c5645114b5, -, 77 ), - HEX_DBL( +, 1, f8e6c24b5592e, -, 76 ), HEX_DBL( +, 1, 571db733a9d61, -, 74 ), - HEX_DBL( +, 1, d257d547e083f, -, 73 ), HEX_DBL( +, 1, 3ce9b9de78f85, -, 71 ), - HEX_DBL( +, 1, aebabae3a41b5, -, 70 ), HEX_DBL( +, 1, 24b6031b49bda, -, 68 ), - HEX_DBL( +, 1, 8dd5e1bb09d7e, -, 67 ), HEX_DBL( +, 1, 0e5b73d1ff53d, -, 65 ), - HEX_DBL( +, 1, 6f741de1748ec, -, 64 ), HEX_DBL( +, 1, f36bd37f42f3e, -, 63 ), - HEX_DBL( +, 1, 536452ee2f75c, -, 61 ), HEX_DBL( +, 1, cd480a1b7482, -, 60 ), - HEX_DBL( +, 1, 39792499b1a24, -, 58 ), HEX_DBL( +, 1, aa0de4bf35b38, -, 57 ), - HEX_DBL( +, 1, 2188ad6ae3303, -, 55 ), HEX_DBL( +, 1, 898471fca6055, -, 54 ), - HEX_DBL( +, 1, 0b6c3afdde064, -, 52 ), HEX_DBL( +, 1, 6b7719a59f0e, -, 51 ), - HEX_DBL( +, 1, ee001eed62aa, -, 50 ), HEX_DBL( +, 1, 4fb547c775da8, -, 48 ), - HEX_DBL( +, 1, c8464f7616468, -, 47 ), HEX_DBL( +, 1, 36121e24d3bba, -, 45 ), - HEX_DBL( +, 1, a56e0c2ac7f75, -, 44 ), HEX_DBL( +, 1, 1e642baeb84a, -, 42 ), - HEX_DBL( +, 1, 853f01d6d53ba, -, 41 ), HEX_DBL( +, 1, 0885298767e9a, -, 39 ), - HEX_DBL( +, 1, 67852a7007e42, -, 38 ), HEX_DBL( +, 1, e8a37a45fc32e, -, 37 ), - HEX_DBL( +, 1, 4c1078fe9228a, -, 35 ), HEX_DBL( +, 1, c3527e433fab1, -, 34 ), - HEX_DBL( +, 1, 32b48bf117da2, -, 32 ), HEX_DBL( +, 1, a0db0d0ddb3ec, -, 31 ), - HEX_DBL( +, 1, 1b48655f37267, -, 29 ), HEX_DBL( +, 1, 81056ff2c5772, -, 28 ), - HEX_DBL( +, 1, 05a628c699fa1, -, 26 ), HEX_DBL( +, 1, 639e3175a689d, -, 25 ), - HEX_DBL( +, 1, e355bbaee85cb, -, 24 ), HEX_DBL( +, 1, 4875ca227ec38, -, 22 ), - HEX_DBL( +, 1, be6c6fdb01612, -, 21 ), HEX_DBL( +, 1, 2f6053b981d98, -, 19 ), - HEX_DBL( +, 1, 9c54c3b43bc8b, -, 18 ), HEX_DBL( +, 1, 18354238f6764, -, 16 ), - HEX_DBL( +, 1, 7cd79b5647c9b, -, 15 ), HEX_DBL( +, 1, 02cf22526545a, -, 13 ), - HEX_DBL( +, 1, 
5fc21041027ad, -, 12 ), HEX_DBL( +, 1, de16b9c24a98f, -, 11 ), - HEX_DBL( +, 1, 44e51f113d4d6, -, 9 ), HEX_DBL( +, 1, b993fe00d5376, -, 8 ), - HEX_DBL( +, 1, 2c155b8213cf4, -, 6 ), HEX_DBL( +, 1, 97db0ccceb0af, -, 5 ), - HEX_DBL( +, 1, 152aaa3bf81cc, -, 3 ), HEX_DBL( +, 1, 78b56362cef38, -, 2 ), - HEX_DBL( +, 1, 0, +, 0 ), HEX_DBL( +, 1, 5bf0a8b145769, +, 1 ), - HEX_DBL( +, 1, d8e64b8d4ddae, +, 2 ), HEX_DBL( +, 1, 415e5bf6fb106, +, 4 ), - HEX_DBL( +, 1, b4c902e273a58, +, 5 ), HEX_DBL( +, 1, 28d389970338f, +, 7 ), - HEX_DBL( +, 1, 936dc5690c08f, +, 8 ), HEX_DBL( +, 1, 122885aaeddaa, +, 10 ), - HEX_DBL( +, 1, 749ea7d470c6e, +, 11 ), HEX_DBL( +, 1, fa7157c470f82, +, 12 ), - HEX_DBL( +, 1, 5829dcf95056, +, 14 ), HEX_DBL( +, 1, d3c4488ee4f7f, +, 15 ), - HEX_DBL( +, 1, 3de1654d37c9a, +, 17 ), HEX_DBL( +, 1, b00b5916ac955, +, 18 ), - HEX_DBL( +, 1, 259ac48bf05d7, +, 20 ), HEX_DBL( +, 1, 8f0ccafad2a87, +, 21 ), - HEX_DBL( +, 1, 0f2ebd0a8002, +, 23 ), HEX_DBL( +, 1, 709348c0ea4f9, +, 24 ), - HEX_DBL( +, 1, f4f22091940bd, +, 25 ), HEX_DBL( +, 1, 546d8f9ed26e1, +, 27 ), - HEX_DBL( +, 1, ceb088b68e804, +, 28 ), HEX_DBL( +, 1, 3a6e1fd9eecfd, +, 30 ), - HEX_DBL( +, 1, ab5adb9c436, +, 31 ), HEX_DBL( +, 1, 226af33b1fdc1, +, 33 ), - HEX_DBL( +, 1, 8ab7fb5475fb7, +, 34 ), HEX_DBL( +, 1, 0c3d3920962c9, +, 36 ), - HEX_DBL( +, 1, 6c932696a6b5d, +, 37 ), HEX_DBL( +, 1, ef822f7f6731d, +, 38 ), - HEX_DBL( +, 1, 50bba3796379a, +, 40 ), HEX_DBL( +, 1, c9aae4631c056, +, 41 ), - HEX_DBL( +, 1, 370470aec28ed, +, 43 ), HEX_DBL( +, 1, a6b765d8cdf6d, +, 44 ), - HEX_DBL( +, 1, 1f43fcc4b662c, +, 46 ), HEX_DBL( +, 1, 866f34a725782, +, 47 ), - HEX_DBL( +, 1, 0953e2f3a1ef7, +, 49 ), HEX_DBL( +, 1, 689e221bc8d5b, +, 50 ), - HEX_DBL( +, 1, ea215a1d20d76, +, 51 ), HEX_DBL( +, 1, 4d13fbb1a001a, +, 53 ), - HEX_DBL( +, 1, c4b334617cc67, +, 54 ), HEX_DBL( +, 1, 33a43d282a519, +, 56 ), - HEX_DBL( +, 1, a220d397972eb, +, 57 ), HEX_DBL( +, 1, 1c25c88df6862, +, 59 ), - HEX_DBL( +, 1, 8232558201159, +, 60 ), 
HEX_DBL( +, 1, 0672a3c9eb871, +, 62 ), - HEX_DBL( +, 1, 64b41c6d37832, +, 63 ), HEX_DBL( +, 1, e4cf766fe49be, +, 64 ), - HEX_DBL( +, 1, 49767bc0483e3, +, 66 ), HEX_DBL( +, 1, bfc951eb8bb76, +, 67 ), - HEX_DBL( +, 1, 304d6aeca254b, +, 69 ), HEX_DBL( +, 1, 9d97010884251, +, 70 ), - HEX_DBL( +, 1, 19103e4080b45, +, 72 ), HEX_DBL( +, 1, 7e013cd114461, +, 73 ), - HEX_DBL( +, 1, 03996528e074c, +, 75 ), HEX_DBL( +, 1, 60d4f6fdac731, +, 76 ), - HEX_DBL( +, 1, df8c5af17ba3b, +, 77 ), HEX_DBL( +, 1, 45e3076d61699, +, 79 ), - HEX_DBL( +, 1, baed16a6e0da7, +, 80 ), HEX_DBL( +, 1, 2cffdfebde1a1, +, 82 ), - HEX_DBL( +, 1, 9919cabefcb69, +, 83 ), HEX_DBL( +, 1, 160345c9953e3, +, 85 ), - HEX_DBL( +, 1, 79dbc9dc53c66, +, 86 ), HEX_DBL( +, 1, 00c810d464097, +, 88 ), - HEX_DBL( +, 1, 5d009394c5c27, +, 89 ), HEX_DBL( +, 1, da57de8f107a8, +, 90 ), - HEX_DBL( +, 1, 425982cf597cd, +, 92 ), HEX_DBL( +, 1, b61e5ca3a5e31, +, 93 ), - HEX_DBL( +, 1, 29bb825dfcf87, +, 95 ), HEX_DBL( +, 1, 94a90db0d6fe2, +, 96 ), - HEX_DBL( +, 1, 12fec759586fd, +, 98 ), HEX_DBL( +, 1, 75c1dc469e3af, +, 99 ), - HEX_DBL( +, 1, fbfd219c43b04, +, 100 ), HEX_DBL( +, 1, 5936d44e1a146, +, 102 ), - HEX_DBL( +, 1, d531d8a7ee79c, +, 103 ), HEX_DBL( +, 1, 3ed9d24a2d51b, +, 105 ), - HEX_DBL( +, 1, b15cfe5b6e17b, +, 106 ), HEX_DBL( +, 1, 268038c2c0e, +, 108 ), - HEX_DBL( +, 1, 9044a73545d48, +, 109 ), HEX_DBL( +, 1, 1002ab6218b38, +, 111 ), - HEX_DBL( +, 1, 71b3540cbf921, +, 112 ), HEX_DBL( +, 1, f6799ea9c414a, +, 113 ), - HEX_DBL( +, 1, 55779b984f3eb, +, 115 ), HEX_DBL( +, 1, d01a210c44aa4, +, 116 ), - HEX_DBL( +, 1, 3b63da8e9121, +, 118 ), HEX_DBL( +, 1, aca8d6b0116b8, +, 119 ), - HEX_DBL( +, 1, 234de9e0c74e9, +, 121 ), HEX_DBL( +, 1, 8bec7503ca477, +, 122 ), - HEX_DBL( +, 1, 0d0eda9796b9, +, 124 ), HEX_DBL( +, 1, 6db0118477245, +, 125 ), - HEX_DBL( +, 1, f1056dc7bf22d, +, 126 ), HEX_DBL( +, 1, 51c2cc3433801, +, 128 ), - HEX_DBL( +, 1, cb108ffbec164, +, 129 ), HEX_DBL( +, 1, 37f780991b584, +, 131 ), - HEX_DBL( +, 1, 
a801c0ea8ac4d, +, 132 ), HEX_DBL( +, 1, 20247cc4c46c1, +, 134 ), - HEX_DBL( +, 1, 87a0553328015, +, 135 ), HEX_DBL( +, 1, 0a233dee4f9bb, +, 137 ), - HEX_DBL( +, 1, 69b7f55b808ba, +, 138 ), HEX_DBL( +, 1, eba064644060a, +, 139 ), - HEX_DBL( +, 1, 4e184933d9364, +, 141 ), HEX_DBL( +, 1, c614fe2531841, +, 142 ), - HEX_DBL( +, 1, 3494a9b171bf5, +, 144 ), HEX_DBL( +, 1, a36798b9d969b, +, 145 ), - HEX_DBL( +, 1, 1d03d8c0c04af, +, 147 ), HEX_DBL( +, 1, 836026385c974, +, 148 ), - HEX_DBL( +, 1, 073fbe9ac901d, +, 150 ), HEX_DBL( +, 1, 65cae0969f286, +, 151 ), - HEX_DBL( +, 1, e64a58639cae8, +, 152 ), HEX_DBL( +, 1, 4a77f5f9b50f9, +, 154 ), - HEX_DBL( +, 1, c12744a3a28e3, +, 155 ), HEX_DBL( +, 1, 313b3b6978e85, +, 157 ), - HEX_DBL( +, 1, 9eda3a31e587e, +, 158 ), HEX_DBL( +, 1, 19ebe56b56453, +, 160 ), - HEX_DBL( +, 1, 7f2bc6e599b7e, +, 161 ), HEX_DBL( +, 1, 04644610df2ff, +, 163 ), - HEX_DBL( +, 1, 61e8b490ac4e6, +, 164 ), HEX_DBL( +, 1, e103201f299b3, +, 165 ), - HEX_DBL( +, 1, 46e1b637beaf5, +, 167 ), HEX_DBL( +, 1, bc473cfede104, +, 168 ), - HEX_DBL( +, 1, 2deb1b9c85e2d, +, 170 ), HEX_DBL( +, 1, 9a5981ca67d1, +, 171 ), - HEX_DBL( +, 1, 16dc8a9ef670b, +, 173 ), HEX_DBL( +, 1, 7b03166942309, +, 174 ), - HEX_DBL( +, 1, 0190be03150a7, +, 176 ), HEX_DBL( +, 1, 5e1152f9a8119, +, 177 ), - HEX_DBL( +, 1, dbca9263f8487, +, 178 ), HEX_DBL( +, 1, 43556dee93bee, +, 180 ), - HEX_DBL( +, 1, b774c12967dfa, +, 181 ), HEX_DBL( +, 1, 2aa4306e922c2, +, 183 ), - HEX_DBL( +, 1, 95e54c5dd4217, +, 184 ) }; + static const double exp_table[128 + 150 + 1] = { + HEX_DBL(+, 1, 82e16284f5ec5, -, 217), + HEX_DBL(+, 1, 06e9996332ba1, -, 215), + HEX_DBL(+, 1, 6555cb289e44b, -, 214), + HEX_DBL(+, 1, e5ab364643354, -, 213), + HEX_DBL(+, 1, 4a0bd18e64df7, -, 211), + HEX_DBL(+, 1, c094499cc578e, -, 210), + HEX_DBL(+, 1, 30d759323998c, -, 208), + HEX_DBL(+, 1, 9e5278ab1d4cf, -, 207), + HEX_DBL(+, 1, 198fa3f30be25, -, 205), + HEX_DBL(+, 1, 7eae636d6144e, -, 204), + HEX_DBL(+, 1, 040f1036f4863, -, 202), + 
HEX_DBL(+, 1, 6174e477a895f, -, 201), + HEX_DBL(+, 1, e065b82dd95a, -, 200), + HEX_DBL(+, 1, 4676be491d129, -, 198), + HEX_DBL(+, 1, bbb5da5f7c823, -, 197), + HEX_DBL(+, 1, 2d884eef5fdcb, -, 195), + HEX_DBL(+, 1, 99d3397ab8371, -, 194), + HEX_DBL(+, 1, 1681497ed15b3, -, 192), + HEX_DBL(+, 1, 7a870f597fdbd, -, 191), + HEX_DBL(+, 1, 013c74edba307, -, 189), + HEX_DBL(+, 1, 5d9ec4ada7938, -, 188), + HEX_DBL(+, 1, db2edfd20fa7c, -, 187), + HEX_DBL(+, 1, 42eb9f39afb0b, -, 185), + HEX_DBL(+, 1, b6e4f282b43f4, -, 184), + HEX_DBL(+, 1, 2a42764857b19, -, 182), + HEX_DBL(+, 1, 9560792d19314, -, 181), + HEX_DBL(+, 1, 137b6ce8e052c, -, 179), + HEX_DBL(+, 1, 766b45dd84f18, -, 178), + HEX_DBL(+, 1, fce362fe6e7d, -, 177), + HEX_DBL(+, 1, 59d34dd8a5473, -, 175), + HEX_DBL(+, 1, d606847fc727a, -, 174), + HEX_DBL(+, 1, 3f6a58b795de3, -, 172), + HEX_DBL(+, 1, b2216c6efdac1, -, 171), + HEX_DBL(+, 1, 2705b5b153fb8, -, 169), + HEX_DBL(+, 1, 90fa1509bd50d, -, 168), + HEX_DBL(+, 1, 107df698da211, -, 166), + HEX_DBL(+, 1, 725ae6e7b9d35, -, 165), + HEX_DBL(+, 1, f75d6040aeff6, -, 164), + HEX_DBL(+, 1, 56126259e093c, -, 162), + HEX_DBL(+, 1, d0ec7df4f7bd4, -, 161), + HEX_DBL(+, 1, 3bf2cf6722e46, -, 159), + HEX_DBL(+, 1, ad6b22f55db42, -, 158), + HEX_DBL(+, 1, 23d1f3e5834a, -, 156), + HEX_DBL(+, 1, 8c9feab89b876, -, 155), + HEX_DBL(+, 1, 0d88cf37f00dd, -, 153), + HEX_DBL(+, 1, 6e55d2bf838a7, -, 152), + HEX_DBL(+, 1, f1e6b68529e33, -, 151), + HEX_DBL(+, 1, 525be4e4e601d, -, 149), + HEX_DBL(+, 1, cbe0a45f75eb1, -, 148), + HEX_DBL(+, 1, 3884e838aea68, -, 146), + HEX_DBL(+, 1, a8c1f14e2af5d, -, 145), + HEX_DBL(+, 1, 20a717e64a9bd, -, 143), + HEX_DBL(+, 1, 8851d84118908, -, 142), + HEX_DBL(+, 1, 0a9bdfb02d24, -, 140), + HEX_DBL(+, 1, 6a5bea046b42e, -, 139), + HEX_DBL(+, 1, ec7f3b269efa8, -, 138), + HEX_DBL(+, 1, 4eafb87eab0f2, -, 136), + HEX_DBL(+, 1, c6e2d05bbc, -, 135), + HEX_DBL(+, 1, 35208867c2683, -, 133), + HEX_DBL(+, 1, a425b317eeacd, -, 132), + HEX_DBL(+, 1, 1d8508fa8246a, -, 130), + 
HEX_DBL(+, 1, 840fbc08fdc8a, -, 129), + HEX_DBL(+, 1, 07b7112bc1ffe, -, 127), + HEX_DBL(+, 1, 666d0dad2961d, -, 126), + HEX_DBL(+, 1, e726c3f64d0fe, -, 125), + HEX_DBL(+, 1, 4b0dc07cabf98, -, 123), + HEX_DBL(+, 1, c1f2daf3b6a46, -, 122), + HEX_DBL(+, 1, 31c5957a47de2, -, 120), + HEX_DBL(+, 1, 9f96445648b9f, -, 119), + HEX_DBL(+, 1, 1a6baeadb4fd1, -, 117), + HEX_DBL(+, 1, 7fd974d372e45, -, 116), + HEX_DBL(+, 1, 04da4d1452919, -, 114), + HEX_DBL(+, 1, 62891f06b345, -, 113), + HEX_DBL(+, 1, e1dd273aa8a4a, -, 112), + HEX_DBL(+, 1, 4775e0840bfdd, -, 110), + HEX_DBL(+, 1, bd109d9d94bda, -, 109), + HEX_DBL(+, 1, 2e73f53fba844, -, 107), + HEX_DBL(+, 1, 9b138170d6bfe, -, 106), + HEX_DBL(+, 1, 175af0cf60ec5, -, 104), + HEX_DBL(+, 1, 7baee1bffa80b, -, 103), + HEX_DBL(+, 1, 02057d1245ceb, -, 101), + HEX_DBL(+, 1, 5eafffb34ba31, -, 100), + HEX_DBL(+, 1, dca23bae16424, -, 99), + HEX_DBL(+, 1, 43e7fc88b8056, -, 97), + HEX_DBL(+, 1, b83bf23a9a9eb, -, 96), + HEX_DBL(+, 1, 2b2b8dd05b318, -, 94), + HEX_DBL(+, 1, 969d47321e4cc, -, 93), + HEX_DBL(+, 1, 1452b7723aed2, -, 91), + HEX_DBL(+, 1, 778fe2497184c, -, 90), + HEX_DBL(+, 1, fe7116182e9cc, -, 89), + HEX_DBL(+, 1, 5ae191a99585a, -, 87), + HEX_DBL(+, 1, d775d87da854d, -, 86), + HEX_DBL(+, 1, 4063f8cc8bb98, -, 84), + HEX_DBL(+, 1, b374b315f87c1, -, 83), + HEX_DBL(+, 1, 27ec458c65e3c, -, 81), + HEX_DBL(+, 1, 923372c67a074, -, 80), + HEX_DBL(+, 1, 1152eaeb73c08, -, 78), + HEX_DBL(+, 1, 737c5645114b5, -, 77), + HEX_DBL(+, 1, f8e6c24b5592e, -, 76), + HEX_DBL(+, 1, 571db733a9d61, -, 74), + HEX_DBL(+, 1, d257d547e083f, -, 73), + HEX_DBL(+, 1, 3ce9b9de78f85, -, 71), + HEX_DBL(+, 1, aebabae3a41b5, -, 70), + HEX_DBL(+, 1, 24b6031b49bda, -, 68), + HEX_DBL(+, 1, 8dd5e1bb09d7e, -, 67), + HEX_DBL(+, 1, 0e5b73d1ff53d, -, 65), + HEX_DBL(+, 1, 6f741de1748ec, -, 64), + HEX_DBL(+, 1, f36bd37f42f3e, -, 63), + HEX_DBL(+, 1, 536452ee2f75c, -, 61), + HEX_DBL(+, 1, cd480a1b7482, -, 60), + HEX_DBL(+, 1, 39792499b1a24, -, 58), + HEX_DBL(+, 1, aa0de4bf35b38, 
-, 57), + HEX_DBL(+, 1, 2188ad6ae3303, -, 55), + HEX_DBL(+, 1, 898471fca6055, -, 54), + HEX_DBL(+, 1, 0b6c3afdde064, -, 52), + HEX_DBL(+, 1, 6b7719a59f0e, -, 51), + HEX_DBL(+, 1, ee001eed62aa, -, 50), + HEX_DBL(+, 1, 4fb547c775da8, -, 48), + HEX_DBL(+, 1, c8464f7616468, -, 47), + HEX_DBL(+, 1, 36121e24d3bba, -, 45), + HEX_DBL(+, 1, a56e0c2ac7f75, -, 44), + HEX_DBL(+, 1, 1e642baeb84a, -, 42), + HEX_DBL(+, 1, 853f01d6d53ba, -, 41), + HEX_DBL(+, 1, 0885298767e9a, -, 39), + HEX_DBL(+, 1, 67852a7007e42, -, 38), + HEX_DBL(+, 1, e8a37a45fc32e, -, 37), + HEX_DBL(+, 1, 4c1078fe9228a, -, 35), + HEX_DBL(+, 1, c3527e433fab1, -, 34), + HEX_DBL(+, 1, 32b48bf117da2, -, 32), + HEX_DBL(+, 1, a0db0d0ddb3ec, -, 31), + HEX_DBL(+, 1, 1b48655f37267, -, 29), + HEX_DBL(+, 1, 81056ff2c5772, -, 28), + HEX_DBL(+, 1, 05a628c699fa1, -, 26), + HEX_DBL(+, 1, 639e3175a689d, -, 25), + HEX_DBL(+, 1, e355bbaee85cb, -, 24), + HEX_DBL(+, 1, 4875ca227ec38, -, 22), + HEX_DBL(+, 1, be6c6fdb01612, -, 21), + HEX_DBL(+, 1, 2f6053b981d98, -, 19), + HEX_DBL(+, 1, 9c54c3b43bc8b, -, 18), + HEX_DBL(+, 1, 18354238f6764, -, 16), + HEX_DBL(+, 1, 7cd79b5647c9b, -, 15), + HEX_DBL(+, 1, 02cf22526545a, -, 13), + HEX_DBL(+, 1, 5fc21041027ad, -, 12), + HEX_DBL(+, 1, de16b9c24a98f, -, 11), + HEX_DBL(+, 1, 44e51f113d4d6, -, 9), + HEX_DBL(+, 1, b993fe00d5376, -, 8), + HEX_DBL(+, 1, 2c155b8213cf4, -, 6), + HEX_DBL(+, 1, 97db0ccceb0af, -, 5), + HEX_DBL(+, 1, 152aaa3bf81cc, -, 3), + HEX_DBL(+, 1, 78b56362cef38, -, 2), + HEX_DBL(+, 1, 0, +, 0), + HEX_DBL(+, 1, 5bf0a8b145769, +, 1), + HEX_DBL(+, 1, d8e64b8d4ddae, +, 2), + HEX_DBL(+, 1, 415e5bf6fb106, +, 4), + HEX_DBL(+, 1, b4c902e273a58, +, 5), + HEX_DBL(+, 1, 28d389970338f, +, 7), + HEX_DBL(+, 1, 936dc5690c08f, +, 8), + HEX_DBL(+, 1, 122885aaeddaa, +, 10), + HEX_DBL(+, 1, 749ea7d470c6e, +, 11), + HEX_DBL(+, 1, fa7157c470f82, +, 12), + HEX_DBL(+, 1, 5829dcf95056, +, 14), + HEX_DBL(+, 1, d3c4488ee4f7f, +, 15), + HEX_DBL(+, 1, 3de1654d37c9a, +, 17), + HEX_DBL(+, 1, b00b5916ac955, 
+, 18), + HEX_DBL(+, 1, 259ac48bf05d7, +, 20), + HEX_DBL(+, 1, 8f0ccafad2a87, +, 21), + HEX_DBL(+, 1, 0f2ebd0a8002, +, 23), + HEX_DBL(+, 1, 709348c0ea4f9, +, 24), + HEX_DBL(+, 1, f4f22091940bd, +, 25), + HEX_DBL(+, 1, 546d8f9ed26e1, +, 27), + HEX_DBL(+, 1, ceb088b68e804, +, 28), + HEX_DBL(+, 1, 3a6e1fd9eecfd, +, 30), + HEX_DBL(+, 1, ab5adb9c436, +, 31), + HEX_DBL(+, 1, 226af33b1fdc1, +, 33), + HEX_DBL(+, 1, 8ab7fb5475fb7, +, 34), + HEX_DBL(+, 1, 0c3d3920962c9, +, 36), + HEX_DBL(+, 1, 6c932696a6b5d, +, 37), + HEX_DBL(+, 1, ef822f7f6731d, +, 38), + HEX_DBL(+, 1, 50bba3796379a, +, 40), + HEX_DBL(+, 1, c9aae4631c056, +, 41), + HEX_DBL(+, 1, 370470aec28ed, +, 43), + HEX_DBL(+, 1, a6b765d8cdf6d, +, 44), + HEX_DBL(+, 1, 1f43fcc4b662c, +, 46), + HEX_DBL(+, 1, 866f34a725782, +, 47), + HEX_DBL(+, 1, 0953e2f3a1ef7, +, 49), + HEX_DBL(+, 1, 689e221bc8d5b, +, 50), + HEX_DBL(+, 1, ea215a1d20d76, +, 51), + HEX_DBL(+, 1, 4d13fbb1a001a, +, 53), + HEX_DBL(+, 1, c4b334617cc67, +, 54), + HEX_DBL(+, 1, 33a43d282a519, +, 56), + HEX_DBL(+, 1, a220d397972eb, +, 57), + HEX_DBL(+, 1, 1c25c88df6862, +, 59), + HEX_DBL(+, 1, 8232558201159, +, 60), + HEX_DBL(+, 1, 0672a3c9eb871, +, 62), + HEX_DBL(+, 1, 64b41c6d37832, +, 63), + HEX_DBL(+, 1, e4cf766fe49be, +, 64), + HEX_DBL(+, 1, 49767bc0483e3, +, 66), + HEX_DBL(+, 1, bfc951eb8bb76, +, 67), + HEX_DBL(+, 1, 304d6aeca254b, +, 69), + HEX_DBL(+, 1, 9d97010884251, +, 70), + HEX_DBL(+, 1, 19103e4080b45, +, 72), + HEX_DBL(+, 1, 7e013cd114461, +, 73), + HEX_DBL(+, 1, 03996528e074c, +, 75), + HEX_DBL(+, 1, 60d4f6fdac731, +, 76), + HEX_DBL(+, 1, df8c5af17ba3b, +, 77), + HEX_DBL(+, 1, 45e3076d61699, +, 79), + HEX_DBL(+, 1, baed16a6e0da7, +, 80), + HEX_DBL(+, 1, 2cffdfebde1a1, +, 82), + HEX_DBL(+, 1, 9919cabefcb69, +, 83), + HEX_DBL(+, 1, 160345c9953e3, +, 85), + HEX_DBL(+, 1, 79dbc9dc53c66, +, 86), + HEX_DBL(+, 1, 00c810d464097, +, 88), + HEX_DBL(+, 1, 5d009394c5c27, +, 89), + HEX_DBL(+, 1, da57de8f107a8, +, 90), + HEX_DBL(+, 1, 425982cf597cd, +, 92), + 
HEX_DBL(+, 1, b61e5ca3a5e31, +, 93), + HEX_DBL(+, 1, 29bb825dfcf87, +, 95), + HEX_DBL(+, 1, 94a90db0d6fe2, +, 96), + HEX_DBL(+, 1, 12fec759586fd, +, 98), + HEX_DBL(+, 1, 75c1dc469e3af, +, 99), + HEX_DBL(+, 1, fbfd219c43b04, +, 100), + HEX_DBL(+, 1, 5936d44e1a146, +, 102), + HEX_DBL(+, 1, d531d8a7ee79c, +, 103), + HEX_DBL(+, 1, 3ed9d24a2d51b, +, 105), + HEX_DBL(+, 1, b15cfe5b6e17b, +, 106), + HEX_DBL(+, 1, 268038c2c0e, +, 108), + HEX_DBL(+, 1, 9044a73545d48, +, 109), + HEX_DBL(+, 1, 1002ab6218b38, +, 111), + HEX_DBL(+, 1, 71b3540cbf921, +, 112), + HEX_DBL(+, 1, f6799ea9c414a, +, 113), + HEX_DBL(+, 1, 55779b984f3eb, +, 115), + HEX_DBL(+, 1, d01a210c44aa4, +, 116), + HEX_DBL(+, 1, 3b63da8e9121, +, 118), + HEX_DBL(+, 1, aca8d6b0116b8, +, 119), + HEX_DBL(+, 1, 234de9e0c74e9, +, 121), + HEX_DBL(+, 1, 8bec7503ca477, +, 122), + HEX_DBL(+, 1, 0d0eda9796b9, +, 124), + HEX_DBL(+, 1, 6db0118477245, +, 125), + HEX_DBL(+, 1, f1056dc7bf22d, +, 126), + HEX_DBL(+, 1, 51c2cc3433801, +, 128), + HEX_DBL(+, 1, cb108ffbec164, +, 129), + HEX_DBL(+, 1, 37f780991b584, +, 131), + HEX_DBL(+, 1, a801c0ea8ac4d, +, 132), + HEX_DBL(+, 1, 20247cc4c46c1, +, 134), + HEX_DBL(+, 1, 87a0553328015, +, 135), + HEX_DBL(+, 1, 0a233dee4f9bb, +, 137), + HEX_DBL(+, 1, 69b7f55b808ba, +, 138), + HEX_DBL(+, 1, eba064644060a, +, 139), + HEX_DBL(+, 1, 4e184933d9364, +, 141), + HEX_DBL(+, 1, c614fe2531841, +, 142), + HEX_DBL(+, 1, 3494a9b171bf5, +, 144), + HEX_DBL(+, 1, a36798b9d969b, +, 145), + HEX_DBL(+, 1, 1d03d8c0c04af, +, 147), + HEX_DBL(+, 1, 836026385c974, +, 148), + HEX_DBL(+, 1, 073fbe9ac901d, +, 150), + HEX_DBL(+, 1, 65cae0969f286, +, 151), + HEX_DBL(+, 1, e64a58639cae8, +, 152), + HEX_DBL(+, 1, 4a77f5f9b50f9, +, 154), + HEX_DBL(+, 1, c12744a3a28e3, +, 155), + HEX_DBL(+, 1, 313b3b6978e85, +, 157), + HEX_DBL(+, 1, 9eda3a31e587e, +, 158), + HEX_DBL(+, 1, 19ebe56b56453, +, 160), + HEX_DBL(+, 1, 7f2bc6e599b7e, +, 161), + HEX_DBL(+, 1, 04644610df2ff, +, 163), + HEX_DBL(+, 1, 61e8b490ac4e6, +, 164), + 
HEX_DBL(+, 1, e103201f299b3, +, 165), + HEX_DBL(+, 1, 46e1b637beaf5, +, 167), + HEX_DBL(+, 1, bc473cfede104, +, 168), + HEX_DBL(+, 1, 2deb1b9c85e2d, +, 170), + HEX_DBL(+, 1, 9a5981ca67d1, +, 171), + HEX_DBL(+, 1, 16dc8a9ef670b, +, 173), + HEX_DBL(+, 1, 7b03166942309, +, 174), + HEX_DBL(+, 1, 0190be03150a7, +, 176), + HEX_DBL(+, 1, 5e1152f9a8119, +, 177), + HEX_DBL(+, 1, dbca9263f8487, +, 178), + HEX_DBL(+, 1, 43556dee93bee, +, 180), + HEX_DBL(+, 1, b774c12967dfa, +, 181), + HEX_DBL(+, 1, 2aa4306e922c2, +, 183), + HEX_DBL(+, 1, 95e54c5dd4217, +, 184) + }; - // scale by e**i -- (expm1(f) + 1)*e**i - 1 = expm1(f) * e**i + e**i - 1 = e**i - return exp_table[exponent+150] + (f * exp_table[exponent+150] - 1.0); + // scale by e**i -- (expm1(f) + 1)*e**i - 1 = expm1(f) * e**i + e**i - 1 = + // e**i + return exp_table[exponent + 150] + (f * exp_table[exponent + 150] - 1.0); } -double reference_fmax( double x, double y ) +double reference_fmax(double x, double y) { - if( isnan(y) ) - return x; + if (isnan(y)) return x; return x >= y ? x : y; } -double reference_fmin( double x, double y ) +double reference_fmin(double x, double y) { - if( isnan(y) ) - return x; + if (isnan(y)) return x; return x <= y ? 
x : y; } -double reference_hypot( double x, double y ) +double reference_hypot(double x, double y) { - // Since the inputs are actually floats, we don't have to worry about range here - if( isinf(x) || isinf(y) ) - return INFINITY; + // Since the inputs are actually floats, we don't have to worry about range + // here + if (isinf(x) || isinf(y)) return INFINITY; - return sqrt( x * x + y * y ); + return sqrt(x * x + y * y); } -int reference_ilogbl( long double x) +int reference_ilogbl(long double x) { extern int gDeviceILogb0, gDeviceILogbNaN; // Since we are just using this to verify double precision, we can // use the double precision ilogb here - union { double f; cl_ulong u;} u; - u.f = (double) x; + union { + double f; + cl_ulong u; + } u; + u.f = (double)x; int exponent = (int)(u.u >> 52) & 0x7ff; - if( exponent == 0x7ff ) + if (exponent == 0x7ff) { - if( u.u & 0x000fffffffffffffULL ) - return gDeviceILogbNaN; + if (u.u & 0x000fffffffffffffULL) return gDeviceILogbNaN; return CL_INT_MAX; } - if( exponent == 0 ) - { // deal with denormals - u.f = x * HEX_DBL( +, 1, 0, +, 64 ); + if (exponent == 0) + { // deal with denormals + u.f = x * HEX_DBL(+, 1, 0, +, 64); exponent = (cl_uint)(u.u >> 52) & 0x7ff; - if( exponent == 0 ) - return gDeviceILogb0; + if (exponent == 0) return gDeviceILogb0; exponent -= 1023 + 64; return exponent; @@ -1522,84 +1763,111 @@ int reference_ilogbl( long double x) return exponent - 1023; } -//double reference_log2( double x ) +// double reference_log2( double x ) //{ // return log( x ) * 1.44269504088896340735992468100189214; //} -double reference_relaxed_log2( double x ) +double reference_relaxed_log2(double x) { return reference_log2(x); } + +double reference_log2(double x) { - return reference_log2(x); -} + if (isnan(x) || x < 0.0 || x == -INFINITY) return cl_make_nan(); -double reference_log2( double x ) -{ - if( isnan(x) || x < 0.0 || x == -INFINITY) - return cl_make_nan(); + if (x == 0.0f) return -INFINITY; - if( x == 0.0f) - return 
-INFINITY; - - if( x == INFINITY ) - return INFINITY; + if (x == INFINITY) return INFINITY; double hi, lo; - __log2_ep( &hi, &lo, x ); + __log2_ep(&hi, &lo, x); return hi; } -double reference_log1p( double x ) -{ // This function is suitable only for verifying log1pf(). It produces several double precision ulps of error. +double reference_log1p(double x) +{ // This function is suitable only for verifying log1pf(). It produces several + // double precision ulps of error. // Handle small and NaN - if( ! ( reference_fabs(x) > HEX_DBL( +, 1, 0, -, 53 ) ) ) - return x; + if (!(reference_fabs(x) > HEX_DBL(+, 1, 0, -, 53))) return x; // deal with special values - if( x <= -1.0 ) + if (x <= -1.0) { - if( x < -1.0 ) - return cl_make_nan(); + if (x < -1.0) return cl_make_nan(); return -INFINITY; } // infinity - if( x == INFINITY ) - return INFINITY; + if (x == INFINITY) return INFINITY; - // High precision result for when near 0, to avoid problems with the reference result falling in the wrong binade. - if( reference_fabs(x) < HEX_DBL( +, 1, 0, -, 28 ) ) - return (1.0 - 0.5 * x) * x; + // High precision result for when near 0, to avoid problems with the + // reference result falling in the wrong binade. + if (reference_fabs(x) < HEX_DBL(+, 1, 0, -, 28)) return (1.0 - 0.5 * x) * x; // Our polynomial is only good in the region +-2**-4. 
// If we aren't in that range then we need to reduce to be in that range - double correctionLo = -0.0; // correction down stream to compensate for the reduction, if any - double correctionHi = -0.0; // correction down stream to compensate for the exponent, if any - if( reference_fabs(x) > HEX_DBL( +, 1, 0, -, 4 ) ) + double correctionLo = + -0.0; // correction down stream to compensate for the reduction, if any + double correctionHi = + -0.0; // correction down stream to compensate for the exponent, if any + if (reference_fabs(x) > HEX_DBL(+, 1, 0, -, 4)) { - x += 1.0; // double should cover any loss of precision here + x += 1.0; // double should cover any loss of precision here // separate x into (1+f) * 2**i - union{ double d; cl_ulong u;} u; u.d = x; - int i = (int) ((u.u >> 52) & 0x7ff) - 1023; + union { + double d; + cl_ulong u; + } u; + u.d = x; + int i = (int)((u.u >> 52) & 0x7ff) - 1023; u.u &= 0x000fffffffffffffULL; - int index = (int) (u.u >> 48 ); + int index = (int)(u.u >> 48); u.u |= 0x3ff0000000000000ULL; double f = u.d; // further reduce f to be within 1/16 of 1.0 - static const double scale_table[16] = { 1.0, HEX_DBL( +, 1, d2d2d2d6e3f79, -, 1 ), HEX_DBL( +, 1, b8e38e42737a1, -, 1 ), HEX_DBL( +, 1, a1af28711adf3, -, 1 ), - HEX_DBL( +, 1, 8cccccd88dd65, -, 1 ), HEX_DBL( +, 1, 79e79e810ec8f, -, 1 ), HEX_DBL( +, 1, 68ba2e94df404, -, 1 ), HEX_DBL( +, 1, 590b216defb29, -, 1 ), - HEX_DBL( +, 1, 4aaaaab1500ed, -, 1 ), HEX_DBL( +, 1, 3d70a3e0d6f73, -, 1 ), HEX_DBL( +, 1, 313b13bb39f4f, -, 1 ), HEX_DBL( +, 1, 25ed09823f1cc, -, 1 ), - HEX_DBL( +, 1, 1b6db6e77457b, -, 1 ), HEX_DBL( +, 1, 11a7b96a3a34f, -, 1 ), HEX_DBL( +, 1, 0888888e46fea, -, 1 ), HEX_DBL( +, 1, 00000038e9862, -, 1 ) }; + static const double scale_table[16] = { + 1.0, + HEX_DBL(+, 1, d2d2d2d6e3f79, -, 1), + HEX_DBL(+, 1, b8e38e42737a1, -, 1), + HEX_DBL(+, 1, a1af28711adf3, -, 1), + HEX_DBL(+, 1, 8cccccd88dd65, -, 1), + HEX_DBL(+, 1, 79e79e810ec8f, -, 1), + HEX_DBL(+, 1, 68ba2e94df404, -, 1), + 
HEX_DBL(+, 1, 590b216defb29, -, 1), + HEX_DBL(+, 1, 4aaaaab1500ed, -, 1), + HEX_DBL(+, 1, 3d70a3e0d6f73, -, 1), + HEX_DBL(+, 1, 313b13bb39f4f, -, 1), + HEX_DBL(+, 1, 25ed09823f1cc, -, 1), + HEX_DBL(+, 1, 1b6db6e77457b, -, 1), + HEX_DBL(+, 1, 11a7b96a3a34f, -, 1), + HEX_DBL(+, 1, 0888888e46fea, -, 1), + HEX_DBL(+, 1, 00000038e9862, -, 1) + }; // correction_table[i] = -log( scale_table[i] ) - // All entries have >= 64 bits of precision (rather than the expected 53) - static const double correction_table[16] = { -0.0, HEX_DBL( +, 1, 7a5c722c16058, -, 4 ), HEX_DBL( +, 1, 323db16c89ab1, -, 3 ), HEX_DBL( +, 1, a0f87d180629, -, 3 ), - HEX_DBL( +, 1, 050279324e17c, -, 2 ), HEX_DBL( +, 1, 36f885bb270b0, -, 2 ), HEX_DBL( +, 1, 669b771b5cc69, -, 2 ), HEX_DBL( +, 1, 94203a6292a05, -, 2 ), - HEX_DBL( +, 1, bfb4f9cb333a4, -, 2 ), HEX_DBL( +, 1, e982376ddb80e, -, 2 ), HEX_DBL( +, 1, 08d5d8769b2b2, -, 1 ), HEX_DBL( +, 1, 1c288bc00e0cf, -, 1 ), - HEX_DBL( +, 1, 2ec7535b31ecb, -, 1 ), HEX_DBL( +, 1, 40bed0adc63fb, -, 1 ), HEX_DBL( +, 1, 521a5c0330615, -, 1 ), HEX_DBL( +, 1, 62e42f7dd092c, -, 1 ) }; + // All entries have >= 64 bits of precision (rather than the expected + // 53) + static const double correction_table[16] = { + -0.0, + HEX_DBL(+, 1, 7a5c722c16058, -, 4), + HEX_DBL(+, 1, 323db16c89ab1, -, 3), + HEX_DBL(+, 1, a0f87d180629, -, 3), + HEX_DBL(+, 1, 050279324e17c, -, 2), + HEX_DBL(+, 1, 36f885bb270b0, -, 2), + HEX_DBL(+, 1, 669b771b5cc69, -, 2), + HEX_DBL(+, 1, 94203a6292a05, -, 2), + HEX_DBL(+, 1, bfb4f9cb333a4, -, 2), + HEX_DBL(+, 1, e982376ddb80e, -, 2), + HEX_DBL(+, 1, 08d5d8769b2b2, -, 1), + HEX_DBL(+, 1, 1c288bc00e0cf, -, 1), + HEX_DBL(+, 1, 2ec7535b31ecb, -, 1), + HEX_DBL(+, 1, 40bed0adc63fb, -, 1), + HEX_DBL(+, 1, 521a5c0330615, -, 1), + HEX_DBL(+, 1, 62e42f7dd092c, -, 1) + }; f *= scale_table[index]; correctionLo = correction_table[index]; @@ -1611,17 +1879,25 @@ double reference_log1p( double x ) } - // minmax polynomial for p(x) = (log(x+1) - x)/x valid over the 
range x = [-1/16, 1/16] + // minmax polynomial for p(x) = (log(x+1) - x)/x valid over the range x = + // [-1/16, 1/16] // max error HEX_DBL( +, 1, 048f61f9a5eca, -, 52 ) - double p = HEX_DBL( -, 1, cc33de97a9d7b, -, 46 ) + - (HEX_DBL( -, 1, fffffffff3eb7, -, 2 ) + - (HEX_DBL( +, 1, 5555555633ef7, -, 2 ) + - (HEX_DBL( -, 1, 00000062c78, -, 2 ) + - (HEX_DBL( +, 1, 9999958a3321, -, 3 ) + - (HEX_DBL( -, 1, 55534ce65c347, -, 3 ) + - (HEX_DBL( +, 1, 24957208391a5, -, 3 ) + - (HEX_DBL( -, 1, 02287b9a5b4a1, -, 3 ) + - HEX_DBL( +, 1, c757d922180ed, -, 4 ) * x)*x)*x)*x)*x)*x)*x)*x; + double p = HEX_DBL(-, 1, cc33de97a9d7b, -, 46) + + (HEX_DBL(-, 1, fffffffff3eb7, -, 2) + + (HEX_DBL(+, 1, 5555555633ef7, -, 2) + + (HEX_DBL(-, 1, 00000062c78, -, 2) + + (HEX_DBL(+, 1, 9999958a3321, -, 3) + + (HEX_DBL(-, 1, 55534ce65c347, -, 3) + + (HEX_DBL(+, 1, 24957208391a5, -, 3) + + (HEX_DBL(-, 1, 02287b9a5b4a1, -, 3) + + HEX_DBL(+, 1, c757d922180ed, -, 4) * x) + * x) + * x) + * x) + * x) + * x) + * x) + * x; // log(x+1) = x * p(x) + x x += x * p; @@ -1629,22 +1905,23 @@ double reference_log1p( double x ) return correctionHi + (correctionLo + x); } -double reference_logb( double x ) +double reference_logb(double x) { - union { float f; cl_uint u;} u; - u.f = (float) x; + union { + float f; + cl_uint u; + } u; + u.f = (float)x; cl_int exponent = (u.u >> 23) & 0xff; - if( exponent == 0xff ) - return x * x; + if (exponent == 0xff) return x * x; - if( exponent == 0 ) - { // deal with denormals + if (exponent == 0) + { // deal with denormals u.u = (u.u & 0x007fffff) | 0x3f800000; u.f -= 1.0f; exponent = (u.u >> 23) & 0xff; - if( exponent == 0 ) - return -INFINITY; + if (exponent == 0) return -INFINITY; return exponent - (127 + 126); } @@ -1652,219 +1929,271 @@ double reference_logb( double x ) return exponent - 127; } -double reference_relaxed_reciprocal(double x) -{ - return 1.0f / ((float) x); -} +double reference_relaxed_reciprocal(double x) { return 1.0f / ((float)x); } -double 
reference_reciprocal( double x ) -{ - return 1.0 / x; -} +double reference_reciprocal(double x) { return 1.0 / x; } -double reference_remainder( double x, double y ) +double reference_remainder(double x, double y) { int i; - return reference_remquo( x, y, &i ); + return reference_remquo(x, y, &i); } -double reference_lgamma( double x) +double reference_lgamma(double x) { -/* - * ==================================================== - * This function is from fdlibm. http://www.netlib.org - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. - * - * Developed at SunSoft, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. - * ==================================================== - * - */ + /* + * ==================================================== + * This function is from fdlibm. http://www.netlib.org + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunSoft, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. 
+ * ==================================================== + * + */ -static const double //two52 = 4.50359962737049600000e+15, /* 0x43300000, 0x00000000 */ - half= 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */ - one = 1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */ - pi = 3.14159265358979311600e+00, /* 0x400921FB, 0x54442D18 */ - a0 = 7.72156649015328655494e-02, /* 0x3FB3C467, 0xE37DB0C8 */ - a1 = 3.22467033424113591611e-01, /* 0x3FD4A34C, 0xC4A60FAD */ - a2 = 6.73523010531292681824e-02, /* 0x3FB13E00, 0x1A5562A7 */ - a3 = 2.05808084325167332806e-02, /* 0x3F951322, 0xAC92547B */ - a4 = 7.38555086081402883957e-03, /* 0x3F7E404F, 0xB68FEFE8 */ - a5 = 2.89051383673415629091e-03, /* 0x3F67ADD8, 0xCCB7926B */ - a6 = 1.19270763183362067845e-03, /* 0x3F538A94, 0x116F3F5D */ - a7 = 5.10069792153511336608e-04, /* 0x3F40B6C6, 0x89B99C00 */ - a8 = 2.20862790713908385557e-04, /* 0x3F2CF2EC, 0xED10E54D */ - a9 = 1.08011567247583939954e-04, /* 0x3F1C5088, 0x987DFB07 */ - a10 = 2.52144565451257326939e-05, /* 0x3EFA7074, 0x428CFA52 */ - a11 = 4.48640949618915160150e-05, /* 0x3F07858E, 0x90A45837 */ - tc = 1.46163214496836224576e+00, /* 0x3FF762D8, 0x6356BE3F */ - tf = -1.21486290535849611461e-01, /* 0xBFBF19B9, 0xBCC38A42 */ - /* tt = -(tail of tf) */ - tt = -3.63867699703950536541e-18, /* 0xBC50C7CA, 0xA48A971F */ - t0 = 4.83836122723810047042e-01, /* 0x3FDEF72B, 0xC8EE38A2 */ - t1 = -1.47587722994593911752e-01, /* 0xBFC2E427, 0x8DC6C509 */ - t2 = 6.46249402391333854778e-02, /* 0x3FB08B42, 0x94D5419B */ - t3 = -3.27885410759859649565e-02, /* 0xBFA0C9A8, 0xDF35B713 */ - t4 = 1.79706750811820387126e-02, /* 0x3F9266E7, 0x970AF9EC */ - t5 = -1.03142241298341437450e-02, /* 0xBF851F9F, 0xBA91EC6A */ - t6 = 6.10053870246291332635e-03, /* 0x3F78FCE0, 0xE370E344 */ - t7 = -3.68452016781138256760e-03, /* 0xBF6E2EFF, 0xB3E914D7 */ - t8 = 2.25964780900612472250e-03, /* 0x3F6282D3, 0x2E15C915 */ - t9 = -1.40346469989232843813e-03, /* 0xBF56FE8E, 0xBF2D1AF1 */ - t10 = 
8.81081882437654011382e-04, /* 0x3F4CDF0C, 0xEF61A8E9 */ - t11 = -5.38595305356740546715e-04, /* 0xBF41A610, 0x9C73E0EC */ - t12 = 3.15632070903625950361e-04, /* 0x3F34AF6D, 0x6C0EBBF7 */ - t13 = -3.12754168375120860518e-04, /* 0xBF347F24, 0xECC38C38 */ - t14 = 3.35529192635519073543e-04, /* 0x3F35FD3E, 0xE8C2D3F4 */ - u0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */ - u1 = 6.32827064025093366517e-01, /* 0x3FE4401E, 0x8B005DFF */ - u2 = 1.45492250137234768737e+00, /* 0x3FF7475C, 0xD119BD6F */ - u3 = 9.77717527963372745603e-01, /* 0x3FEF4976, 0x44EA8450 */ - u4 = 2.28963728064692451092e-01, /* 0x3FCD4EAE, 0xF6010924 */ - u5 = 1.33810918536787660377e-02, /* 0x3F8B678B, 0xBF2BAB09 */ - v1 = 2.45597793713041134822e+00, /* 0x4003A5D7, 0xC2BD619C */ - v2 = 2.12848976379893395361e+00, /* 0x40010725, 0xA42B18F5 */ - v3 = 7.69285150456672783825e-01, /* 0x3FE89DFB, 0xE45050AF */ - v4 = 1.04222645593369134254e-01, /* 0x3FBAAE55, 0xD6537C88 */ - v5 = 3.21709242282423911810e-03, /* 0x3F6A5ABB, 0x57D0CF61 */ - s0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */ - s1 = 2.14982415960608852501e-01, /* 0x3FCB848B, 0x36E20878 */ - s2 = 3.25778796408930981787e-01, /* 0x3FD4D98F, 0x4F139F59 */ - s3 = 1.46350472652464452805e-01, /* 0x3FC2BB9C, 0xBEE5F2F7 */ - s4 = 2.66422703033638609560e-02, /* 0x3F9B481C, 0x7E939961 */ - s5 = 1.84028451407337715652e-03, /* 0x3F5E26B6, 0x7368F239 */ - s6 = 3.19475326584100867617e-05, /* 0x3F00BFEC, 0xDD17E945 */ - r1 = 1.39200533467621045958e+00, /* 0x3FF645A7, 0x62C4AB74 */ - r2 = 7.21935547567138069525e-01, /* 0x3FE71A18, 0x93D3DCDC */ - r3 = 1.71933865632803078993e-01, /* 0x3FC601ED, 0xCCFBDF27 */ - r4 = 1.86459191715652901344e-02, /* 0x3F9317EA, 0x742ED475 */ - r5 = 7.77942496381893596434e-04, /* 0x3F497DDA, 0xCA41A95B */ - r6 = 7.32668430744625636189e-06, /* 0x3EDEBAF7, 0xA5B38140 */ - w0 = 4.18938533204672725052e-01, /* 0x3FDACFE3, 0x90C97D69 */ - w1 = 8.33333333333329678849e-02, /* 0x3FB55555, 0x5555553B */ - w2 = 
-2.77777777728775536470e-03, /* 0xBF66C16C, 0x16B02E5C */ - w3 = 7.93650558643019558500e-04, /* 0x3F4A019F, 0x98CF38B6 */ - w4 = -5.95187557450339963135e-04, /* 0xBF4380CB, 0x8C0FE741 */ - w5 = 8.36339918996282139126e-04, /* 0x3F4B67BA, 0x4CDAD5D1 */ - w6 = -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */ + static const double // two52 = 4.50359962737049600000e+15, /* 0x43300000, + // 0x00000000 */ + half = 5.00000000000000000000e-01, /* 0x3FE00000, + 0x00000000 */ + one = 1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */ + pi = 3.14159265358979311600e+00, /* 0x400921FB, 0x54442D18 */ + a0 = 7.72156649015328655494e-02, /* 0x3FB3C467, 0xE37DB0C8 */ + a1 = 3.22467033424113591611e-01, /* 0x3FD4A34C, 0xC4A60FAD */ + a2 = 6.73523010531292681824e-02, /* 0x3FB13E00, 0x1A5562A7 */ + a3 = 2.05808084325167332806e-02, /* 0x3F951322, 0xAC92547B */ + a4 = 7.38555086081402883957e-03, /* 0x3F7E404F, 0xB68FEFE8 */ + a5 = 2.89051383673415629091e-03, /* 0x3F67ADD8, 0xCCB7926B */ + a6 = 1.19270763183362067845e-03, /* 0x3F538A94, 0x116F3F5D */ + a7 = 5.10069792153511336608e-04, /* 0x3F40B6C6, 0x89B99C00 */ + a8 = 2.20862790713908385557e-04, /* 0x3F2CF2EC, 0xED10E54D */ + a9 = 1.08011567247583939954e-04, /* 0x3F1C5088, 0x987DFB07 */ + a10 = 2.52144565451257326939e-05, /* 0x3EFA7074, 0x428CFA52 */ + a11 = 4.48640949618915160150e-05, /* 0x3F07858E, 0x90A45837 */ + tc = 1.46163214496836224576e+00, /* 0x3FF762D8, 0x6356BE3F */ + tf = -1.21486290535849611461e-01, /* 0xBFBF19B9, 0xBCC38A42 */ + /* tt = -(tail of tf) */ + tt = -3.63867699703950536541e-18, /* 0xBC50C7CA, 0xA48A971F */ + t0 = 4.83836122723810047042e-01, /* 0x3FDEF72B, 0xC8EE38A2 */ + t1 = -1.47587722994593911752e-01, /* 0xBFC2E427, 0x8DC6C509 */ + t2 = 6.46249402391333854778e-02, /* 0x3FB08B42, 0x94D5419B */ + t3 = -3.27885410759859649565e-02, /* 0xBFA0C9A8, 0xDF35B713 */ + t4 = 1.79706750811820387126e-02, /* 0x3F9266E7, 0x970AF9EC */ + t5 = -1.03142241298341437450e-02, /* 0xBF851F9F, 0xBA91EC6A */ + t6 = 
6.10053870246291332635e-03, /* 0x3F78FCE0, 0xE370E344 */ + t7 = -3.68452016781138256760e-03, /* 0xBF6E2EFF, 0xB3E914D7 */ + t8 = 2.25964780900612472250e-03, /* 0x3F6282D3, 0x2E15C915 */ + t9 = -1.40346469989232843813e-03, /* 0xBF56FE8E, 0xBF2D1AF1 */ + t10 = 8.81081882437654011382e-04, /* 0x3F4CDF0C, 0xEF61A8E9 */ + t11 = -5.38595305356740546715e-04, /* 0xBF41A610, 0x9C73E0EC */ + t12 = 3.15632070903625950361e-04, /* 0x3F34AF6D, 0x6C0EBBF7 */ + t13 = -3.12754168375120860518e-04, /* 0xBF347F24, 0xECC38C38 */ + t14 = 3.35529192635519073543e-04, /* 0x3F35FD3E, 0xE8C2D3F4 */ + u0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */ + u1 = 6.32827064025093366517e-01, /* 0x3FE4401E, 0x8B005DFF */ + u2 = 1.45492250137234768737e+00, /* 0x3FF7475C, 0xD119BD6F */ + u3 = 9.77717527963372745603e-01, /* 0x3FEF4976, 0x44EA8450 */ + u4 = 2.28963728064692451092e-01, /* 0x3FCD4EAE, 0xF6010924 */ + u5 = 1.33810918536787660377e-02, /* 0x3F8B678B, 0xBF2BAB09 */ + v1 = 2.45597793713041134822e+00, /* 0x4003A5D7, 0xC2BD619C */ + v2 = 2.12848976379893395361e+00, /* 0x40010725, 0xA42B18F5 */ + v3 = 7.69285150456672783825e-01, /* 0x3FE89DFB, 0xE45050AF */ + v4 = 1.04222645593369134254e-01, /* 0x3FBAAE55, 0xD6537C88 */ + v5 = 3.21709242282423911810e-03, /* 0x3F6A5ABB, 0x57D0CF61 */ + s0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */ + s1 = 2.14982415960608852501e-01, /* 0x3FCB848B, 0x36E20878 */ + s2 = 3.25778796408930981787e-01, /* 0x3FD4D98F, 0x4F139F59 */ + s3 = 1.46350472652464452805e-01, /* 0x3FC2BB9C, 0xBEE5F2F7 */ + s4 = 2.66422703033638609560e-02, /* 0x3F9B481C, 0x7E939961 */ + s5 = 1.84028451407337715652e-03, /* 0x3F5E26B6, 0x7368F239 */ + s6 = 3.19475326584100867617e-05, /* 0x3F00BFEC, 0xDD17E945 */ + r1 = 1.39200533467621045958e+00, /* 0x3FF645A7, 0x62C4AB74 */ + r2 = 7.21935547567138069525e-01, /* 0x3FE71A18, 0x93D3DCDC */ + r3 = 1.71933865632803078993e-01, /* 0x3FC601ED, 0xCCFBDF27 */ + r4 = 1.86459191715652901344e-02, /* 0x3F9317EA, 0x742ED475 */ + r5 = 
7.77942496381893596434e-04, /* 0x3F497DDA, 0xCA41A95B */ + r6 = 7.32668430744625636189e-06, /* 0x3EDEBAF7, 0xA5B38140 */ + w0 = 4.18938533204672725052e-01, /* 0x3FDACFE3, 0x90C97D69 */ + w1 = 8.33333333333329678849e-02, /* 0x3FB55555, 0x5555553B */ + w2 = -2.77777777728775536470e-03, /* 0xBF66C16C, 0x16B02E5C */ + w3 = 7.93650558643019558500e-04, /* 0x3F4A019F, 0x98CF38B6 */ + w4 = -5.95187557450339963135e-04, /* 0xBF4380CB, 0x8C0FE741 */ + w5 = 8.36339918996282139126e-04, /* 0x3F4B67BA, 0x4CDAD5D1 */ + w6 = -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */ - static const double zero= 0.00000000000000000000e+00; - double t,y,z,nadj,p,p1,p2,p3,q,r,w; - cl_int i,hx,lx,ix; + static const double zero = 0.00000000000000000000e+00; + double t, y, z, nadj, p, p1, p2, p3, q, r, w; + cl_int i, hx, lx, ix; - union{ double d; cl_ulong u;}u; u.d = x; + union { + double d; + cl_ulong u; + } u; + u.d = x; - hx = (cl_int) (u.u >> 32); - lx = (cl_int) (u.u & 0xffffffffULL); + hx = (cl_int)(u.u >> 32); + lx = (cl_int)(u.u & 0xffffffffULL); /* purge off +-inf, NaN, +-0, and negative arguments */ -// *signgamp = 1; - ix = hx&0x7fffffff; - if(ix>=0x7ff00000) return x*x; - if((ix|lx)==0) return INFINITY; - if(ix<0x3b900000) { /* |x|<2**-70, return -log(|x|) */ - if(hx<0) { -// *signgamp = -1; + // *signgamp = 1; + ix = hx & 0x7fffffff; + if (ix >= 0x7ff00000) return x * x; + if ((ix | lx) == 0) return INFINITY; + if (ix < 0x3b900000) + { /* |x|<2**-70, return -log(|x|) */ + if (hx < 0) + { + // *signgamp = -1; return -reference_log(-x); - } else return -reference_log(x); + } + else + return -reference_log(x); } - if(hx<0) { - if(ix>=0x43300000) /* |x|>=2**52, must be -integer */ - return INFINITY; + if (hx < 0) + { + if (ix >= 0x43300000) /* |x|>=2**52, must be -integer */ + return INFINITY; t = reference_sinpi(x); - if(t==zero) return INFINITY; /* -integer */ - nadj = reference_log(pi/reference_fabs(t*x)); -// if(t=0x3FE76944) {y = 1.0-x; i= 0;} - else if(ix>=0x3FCDA661) {y= 
x-(tc-one); i=1;} - else {y = x; i=2;} - } else { - r = zero; - if(ix>=0x3FFBB4C3) {y=2.0-x;i=0;} /* [1.7316,2] */ - else if(ix>=0x3FF3B4C4) {y=x-tc;i=1;} /* [1.23,1.73] */ - else {y=x-one;i=2;} + else if (ix < 0x40000000) + { + if (ix <= 0x3feccccc) + { /* lgamma(x) = lgamma(x+1)-log(x) */ + r = -reference_log(x); + if (ix >= 0x3FE76944) + { + y = 1.0 - x; + i = 0; + } + else if (ix >= 0x3FCDA661) + { + y = x - (tc - one); + i = 1; + } + else + { + y = x; + i = 2; + } } - switch(i) { - case 0: - z = y*y; - p1 = a0+z*(a2+z*(a4+z*(a6+z*(a8+z*a10)))); - p2 = z*(a1+z*(a3+z*(a5+z*(a7+z*(a9+z*a11))))); - p = y*p1+p2; - r += (p-0.5*y); break; - case 1: - z = y*y; - w = z*y; - p1 = t0+w*(t3+w*(t6+w*(t9 +w*t12))); /* parallel comp */ - p2 = t1+w*(t4+w*(t7+w*(t10+w*t13))); - p3 = t2+w*(t5+w*(t8+w*(t11+w*t14))); - p = z*p1-(tt-w*(p2+y*p3)); - r += (tf + p); break; - case 2: - p1 = y*(u0+y*(u1+y*(u2+y*(u3+y*(u4+y*u5))))); - p2 = one+y*(v1+y*(v2+y*(v3+y*(v4+y*v5)))); - r += (-0.5*y + p1/p2); + else + { + r = zero; + if (ix >= 0x3FFBB4C3) + { + y = 2.0 - x; + i = 0; + } /* [1.7316,2] */ + else if (ix >= 0x3FF3B4C4) + { + y = x - tc; + i = 1; + } /* [1.23,1.73] */ + else + { + y = x - one; + i = 2; + } + } + switch (i) + { + case 0: + z = y * y; + p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10)))); + p2 = z + * (a1 + + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11))))); + p = y * p1 + p2; + r += (p - 0.5 * y); + break; + case 1: + z = y * y; + w = z * y; + p1 = t0 + + w + * (t3 + + w * (t6 + w * (t9 + w * t12))); /* parallel comp */ + p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13))); + p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14))); + p = z * p1 - (tt - w * (p2 + y * p3)); + r += (tf + p); + break; + case 2: + p1 = y + * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5))))); + p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5)))); + r += (-0.5 * y + p1 / p2); } } - else if(ix<0x40200000) { /* x < 8.0 */ + else if (ix < 0x40200000) + { /* x < 8.0 
*/ i = (int)x; t = zero; - y = x-(double)i; - p = y*(s0+y*(s1+y*(s2+y*(s3+y*(s4+y*(s5+y*s6)))))); - q = one+y*(r1+y*(r2+y*(r3+y*(r4+y*(r5+y*r6))))); - r = half*y+p/q; - z = one; /* lgamma(1+s) = log(s) + lgamma(s) */ - switch(i) { - case 7: z *= (y+6.0); /* FALLTHRU */ - case 6: z *= (y+5.0); /* FALLTHRU */ - case 5: z *= (y+4.0); /* FALLTHRU */ - case 4: z *= (y+3.0); /* FALLTHRU */ - case 3: z *= (y+2.0); /* FALLTHRU */ - r += reference_log(z); break; + y = x - (double)i; + p = y + * (s0 + + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6)))))); + q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6))))); + r = half * y + p / q; + z = one; /* lgamma(1+s) = log(s) + lgamma(s) */ + switch (i) + { + case 7: z *= (y + 6.0); /* FALLTHRU */ + case 6: z *= (y + 5.0); /* FALLTHRU */ + case 5: z *= (y + 4.0); /* FALLTHRU */ + case 4: z *= (y + 3.0); /* FALLTHRU */ + case 3: + z *= (y + 2.0); /* FALLTHRU */ + r += reference_log(z); + break; } - /* 8.0 <= x < 2**58 */ - } else if (ix < 0x43900000) { + /* 8.0 <= x < 2**58 */ + } + else if (ix < 0x43900000) + { t = reference_log(x); - z = one/x; - y = z*z; - w = w0+z*(w1+y*(w2+y*(w3+y*(w4+y*(w5+y*w6))))); - r = (x-half)*(t-one)+w; - } else - /* 2**58 <= x <= inf */ - r = x*(reference_log(x)-one); - if(hx<0) r = nadj - r; + z = one / x; + y = z * z; + w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6))))); + r = (x - half) * (t - one) + w; + } + else + /* 2**58 <= x <= inf */ + r = x * (reference_log(x) - one); + if (hx < 0) r = nadj - r; return r; - } #endif // _MSC_VER -double reference_assignment( double x ){ return x; } +double reference_assignment(double x) { return x; } -int reference_not( double x ) +int reference_not(double x) { - int r = !x; - return r; + int r = !x; + return r; } #pragma mark - #pragma mark Double testing #ifndef M_PIL - #define M_PIL 3.14159265358979323846264338327950288419716939937510582097494459230781640628620899L +#define M_PIL \ + 
3.14159265358979323846264338327950288419716939937510582097494459230781640628620899L #endif -static long double reduce1l( long double x ); +static long double reduce1l(long double x); #ifdef __PPC__ // Since long double on PPC is really extended precision double arithmetic @@ -1873,36 +2202,35 @@ static long double reduce1l( long double x ); // such that reduction algorithm used for other architectures will not work. // Instead and alternate reduction method is used. -static long double reduce1l( long double x ) +static long double reduce1l(long double x) { - union { - long double ld; - double d[2]; - } u; + union { + long double ld; + double d[2]; + } u; - // Reduce the high and low halfs separately. - u.ld = x; - return ((long double)reduce1(u.d[0]) + reduce1(u.d[1])); + // Reduce the high and low halfs separately. + u.ld = x; + return ((long double)reduce1(u.d[0]) + reduce1(u.d[1])); } #else // !__PPC__ -static long double reduce1l( long double x ) +static long double reduce1l(long double x) { static long double unit_exp = 0; - if( 0.0L == unit_exp ) - unit_exp = scalbnl( 1.0L, LDBL_MANT_DIG); + if (0.0L == unit_exp) unit_exp = scalbnl(1.0L, LDBL_MANT_DIG); - if( reference_fabsl(x) >= unit_exp ) + if (reference_fabsl(x) >= unit_exp) { - if( reference_fabsl(x) == INFINITY ) - return cl_make_nan(); + if (reference_fabsl(x) == INFINITY) return cl_make_nan(); - return 0.0L; //we patch up the sign for sinPi and cosPi later, since they need different signs + return 0.0L; // we patch up the sign for sinPi and cosPi later, since + // they need different signs } // Find the nearest multiple of 2 - const long double r = reference_copysignl( unit_exp, x ); + const long double r = reference_copysignl(unit_exp, x); long double z = x + r; z -= r; @@ -1911,19 +2239,31 @@ static long double reduce1l( long double x ) } #endif // __PPC__ -long double reference_acospil( long double x){ return reference_acosl( x ) / M_PIL; } -long double reference_asinpil( long double x){ return 
reference_asinl( x ) / M_PIL; } -long double reference_atanpil( long double x){ return reference_atanl( x ) / M_PIL; } -long double reference_atan2pil( long double y, long double x){ return reference_atan2l( y, x) / M_PIL; } -long double reference_cospil( long double x) +long double reference_acospil(long double x) { - if( reference_fabsl(x) >= HEX_LDBL( +, 1, 0, +, 54 ) ) + return reference_acosl(x) / M_PIL; +} +long double reference_asinpil(long double x) +{ + return reference_asinl(x) / M_PIL; +} +long double reference_atanpil(long double x) +{ + return reference_atanl(x) / M_PIL; +} +long double reference_atan2pil(long double y, long double x) +{ + return reference_atan2l(y, x) / M_PIL; +} +long double reference_cospil(long double x) +{ + if (reference_fabsl(x) >= HEX_LDBL(+, 1, 0, +, 54)) { - if( reference_fabsl(x) == INFINITY ) - return cl_make_nan(); + if (reference_fabsl(x) == INFINITY) return cl_make_nan(); - //Note this probably fails for odd values between 0x1.0p52 and 0x1.0p53. - //However, when starting with single precision inputs, there will be no odd values. + // Note this probably fails for odd values between 0x1.0p52 and + // 0x1.0p53. However, when starting with single precision inputs, there + // will be no odd values. 
return 1.0L; } @@ -1935,9 +2275,9 @@ long double reference_cospil( long double x) // phase adjust double xhi = 0.0; double xlo = 0.0; - xhi = (double) x + 0.5; + xhi = (double)x + 0.5; - if(reference_fabsl(x) > 0.5L) + if (reference_fabsl(x) > 0.5L) { xlo = xhi - x; xlo = 0.5 - xlo; @@ -1949,61 +2289,69 @@ long double reference_cospil( long double x) } // reduce to [-0.5, 0.5] - if( xhi < -0.5 ) + if (xhi < -0.5) { xhi = -1.0 - xhi; xlo = -xlo; } - else if ( xhi > 0.5 ) + else if (xhi > 0.5) { xhi = 1.0 - xhi; xlo = -xlo; } // cosPi zeros are all +0 - if( xhi == 0.0 && xlo == 0.0 ) - return 0.0; + if (xhi == 0.0 && xlo == 0.0) return 0.0; xhi *= M_PI; xlo *= M_PI; xhi += xlo; - return reference_sinl( xhi ); + return reference_sinl(xhi); #else // phase adjust x += 0.5L; // reduce to [-0.5, 0.5] - if( x < -0.5L ) + if (x < -0.5L) x = -1.0L - x; - else if ( x > 0.5L ) + else if (x > 0.5L) x = 1.0L - x; // cosPi zeros are all +0 - if( x == 0.0L ) - return 0.0L; + if (x == 0.0L) return 0.0L; - return reference_sinl( x * M_PIL ); + return reference_sinl(x * M_PIL); #endif } -long double reference_dividel( long double x, long double y) +long double reference_dividel(long double x, long double y) { double dx = x; double dy = y; - return dx/dy; + return dx / dy; } -typedef struct{ double hi, lo; } double_double; - -// Split doubles_double into a series of consecutive 26-bit precise doubles and a remainder. -// Note for later -- for multiplication, it might be better to split each double into a power of two and two 26 bit portions -// multiplication of a double double by a known power of two is cheap. The current approach causes some inexact arithmetic in mul_dd. -static inline void split_dd( double_double x, double_double *hi, double_double *lo ) +typedef struct { - union{ double d; cl_ulong u;}u; + double hi, lo; +} double_double; + +// Split doubles_double into a series of consecutive 26-bit precise doubles and +// a remainder. 
Note for later -- for multiplication, it might be better to +// split each double into a power of two and two 26 bit portions +// multiplication of a double double by a known power of +// two is cheap. The current approach causes some inexact +// arithmetic in mul_dd. +static inline void split_dd(double_double x, double_double *hi, + double_double *lo) +{ + union { + double d; + cl_ulong u; + } u; u.d = x.hi; u.u &= 0xFFFFFFFFF8000000ULL; hi->hi = u.d; @@ -2025,10 +2373,10 @@ static inline void split_dd( double_double x, double_double *hi, double_double * lo->lo = x.hi + x.lo; } -static inline double_double accum_d( double_double a, double b ) +static inline double_double accum_d(double_double a, double b) { double temp; - if( fabs(b) > fabs(a.hi) ) + if (fabs(b) > fabs(a.hi)) { temp = a.hi; a.hi += b; @@ -2041,47 +2389,45 @@ static inline double_double accum_d( double_double a, double b ) a.lo += b - (a.hi - temp); } - if( isnan( a.lo ) ) - a.lo = 0.0; + if (isnan(a.lo)) a.lo = 0.0; return a; } -static inline double_double add_dd( double_double a, double_double b ) +static inline double_double add_dd(double_double a, double_double b) { - double_double r = {-0.0 -0.0 }; + double_double r = { -0.0 - 0.0 }; - if( isinf(a.hi) || isinf( b.hi ) || - isnan(a.hi) || isnan( b.hi ) || - 0.0 == a.hi || 0.0 == b.hi ) + if (isinf(a.hi) || isinf(b.hi) || isnan(a.hi) || isnan(b.hi) || 0.0 == a.hi + || 0.0 == b.hi) { r.hi = a.hi + b.hi; r.lo = a.lo + b.lo; - if( isnan( r.lo ) ) - r.lo = 0.0; + if (isnan(r.lo)) r.lo = 0.0; return r; } - //merge sort terms by magnitude -- here we assume that |a.hi| > |a.lo|, |b.hi| > |b.lo|, so we don't have to do the first merge pass + // merge sort terms by magnitude -- here we assume that |a.hi| > |a.lo|, + // |b.hi| > |b.lo|, so we don't have to do the first merge pass double terms[4] = { a.hi, b.hi, a.lo, b.lo }; double temp; - //Sort hi terms - if( fabs(terms[0]) < fabs(terms[1]) ) + // Sort hi terms + if (fabs(terms[0]) < fabs(terms[1])) { 
temp = terms[0]; terms[0] = terms[1]; terms[1] = temp; } - //sort lo terms - if( fabs(terms[2]) < fabs(terms[3]) ) + // sort lo terms + if (fabs(terms[2]) < fabs(terms[3])) { temp = terms[2]; terms[2] = terms[3]; terms[3] = temp; } // Fix case where small high term is less than large low term - if( fabs(terms[1]) < fabs(terms[2]) ) + if (fabs(terms[1]) < fabs(terms[2])) { temp = terms[1]; terms[1] = terms[2]; @@ -2104,42 +2450,40 @@ static inline double_double add_dd( double_double a, double_double b ) temp = r.hi; r.hi += r.lo; r.lo = r.lo - (r.hi - temp); - if( isnan( r.lo ) ) - r.lo = 0.0; + if (isnan(r.lo)) r.lo = 0.0; return r; } -static inline double_double mul_dd( double_double a, double_double b ) +static inline double_double mul_dd(double_double a, double_double b) { - double_double result = {-0.0,-0.0}; + double_double result = { -0.0, -0.0 }; // Inf, nan and 0 - if( isnan( a.hi ) || isnan( b.hi ) || - isinf( a.hi ) || isinf( b.hi ) || - 0.0 == a.hi || 0.0 == b.hi ) + if (isnan(a.hi) || isnan(b.hi) || isinf(a.hi) || isinf(b.hi) || 0.0 == a.hi + || 0.0 == b.hi) { result.hi = a.hi * b.hi; return result; } double_double ah, al, bh, bl; - split_dd( a, &ah, &al ); - split_dd( b, &bh, &bl ); + split_dd(a, &ah, &al); + split_dd(b, &bh, &bl); - double p0 = ah.hi * bh.hi; // exact (52 bits in product) 0 - double p1 = ah.hi * bh.lo; // exact (52 bits in product) 26 - double p2 = ah.lo * bh.hi; // exact (52 bits in product) 26 - double p3 = ah.lo * bh.lo; // exact (52 bits in product) 52 - double p4 = al.hi * bh.hi; // exact (52 bits in product) 52 - double p5 = al.hi * bh.lo; // exact (52 bits in product) 78 - double p6 = al.lo * bh.hi; // inexact (54 bits in product) 78 - double p7 = al.lo * bh.lo; // inexact (54 bits in product) 104 - double p8 = ah.hi * bl.hi; // exact (52 bits in product) 52 - double p9 = ah.hi * bl.lo; // inexact (54 bits in product) 78 - double pA = ah.lo * bl.hi; // exact (52 bits in product) 78 - double pB = ah.lo * bl.lo; // inexact (54 
bits in product) 104 - double pC = al.hi * bl.hi; // exact (52 bits in product) 104 + double p0 = ah.hi * bh.hi; // exact (52 bits in product) 0 + double p1 = ah.hi * bh.lo; // exact (52 bits in product) 26 + double p2 = ah.lo * bh.hi; // exact (52 bits in product) 26 + double p3 = ah.lo * bh.lo; // exact (52 bits in product) 52 + double p4 = al.hi * bh.hi; // exact (52 bits in product) 52 + double p5 = al.hi * bh.lo; // exact (52 bits in product) 78 + double p6 = al.lo * bh.hi; // inexact (54 bits in product) 78 + double p7 = al.lo * bh.lo; // inexact (54 bits in product) 104 + double p8 = ah.hi * bl.hi; // exact (52 bits in product) 52 + double p9 = ah.hi * bl.lo; // inexact (54 bits in product) 78 + double pA = ah.lo * bl.hi; // exact (52 bits in product) 78 + double pB = ah.lo * bl.lo; // inexact (54 bits in product) 104 + double pC = al.hi * bl.hi; // exact (52 bits in product) 104 // the last 3 terms are two low to appear in the result @@ -2169,46 +2513,60 @@ static inline double_double mul_dd( double_double a, double_double b ) return result; #else - // take advantage of the known relative magnitudes of the partial products to avoid some sorting - // Combine 2**-78 and 2**-104 terms. Here we are a bit sloppy about canonicalizing the double_doubles + // take advantage of the known relative magnitudes of the partial products + // to avoid some sorting Combine 2**-78 and 2**-104 terms. Here we are a bit + // sloppy about canonicalizing the double_doubles double_double t0 = { pA, pC }; double_double t1 = { p9, pB }; double_double t2 = { p6, p7 }; double temp0, temp1, temp2; - t0 = accum_d( t0, p5 ); // there is an extra 2**-78 term to deal with + t0 = accum_d(t0, p5); // there is an extra 2**-78 term to deal with - // Add in 2**-52 terms. 
Here we are a bit sloppy about canonicalizing the double_doubles - temp0 = t0.hi; temp1 = t1.hi; temp2 = t2.hi; - t0.hi += p3; t1.hi += p4; t2.hi += p8; - temp0 -= t0.hi-p3; temp1 -= t1.hi-p4; temp2 -= t2.hi - p8; - t0.lo += temp0; t1.lo += temp1; t2.lo += temp2; + // Add in 2**-52 terms. Here we are a bit sloppy about canonicalizing the + // double_doubles + temp0 = t0.hi; + temp1 = t1.hi; + temp2 = t2.hi; + t0.hi += p3; + t1.hi += p4; + t2.hi += p8; + temp0 -= t0.hi - p3; + temp1 -= t1.hi - p4; + temp2 -= t2.hi - p8; + t0.lo += temp0; + t1.lo += temp1; + t2.lo += temp2; - // Add in 2**-26 terms. Here we are a bit sloppy about canonicalizing the double_doubles - temp1 = t1.hi; temp2 = t2.hi; - t1.hi += p1; t2.hi += p2; - temp1 -= t1.hi-p1; temp2 -= t2.hi - p2; - t1.lo += temp1; t2.lo += temp2; + // Add in 2**-26 terms. Here we are a bit sloppy about canonicalizing the + // double_doubles + temp1 = t1.hi; + temp2 = t2.hi; + t1.hi += p1; + t2.hi += p2; + temp1 -= t1.hi - p1; + temp2 -= t2.hi - p2; + t1.lo += temp1; + t2.lo += temp2; // Combine accumulators to get the low bits of result - t1 = add_dd( t1, add_dd( t2, t0 ) ); + t1 = add_dd(t1, add_dd(t2, t0)); // Add in MSB's, and round to precision - return accum_d( t1, p0 ); // canonicalizes + return accum_d(t1, p0); // canonicalizes #endif - } -long double reference_exp10l( long double z ) +long double reference_exp10l(long double z) { - const double_double log2_10 = { HEX_DBL( +, 1, a934f0979a371, +, 1 ), HEX_DBL( +, 1, 7f2495fb7fa6d, -, 53 ) }; + const double_double log2_10 = { HEX_DBL(+, 1, a934f0979a371, +, 1), + HEX_DBL(+, 1, 7f2495fb7fa6d, -, 53) }; double_double x; int j; // Handle NaNs - if( isnan(z) ) - return z; + if (isnan(z)) return z; // init x x.hi = z; @@ -2217,172 +2575,195 @@ long double reference_exp10l( long double z ) // 10**x = exp2( x * log2(10) ) - x = mul_dd( x, log2_10); // x * log2(10) + x = mul_dd(x, log2_10); // x * log2(10) - //Deal with overflow and underflow for exp2(x) stage next - 
if( x.hi >= 1025 ) - return INFINITY; + // Deal with overflow and underflow for exp2(x) stage next + if (x.hi >= 1025) return INFINITY; - if( x.hi < -1075-24 ) - return +0.0; + if (x.hi < -1075 - 24) return +0.0; // find nearest integer to x - int i = (int) rint(x.hi); + int i = (int)rint(x.hi); // x now holds fractional part. The result would be then 2**i * exp2( x ) x.hi -= i; - // We could attempt to find a minimax polynomial for exp2(x) over the range x = [-0.5, 0.5]. - // However, this would converge very slowly near the extrema, where 0.5**n is not a lot different - // from 0.5**(n+1), thereby requiring something like a 20th order polynomial to get 53 + 24 bits - // of precision. Instead we further reduce the range to [-1/32, 1/32] by observing that + // We could attempt to find a minimax polynomial for exp2(x) over the range + // x = [-0.5, 0.5]. However, this would converge very slowly near the + // extrema, where 0.5**n is not a lot different from 0.5**(n+1), thereby + // requiring something like a 20th order polynomial to get 53 + 24 bits of + // precision. Instead we further reduce the range to [-1/32, 1/32] by + // observing that // // 2**(a+b) = 2**a * 2**b // - // We can thus build a table of 2**a values for a = n/16, n = [-8, 8], and reduce the range - // of x to [-1/32, 1/32] by subtracting away the nearest value of n/16 from x. 
- const double_double corrections[17] = - { - { HEX_DBL( +, 1, 6a09e667f3bcd, -, 1 ), HEX_DBL( -, 1, bdd3413b26456, -, 55 ) }, - { HEX_DBL( +, 1, 7a11473eb0187, -, 1 ), HEX_DBL( -, 1, 41577ee04992f, -, 56 ) }, - { HEX_DBL( +, 1, 8ace5422aa0db, -, 1 ), HEX_DBL( +, 1, 6e9f156864b27, -, 55 ) }, - { HEX_DBL( +, 1, 9c49182a3f09, -, 1 ), HEX_DBL( +, 1, c7c46b071f2be, -, 57 ) }, - { HEX_DBL( +, 1, ae89f995ad3ad, -, 1 ), HEX_DBL( +, 1, 7a1cd345dcc81, -, 55 ) }, - { HEX_DBL( +, 1, c199bdd85529c, -, 1 ), HEX_DBL( +, 1, 11065895048dd, -, 56 ) }, - { HEX_DBL( +, 1, d5818dcfba487, -, 1 ), HEX_DBL( +, 1, 2ed02d75b3707, -, 56 ) }, - { HEX_DBL( +, 1, ea4afa2a490da, -, 1 ), HEX_DBL( -, 1, e9c23179c2893, -, 55 ) }, - { HEX_DBL( +, 1, 0, +, 0 ), HEX_DBL( +, 0, 0, +, 0 ) }, - { HEX_DBL( +, 1, 0b5586cf9890f, +, 0 ), HEX_DBL( +, 1, 8a62e4adc610b, -, 54 ) }, - { HEX_DBL( +, 1, 172b83c7d517b, +, 0 ), HEX_DBL( -, 1, 19041b9d78a76, -, 55 ) }, - { HEX_DBL( +, 1, 2387a6e756238, +, 0 ), HEX_DBL( +, 1, 9b07eb6c70573, -, 54 ) }, - { HEX_DBL( +, 1, 306fe0a31b715, +, 0 ), HEX_DBL( +, 1, 6f46ad23182e4, -, 55 ) }, - { HEX_DBL( +, 1, 3dea64c123422, +, 0 ), HEX_DBL( +, 1, ada0911f09ebc, -, 55 ) }, - { HEX_DBL( +, 1, 4bfdad5362a27, +, 0 ), HEX_DBL( +, 1, d4397afec42e2, -, 56 ) }, - { HEX_DBL( +, 1, 5ab07dd485429, +, 0 ), HEX_DBL( +, 1, 6324c054647ad, -, 54 ) }, - { HEX_DBL( +, 1, 6a09e667f3bcd, +, 0 ), HEX_DBL( -, 1, bdd3413b26456, -, 54 ) } + // We can thus build a table of 2**a values for a = n/16, n = [-8, 8], and + // reduce the range of x to [-1/32, 1/32] by subtracting away the nearest + // value of n/16 from x. 
+ const double_double corrections[17] = { + { HEX_DBL(+, 1, 6a09e667f3bcd, -, 1), + HEX_DBL(-, 1, bdd3413b26456, -, 55) }, + { HEX_DBL(+, 1, 7a11473eb0187, -, 1), + HEX_DBL(-, 1, 41577ee04992f, -, 56) }, + { HEX_DBL(+, 1, 8ace5422aa0db, -, 1), + HEX_DBL(+, 1, 6e9f156864b27, -, 55) }, + { HEX_DBL(+, 1, 9c49182a3f09, -, 1), + HEX_DBL(+, 1, c7c46b071f2be, -, 57) }, + { HEX_DBL(+, 1, ae89f995ad3ad, -, 1), + HEX_DBL(+, 1, 7a1cd345dcc81, -, 55) }, + { HEX_DBL(+, 1, c199bdd85529c, -, 1), + HEX_DBL(+, 1, 11065895048dd, -, 56) }, + { HEX_DBL(+, 1, d5818dcfba487, -, 1), + HEX_DBL(+, 1, 2ed02d75b3707, -, 56) }, + { HEX_DBL(+, 1, ea4afa2a490da, -, 1), + HEX_DBL(-, 1, e9c23179c2893, -, 55) }, + { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) }, + { HEX_DBL(+, 1, 0b5586cf9890f, +, 0), + HEX_DBL(+, 1, 8a62e4adc610b, -, 54) }, + { HEX_DBL(+, 1, 172b83c7d517b, +, 0), + HEX_DBL(-, 1, 19041b9d78a76, -, 55) }, + { HEX_DBL(+, 1, 2387a6e756238, +, 0), + HEX_DBL(+, 1, 9b07eb6c70573, -, 54) }, + { HEX_DBL(+, 1, 306fe0a31b715, +, 0), + HEX_DBL(+, 1, 6f46ad23182e4, -, 55) }, + { HEX_DBL(+, 1, 3dea64c123422, +, 0), + HEX_DBL(+, 1, ada0911f09ebc, -, 55) }, + { HEX_DBL(+, 1, 4bfdad5362a27, +, 0), + HEX_DBL(+, 1, d4397afec42e2, -, 56) }, + { HEX_DBL(+, 1, 5ab07dd485429, +, 0), + HEX_DBL(+, 1, 6324c054647ad, -, 54) }, + { HEX_DBL(+, 1, 6a09e667f3bcd, +, 0), + HEX_DBL(-, 1, bdd3413b26456, -, 54) } }; - int index = (int) rint( x.hi * 16.0 ); - x.hi -= (double) index * 0.0625; + int index = (int)rint(x.hi * 16.0); + x.hi -= (double)index * 0.0625; // canonicalize x double temp = x.hi; x.hi += x.lo; x.lo -= x.hi - temp; - // Minimax polynomial for (exp2(x)-1)/x, over the range [-1/32, 1/32]. 
Max Error: 2 * 0x1.e112p-87 - const double_double c[] = { - {HEX_DBL( +, 1, 62e42fefa39ef, -, 1 ), HEX_DBL( +, 1, abc9e3ac1d244, -, 56 )}, - {HEX_DBL( +, 1, ebfbdff82c58f, -, 3 ), HEX_DBL( -, 1, 5e4987a631846, -, 57 )}, - {HEX_DBL( +, 1, c6b08d704a0c, -, 5 ), HEX_DBL( -, 1, d323200a05713, -, 59 )}, - {HEX_DBL( +, 1, 3b2ab6fba4e7a, -, 7 ), HEX_DBL( +, 1, c5ee8f8b9f0c1, -, 63 )}, - {HEX_DBL( +, 1, 5d87fe78a672a, -, 10 ), HEX_DBL( +, 1, 884e5e5cc7ecc, -, 64 )}, - {HEX_DBL( +, 1, 430912f7e8373, -, 13 ), HEX_DBL( +, 1, 4f1b59514a326, -, 67 )}, - {HEX_DBL( +, 1, ffcbfc5985e71, -, 17 ), HEX_DBL( -, 1, db7d6a0953b78, -, 71 )}, - {HEX_DBL( +, 1, 62c150eb16465, -, 20 ), HEX_DBL( +, 1, e0767c2d7abf5, -, 80 )}, - {HEX_DBL( +, 1, b52502b5e953, -, 24 ), HEX_DBL( +, 1, 6797523f944bc, -, 78 )} - }; - size_t count = sizeof( c ) / sizeof( c[0] ); + // Minimax polynomial for (exp2(x)-1)/x, over the range [-1/32, 1/32]. Max + // Error: 2 * 0x1.e112p-87 + const double_double c[] = { { HEX_DBL(+, 1, 62e42fefa39ef, -, 1), + HEX_DBL(+, 1, abc9e3ac1d244, -, 56) }, + { HEX_DBL(+, 1, ebfbdff82c58f, -, 3), + HEX_DBL(-, 1, 5e4987a631846, -, 57) }, + { HEX_DBL(+, 1, c6b08d704a0c, -, 5), + HEX_DBL(-, 1, d323200a05713, -, 59) }, + { HEX_DBL(+, 1, 3b2ab6fba4e7a, -, 7), + HEX_DBL(+, 1, c5ee8f8b9f0c1, -, 63) }, + { HEX_DBL(+, 1, 5d87fe78a672a, -, 10), + HEX_DBL(+, 1, 884e5e5cc7ecc, -, 64) }, + { HEX_DBL(+, 1, 430912f7e8373, -, 13), + HEX_DBL(+, 1, 4f1b59514a326, -, 67) }, + { HEX_DBL(+, 1, ffcbfc5985e71, -, 17), + HEX_DBL(-, 1, db7d6a0953b78, -, 71) }, + { HEX_DBL(+, 1, 62c150eb16465, -, 20), + HEX_DBL(+, 1, e0767c2d7abf5, -, 80) }, + { HEX_DBL(+, 1, b52502b5e953, -, 24), + HEX_DBL(+, 1, 6797523f944bc, -, 78) } }; + size_t count = sizeof(c) / sizeof(c[0]); // Do polynomial - double_double r = c[count-1]; - for( j = (int) count-2; j >= 0; j-- ) - r = add_dd( c[j], mul_dd( r, x ) ); + double_double r = c[count - 1]; + for (j = (int)count - 2; j >= 0; j--) r = add_dd(c[j], mul_dd(r, x)); // unwind 
approximation - r = mul_dd( r, x ); // before: r =(exp2(x)-1)/x; after: r = exp2(x) - 1 + r = mul_dd(r, x); // before: r =(exp2(x)-1)/x; after: r = exp2(x) - 1 // correct for [-0.5, 0.5] -> [-1/32, 1/32] reduction above // exp2(x) = (r + 1) * correction = r * correction + correction - r = mul_dd( r, corrections[index+8] ); - r = add_dd( r, corrections[index+8] ); + r = mul_dd(r, corrections[index + 8]); + r = add_dd(r, corrections[index + 8]); -// Format result for output: + // Format result for output: // Get mantissa - long double m = ((long double) r.hi + (long double) r.lo ); + long double m = ((long double)r.hi + (long double)r.lo); // Handle a pesky overflow cases when long double = double - if( i > 512 ) + if (i > 512) { - m *= HEX_DBL( +, 1, 0, +, 512 ); + m *= HEX_DBL(+, 1, 0, +, 512); i -= 512; } - else if( i < -512 ) + else if (i < -512) { - m *= HEX_DBL( +, 1, 0, -, 512 ); + m *= HEX_DBL(+, 1, 0, -, 512); i += 512; } - return m * ldexpl( 1.0L, i ); + return m * ldexpl(1.0L, i); } -static double fallback_frexp( double x, int *iptr ) +static double fallback_frexp(double x, int *iptr) { cl_ulong u, v; double fu, fv; - memcpy( &u, &x, sizeof(u)); + memcpy(&u, &x, sizeof(u)); - cl_ulong exponent = u & 0x7ff0000000000000ULL; + cl_ulong exponent = u & 0x7ff0000000000000ULL; cl_ulong mantissa = u & ~0x7ff0000000000000ULL; // add 1 to the exponent exponent += 0x0010000000000000ULL; - if( (cl_long) exponent < (cl_long) 0x0020000000000000LL ) + if ((cl_long)exponent < (cl_long)0x0020000000000000LL) { // subnormal, NaN, Inf mantissa |= 0x3fe0000000000000ULL; v = mantissa & 0xfff0000000000000ULL; u = mantissa; - memcpy( &fv, &v, sizeof(v)); - memcpy( &fu, &u, sizeof(u)); + memcpy(&fv, &v, sizeof(v)); + memcpy(&fu, &u, sizeof(u)); fu -= fv; - memcpy( &v, &fv, sizeof(v)); - memcpy( &u, &fu, sizeof(u)); + memcpy(&v, &fv, sizeof(v)); + memcpy(&u, &fu, sizeof(u)); - exponent = u & 0x7ff0000000000000ULL; + exponent = u & 0x7ff0000000000000ULL; mantissa = u & 
~0x7ff0000000000000ULL; - *iptr = (exponent >> 52) + (-1022 + 1 -1022); + *iptr = (exponent >> 52) + (-1022 + 1 - 1022); u = mantissa | 0x3fe0000000000000ULL; - memcpy( &fu, &u, sizeof(u)); + memcpy(&fu, &u, sizeof(u)); return fu; } *iptr = (exponent >> 52) - 1023; u = mantissa | 0x3fe0000000000000ULL; - memcpy( &fu, &u, sizeof(u)); + memcpy(&fu, &u, sizeof(u)); return fu; } // Assumes zeros, infinities and NaNs handed elsewhere -static inline int extract( double x, cl_ulong *mant ); -static inline int extract( double x, cl_ulong *mant ) +static inline int extract(double x, cl_ulong *mant); +static inline int extract(double x, cl_ulong *mant) { - static double (*frexpp)(double, int*) = NULL; + static double (*frexpp)(double, int *) = NULL; int e; // verify that frexp works properly - if( NULL == frexpp ) + if (NULL == frexpp) { - if( 0.5 == frexp( HEX_DBL( +, 1, 0, -, 1030 ), &e ) && e == -1029 ) + if (0.5 == frexp(HEX_DBL(+, 1, 0, -, 1030), &e) && e == -1029) frexpp = frexp; else frexpp = fallback_frexp; } - *mant = (cl_ulong) (HEX_DBL( +, 1, 0, +, 64 ) * fabs( frexpp( x, &e ))); + *mant = (cl_ulong)(HEX_DBL(+, 1, 0, +, 64) * fabs(frexpp(x, &e))); return e - 1; } // Return 128-bit product of a*b as (hi << 64) + lo -static inline void mul128( cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo ); -static inline void mul128( cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo ) +static inline void mul128(cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo); +static inline void mul128(cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo) { cl_ulong alo = a & 0xffffffffULL; cl_ulong ahi = a >> 32; @@ -2393,16 +2774,22 @@ static inline void mul128( cl_ulong a, cl_ulong b, cl_ulong *hi, cl_ulong *lo ) cl_ulong ahiblo = ahi * blo; cl_ulong ahibhi = ahi * bhi; - alobhi += (aloblo >> 32) + (ahiblo & 0xffffffffULL); // cannot overflow: (2^32-1)^2 + 2 * (2^32-1) = (2^64 - 2^33 + 1) + (2^33 - 2) = 2^64 - 1 - *hi = ahibhi + (alobhi >> 32) + (ahiblo >> 32); // cannot overflow: 
(2^32-1)^2 + 2 * (2^32-1) = (2^64 - 2^33 + 1) + (2^33 - 2) = 2^64 - 1 + alobhi += (aloblo >> 32) + + (ahiblo + & 0xffffffffULL); // cannot overflow: (2^32-1)^2 + 2 * (2^32-1) = + // (2^64 - 2^33 + 1) + (2^33 - 2) = 2^64 - 1 + *hi = ahibhi + (alobhi >> 32) + + (ahiblo >> 32); // cannot overflow: (2^32-1)^2 + 2 * (2^32-1) = + // (2^64 - 2^33 + 1) + (2^33 - 2) = 2^64 - 1 *lo = (aloblo & 0xffffffffULL) | (alobhi << 32); } // Move the most significant non-zero bit to the MSB -// Note: not general. Only works if the most significant non-zero bit is at MSB-1 -static inline void renormalize( cl_ulong *hi, cl_ulong *lo, int *exponent ) +// Note: not general. Only works if the most significant non-zero bit is at +// MSB-1 +static inline void renormalize(cl_ulong *hi, cl_ulong *lo, int *exponent) { - if( 0 == (0x8000000000000000ULL & *hi )) + if (0 == (0x8000000000000000ULL & *hi)) { *hi <<= 1; *hi |= *lo >> 63; @@ -2411,74 +2798,84 @@ static inline void renormalize( cl_ulong *hi, cl_ulong *lo, int *exponent ) } } -static double round_to_nearest_even_double( cl_ulong hi, cl_ulong lo, int exponent ); -static double round_to_nearest_even_double( cl_ulong hi, cl_ulong lo, int exponent ) +static double round_to_nearest_even_double(cl_ulong hi, cl_ulong lo, + int exponent); +static double round_to_nearest_even_double(cl_ulong hi, cl_ulong lo, + int exponent) { - union{ cl_ulong u; cl_double d;} u; + union { + cl_ulong u; + cl_double d; + } u; // edges - if( exponent > 1023 ) return INFINITY; - if( exponent == -1075 && (hi | (lo!=0)) > 0x8000000000000000ULL ) - return HEX_DBL( +, 1, 0, -, 1074 ); - if( exponent <= -1075 ) return 0.0; + if (exponent > 1023) return INFINITY; + if (exponent == -1075 && (hi | (lo != 0)) > 0x8000000000000000ULL) + return HEX_DBL(+, 1, 0, -, 1074); + if (exponent <= -1075) return 0.0; - //Figure out which bits go where + // Figure out which bits go where int shift = 11; - if( exponent < -1022 ) + if (exponent < -1022) { - shift -= 1022 + exponent; // 
subnormal: shift is not 52 - exponent = -1023; // set exponent to 0 + shift -= 1022 + exponent; // subnormal: shift is not 52 + exponent = -1023; // set exponent to 0 } else - hi &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove it. + hi &= 0x7fffffffffffffffULL; // normal: leading bit is implicit. Remove + // it. // Assemble the double (round toward zero) - u.u = (hi >> shift) | ((cl_ulong) (exponent + 1023) << 52); + u.u = (hi >> shift) | ((cl_ulong)(exponent + 1023) << 52); // put a representation of the residual bits into hi - hi <<= (64-shift); + hi <<= (64 - shift); hi |= lo >> shift; - lo <<= (64-shift ); + lo <<= (64 - shift); hi |= lo != 0; - //round to nearest, ties to even - if( hi < 0x8000000000000000ULL ) return u.d; - if( hi == 0x8000000000000000ULL ) u.u += u.u & 1ULL; - else u.u++; + // round to nearest, ties to even + if (hi < 0x8000000000000000ULL) return u.d; + if (hi == 0x8000000000000000ULL) + u.u += u.u & 1ULL; + else + u.u++; return u.d; } -// Shift right. Bits lost on the right will be OR'd together and OR'd with the LSB -static inline void shift_right_sticky_128( cl_ulong *hi, cl_ulong *lo, int shift ); -static inline void shift_right_sticky_128( cl_ulong *hi, cl_ulong *lo, int shift ) +// Shift right. 
Bits lost on the right will be OR'd together and OR'd with the +// LSB +static inline void shift_right_sticky_128(cl_ulong *hi, cl_ulong *lo, + int shift); +static inline void shift_right_sticky_128(cl_ulong *hi, cl_ulong *lo, int shift) { cl_ulong sticky = 0; cl_ulong h = *hi; cl_ulong l = *lo; - if( shift >= 64 ) + if (shift >= 64) { shift -= 64; sticky = 0 != lo; l = h; h = 0; - if( shift >= 64 ) + if (shift >= 64) { sticky |= (0 != l); l = 0; } else { - sticky |= (0 != (l << (64-shift))); + sticky |= (0 != (l << (64 - shift))); l >>= shift; } } else { - sticky |= (0 != (l << (64-shift))); + sticky |= (0 != (l << (64 - shift))); l >>= shift; - l |= h << (64-shift); + l |= h << (64 - shift); h >>= shift; } @@ -2487,9 +2884,12 @@ static inline void shift_right_sticky_128( cl_ulong *hi, cl_ulong *lo, int shift } // 128-bit add of ((*hi << 64) + *lo) + ((chi << 64) + clo) -// If the 129 bit result doesn't fit, bits lost off the right end will be OR'd with the LSB -static inline void add128( cl_ulong *hi, cl_ulong *lo, cl_ulong chi, cl_ulong clo, int *exp ); -static inline void add128( cl_ulong *hi, cl_ulong *lo, cl_ulong chi, cl_ulong clo, int *exponent ) +// If the 129 bit result doesn't fit, bits lost off the right end will be OR'd +// with the LSB +static inline void add128(cl_ulong *hi, cl_ulong *lo, cl_ulong chi, + cl_ulong clo, int *exp); +static inline void add128(cl_ulong *hi, cl_ulong *lo, cl_ulong chi, + cl_ulong clo, int *exponent) { cl_ulong carry, carry2; // extended precision add @@ -2497,15 +2897,16 @@ static inline void add128( cl_ulong *hi, cl_ulong *lo, cl_ulong chi, cl_ulong cl chi = add_carry(*hi, chi, &carry2); chi = add_carry(chi, carry, &carry); - //If we overflowed the 128 bit result - if( carry || carry2 ) + // If we overflowed the 128 bit result + if (carry || carry2) { - carry = clo & 1; // set aside low bit - clo >>= 1; // right shift low 1 - clo |= carry; // or back in the low bit, so we don't come to believe this is an exact half way 
case for rounding - clo |= chi << 63; // move lowest high bit into highest bit of lo - chi >>= 1; // right shift hi - chi |= 0x8000000000000000ULL; // move the carry bit into hi. + carry = clo & 1; // set aside low bit + clo >>= 1; // right shift low 1 + clo |= carry; // or back in the low bit, so we don't come to believe + // this is an exact half way case for rounding + clo |= chi << 63; // move lowest high bit into highest bit of lo + chi >>= 1; // right shift hi + chi |= 0x8000000000000000ULL; // move the carry bit into hi. *exponent = *exponent + 1; } @@ -2514,48 +2915,51 @@ static inline void add128( cl_ulong *hi, cl_ulong *lo, cl_ulong chi, cl_ulong cl } // 128-bit subtract of ((chi << 64) + clo) - ((*hi << 64) + *lo) -static inline void sub128( cl_ulong *chi, cl_ulong *clo, cl_ulong hi, cl_ulong lo, cl_ulong *signC, int *expC ); -static inline void sub128( cl_ulong *chi, cl_ulong *clo, cl_ulong hi, cl_ulong lo, cl_ulong *signC, int *expC ) +static inline void sub128(cl_ulong *chi, cl_ulong *clo, cl_ulong hi, + cl_ulong lo, cl_ulong *signC, int *expC); +static inline void sub128(cl_ulong *chi, cl_ulong *clo, cl_ulong hi, + cl_ulong lo, cl_ulong *signC, int *expC) { cl_ulong rHi = *chi; cl_ulong rLo = *clo; cl_ulong carry, carry2; - //extended precision subtract + // extended precision subtract rLo = sub_carry(rLo, lo, &carry); rHi = sub_carry(rHi, hi, &carry2); rHi = sub_carry(rHi, carry, &carry); // Check for sign flip - if( carry || carry2 ) + if (carry || carry2) { *signC ^= 0x8000000000000000ULL; - //negate rLo, rHi: -x = (x ^ -1) + 1 + // negate rLo, rHi: -x = (x ^ -1) + 1 rLo ^= -1ULL; rHi ^= -1ULL; rLo++; rHi += 0 == rLo; } - // normalize -- move the most significant non-zero bit to the MSB, and adjust exponent accordingly - if( rHi == 0 ) + // normalize -- move the most significant non-zero bit to the MSB, and + // adjust exponent accordingly + if (rHi == 0) { rHi = rLo; *expC = *expC - 64; rLo = 0; } - if( rHi ) + if (rHi) { int shift = 32; cl_ulong 
test = 1ULL << 32; - while( 0 == (rHi & 0x8000000000000000ULL)) + while (0 == (rHi & 0x8000000000000000ULL)) { - if( rHi < test ) + if (rHi < test) { rHi <<= shift; - rHi |= rLo >> (64-shift); + rHi |= rLo >> (64 - shift); rLo <<= shift; *expC = *expC - shift; } @@ -2565,7 +2969,7 @@ static inline void sub128( cl_ulong *chi, cl_ulong *clo, cl_ulong hi, cl_ulong l } else { - //zero + // zero *expC = INT_MIN; *signC = 0; } @@ -2575,7 +2979,7 @@ static inline void sub128( cl_ulong *chi, cl_ulong *clo, cl_ulong hi, cl_ulong l *clo = rLo; } -long double reference_fmal( long double x, long double y, long double z) +long double reference_fmal(long double x, long double y, long double z) { static const cl_ulong kMSB = 0x8000000000000000ULL; @@ -2585,75 +2989,91 @@ long double reference_fmal( long double x, long double y, long double z) double c = z; // Make bits accessible - union{ cl_ulong u; cl_double d; } ua; ua.d = a; - union{ cl_ulong u; cl_double d; } ub; ub.d = b; - union{ cl_ulong u; cl_double d; } uc; uc.d = c; + union { + cl_ulong u; + cl_double d; + } ua; + ua.d = a; + union { + cl_ulong u; + cl_double d; + } ub; + ub.d = b; + union { + cl_ulong u; + cl_double d; + } uc; + uc.d = c; // deal with Nans, infinities and zeros - if( isnan( a ) || isnan( b ) || isnan(c) || - isinf( a ) || isinf( b ) || isinf(c) || - 0 == ( ua.u & ~kMSB) || // a == 0, defeat host FTZ behavior - 0 == ( ub.u & ~kMSB) || // b == 0, defeat host FTZ behavior - 0 == ( uc.u & ~kMSB) ) // c == 0, defeat host FTZ behavior + if (isnan(a) || isnan(b) || isnan(c) || isinf(a) || isinf(b) || isinf(c) + || 0 == (ua.u & ~kMSB) || // a == 0, defeat host FTZ behavior + 0 == (ub.u & ~kMSB) || // b == 0, defeat host FTZ behavior + 0 == (uc.u & ~kMSB)) // c == 0, defeat host FTZ behavior { - if( isinf( c ) && !isinf(a) && !isinf(b) ) - return (c + a) + b; + if (isinf(c) && !isinf(a) && !isinf(b)) return (c + a) + b; - a = (double) reference_multiplyl( a, b ); // some risk that the compiler will insert a 
non-compliant fma here on some platforms. - return reference_addl(a, c); // We use STDC FP_CONTRACT OFF above to attempt to defeat that. + a = (double)reference_multiplyl( + a, b); // some risk that the compiler will insert a non-compliant + // fma here on some platforms. + return reference_addl( + a, + c); // We use STDC FP_CONTRACT OFF above to attempt to defeat that. } // extract exponent and mantissa // exponent is a standard unbiased signed integer // mantissa is a cl_uint, with leading non-zero bit positioned at the MSB cl_ulong mantA, mantB, mantC; - int expA = extract( a, &mantA ); - int expB = extract( b, &mantB ); - int expC = extract( c, &mantC ); - cl_ulong signC = uc.u & kMSB; // We'll need the sign bit of C later to decide if we are adding or subtracting + int expA = extract(a, &mantA); + int expB = extract(b, &mantB); + int expC = extract(c, &mantC); + cl_ulong signC = uc.u & kMSB; // We'll need the sign bit of C later to + // decide if we are adding or subtracting -// exact product of A and B + // exact product of A and B int exponent = expA + expB; cl_ulong sign = (ua.u ^ ub.u) & kMSB; cl_ulong hi, lo; - mul128( mantA, mantB, &hi, &lo ); + mul128(mantA, mantB, &hi, &lo); // renormalize - if( 0 == (kMSB & hi) ) + if (0 == (kMSB & hi)) { hi <<= 1; hi |= lo >> 63; lo <<= 1; } else - exponent++; // 2**63 * 2**63 gives 2**126. If the MSB was set, then our exponent increased. + exponent++; // 2**63 * 2**63 gives 2**126. If the MSB was set, then our + // exponent increased. 
-//infinite precision add + // infinite precision add cl_ulong chi = mantC; cl_ulong clo = 0; - if( exponent >= expC ) + if (exponent >= expC) { // Normalize C relative to the product - if( exponent > expC ) - shift_right_sticky_128( &chi, &clo, exponent - expC ); + if (exponent > expC) + shift_right_sticky_128(&chi, &clo, exponent - expC); // Add - if( sign ^ signC ) - sub128( &hi, &lo, chi, clo, &sign, &exponent ); + if (sign ^ signC) + sub128(&hi, &lo, chi, clo, &sign, &exponent); else - add128( &hi, &lo, chi, clo, &exponent ); + add128(&hi, &lo, chi, clo, &exponent); } else { // Shift the product relative to C so that their exponents match - shift_right_sticky_128( &hi, &lo, expC - exponent ); + shift_right_sticky_128(&hi, &lo, expC - exponent); // add - if( sign ^ signC ) - sub128( &chi, &clo, hi, lo, &signC, &expC ); + if (sign ^ signC) + sub128(&chi, &clo, hi, lo, &signC, &expC); else - add128( &chi, &clo, hi, lo, &expC ); + add128(&chi, &clo, hi, lo, &expC); hi = chi; lo = clo; @@ -2671,61 +3091,54 @@ long double reference_fmal( long double x, long double y, long double z) } - - -long double reference_madl( long double a, long double b, long double c) { return a * b + c; } - -//long double my_nextafterl(long double x, long double y){ return (long double) nextafter( (double) x, (double) y ); } - -long double reference_recipl( long double x){ return 1.0L / x; } - -long double reference_rootnl( long double x, int i) +long double reference_madl(long double a, long double b, long double c) { - double hi, lo; + return a * b + c; +} + +// long double my_nextafterl(long double x, long double y){ return (long +// double) nextafter( (double) x, (double) y ); } + +long double reference_recipl(long double x) { return 1.0L / x; } + +long double reference_rootnl(long double x, int i) +{ + double hi, lo; long double l; - //rootn ( x, 0 ) returns a NaN. - if( 0 == i ) - return cl_make_nan(); + // rootn ( x, 0 ) returns a NaN. 
+ if (0 == i) return cl_make_nan(); - //rootn ( x, n ) returns a NaN for x < 0 and n is even. - if( x < 0.0L && 0 == (i&1) ) - return cl_make_nan(); + // rootn ( x, n ) returns a NaN for x < 0 and n is even. + if (x < 0.0L && 0 == (i & 1)) return cl_make_nan(); - if( isinf(x) ) + if (isinf(x)) { - if( i < 0 ) - return reference_copysignl(0.0L, x); + if (i < 0) return reference_copysignl(0.0L, x); return x; } - if( x == 0.0 ) + if (x == 0.0) { - switch( i & 0x80000001 ) + switch (i & 0x80000001) { - //rootn ( +-0, n ) is +0 for even n > 0. - case 0: - return 0.0L; + // rootn ( +-0, n ) is +0 for even n > 0. + case 0: return 0.0L; - //rootn ( +-0, n ) is +-0 for odd n > 0. - case 1: - return x; + // rootn ( +-0, n ) is +-0 for odd n > 0. + case 1: return x; - //rootn ( +-0, n ) is +inf for even n < 0. - case 0x80000000: - return INFINITY; + // rootn ( +-0, n ) is +inf for even n < 0. + case 0x80000000: return INFINITY; - //rootn ( +-0, n ) is +-inf for odd n < 0. - case 0x80000001: - return copysign(INFINITY, x); + // rootn ( +-0, n ) is +-inf for odd n < 0. 
+ case 0x80000001: return copysign(INFINITY, x); } } - if( i == 1 ) - return x; + if (i == 1) return x; - if( i == -1 ) - return 1.0 / x; + if (i == -1) return 1.0 / x; long double sign = x; x = reference_fabsl(x); @@ -2733,167 +3146,174 @@ long double reference_rootnl( long double x, int i) DivideDD(&iHi, &iLo, 1.0, i); x = reference_powl(x, iHi) * reference_powl(x, iLo); - return reference_copysignl( x, sign ); - + return reference_copysignl(x, sign); } -long double reference_rsqrtl( long double x){ return 1.0L / sqrtl(x); } -//long double reference_sincosl( long double x, long double *c ){ *c = reference_cosl(x); return reference_sinl(x); } -long double reference_sinpil( long double x) +long double reference_rsqrtl(long double x) { return 1.0L / sqrtl(x); } +// long double reference_sincosl( long double x, long double *c ){ *c = +// reference_cosl(x); return reference_sinl(x); } +long double reference_sinpil(long double x) { double r = reduce1l(x); // reduce to [-0.5, 0.5] - if( r < -0.5L ) + if (r < -0.5L) r = -1.0L - r; - else if ( r > 0.5L ) + else if (r > 0.5L) r = 1.0L - r; // sinPi zeros have the same sign as x - if( r == 0.0L ) - return reference_copysignl(0.0L, x); + if (r == 0.0L) return reference_copysignl(0.0L, x); - return reference_sinl( r * M_PIL ); + return reference_sinl(r * M_PIL); } -long double reference_tanpil( long double x) +long double reference_tanpil(long double x) { // set aside the sign (allows us to preserve sign of -0) - long double sign = reference_copysignl( 1.0L, x); + long double sign = reference_copysignl(1.0L, x); long double z = reference_fabsl(x); // if big and even -- caution: only works if x only has single precision - if( z >= HEX_LDBL( +, 1, 0, +, 53 ) ) + if (z >= HEX_LDBL(+, 1, 0, +, 53)) { - if( z == INFINITY ) - return x - x; // nan + if (z == INFINITY) return x - x; // nan - return reference_copysignl( 0.0L, x); // tanpi ( n ) is copysign( 0.0, n) for even integers n. 
+ return reference_copysignl( + 0.0L, x); // tanpi ( n ) is copysign( 0.0, n) for even integers n. } // reduce to the range [ -0.5, 0.5 ] - long double nearest = reference_rintl( z ); // round to nearest even places n + 0.5 values in the right place for us - int64_t i = (int64_t) nearest; // test above against 0x1.0p53 avoids overflow here + long double nearest = + reference_rintl(z); // round to nearest even places n + 0.5 values in + // the right place for us + int64_t i = + (int64_t)nearest; // test above against 0x1.0p53 avoids overflow here z -= nearest; - //correction for odd integer x for the right sign of zero - if( (i&1) && z == 0.0L ) - sign = -sign; + // correction for odd integer x for the right sign of zero + if ((i & 1) && z == 0.0L) sign = -sign; // track changes to the sign - sign *= reference_copysignl(1.0L, z); // really should just be an xor - z = reference_fabsl(z); // remove the sign again + sign *= reference_copysignl(1.0L, z); // really should just be an xor + z = reference_fabsl(z); // remove the sign again // reduce once more - // If we don't do this, rounding error in z * M_PI will cause us not to return infinities properly - if( z > 0.25L ) + // If we don't do this, rounding error in z * M_PI will cause us not to + // return infinities properly + if (z > 0.25L) { z = 0.5L - z; - return sign / reference_tanl( z * M_PIL ); // use system tan to get the right result + return sign + / reference_tanl(z + * M_PIL); // use system tan to get the right result } // - return sign * reference_tanl( z * M_PIL ); // use system tan to get the right result + return sign + * reference_tanl(z * M_PIL); // use system tan to get the right result } -long double reference_pownl( long double x, int i ){ return reference_powl( x, (long double) i ); } - -long double reference_powrl( long double x, long double y ) +long double reference_pownl(long double x, int i) { - //powr ( x, y ) returns NaN for x < 0. 
- if( x < 0.0L ) - return cl_make_nan(); + return reference_powl(x, (long double)i); +} - //powr ( x, NaN ) returns the NaN for x >= 0. - //powr ( NaN, y ) returns the NaN. - if( isnan(x) || isnan(y) ) - return x + y; // Note: behavior different here than for pow(1,NaN), pow(NaN, 0) +long double reference_powrl(long double x, long double y) +{ + // powr ( x, y ) returns NaN for x < 0. + if (x < 0.0L) return cl_make_nan(); - if( x == 1.0L ) + // powr ( x, NaN ) returns the NaN for x >= 0. + // powr ( NaN, y ) returns the NaN. + if (isnan(x) || isnan(y)) + return x + y; // Note: behavior different here than for pow(1,NaN), + // pow(NaN, 0) + + if (x == 1.0L) { - //powr ( +1, +-inf ) returns NaN. - if( reference_fabsl(y) == INFINITY ) - return cl_make_nan(); + // powr ( +1, +-inf ) returns NaN. + if (reference_fabsl(y) == INFINITY) return cl_make_nan(); - //powr ( +1, y ) is 1 for finite y. (NaN handled above) + // powr ( +1, y ) is 1 for finite y. (NaN handled above) return 1.0L; } - if( y == 0.0L ) + if (y == 0.0L) { - //powr ( +inf, +-0 ) returns NaN. - //powr ( +-0, +-0 ) returns NaN. - if( x == 0.0L || x == INFINITY ) - return cl_make_nan(); + // powr ( +inf, +-0 ) returns NaN. + // powr ( +-0, +-0 ) returns NaN. + if (x == 0.0L || x == INFINITY) return cl_make_nan(); - //powr ( x, +-0 ) is 1 for finite x > 0. (x <= 0, NaN, INF already handled above) + // powr ( x, +-0 ) is 1 for finite x > 0. (x <= 0, NaN, INF already + // handled above) return 1.0L; } - if( x == 0.0L ) + if (x == 0.0L) { - //powr ( +-0, -inf) is +inf. - //powr ( +-0, y ) is +inf for finite y < 0. - if( y < 0.0L ) - return INFINITY; + // powr ( +-0, -inf) is +inf. + // powr ( +-0, y ) is +inf for finite y < 0. + if (y < 0.0L) return INFINITY; - //powr ( +-0, y ) is +0 for y > 0. (NaN, y==0 handled above) + // powr ( +-0, y ) is +0 for y > 0. 
(NaN, y==0 handled above) return 0.0L; } - return reference_powl( x, y ); + return reference_powl(x, y); } -//long double my_fdiml( long double x, long double y){ return fdim( (double) x, (double) y ); } -long double reference_addl( long double x, long double y) +// long double my_fdiml( long double x, long double y){ return fdim( (double) x, +// (double) y ); } +long double reference_addl(long double x, long double y) { - volatile double a = (double) x; - volatile double b = (double) y; + volatile double a = (double)x; + volatile double b = (double)y; -#if defined( __SSE2__ ) +#if defined(__SSE2__) // defeat x87 - __m128d va = _mm_set_sd( (double) a ); - __m128d vb = _mm_set_sd( (double) b ); - va = _mm_add_sd( va, vb ); - _mm_store_sd( (double*) &a, va ); + __m128d va = _mm_set_sd((double)a); + __m128d vb = _mm_set_sd((double)b); + va = _mm_add_sd(va, vb); + _mm_store_sd((double *)&a, va); #else a += b; #endif - return (long double) a; + return (long double)a; } -long double reference_subtractl( long double x, long double y) +long double reference_subtractl(long double x, long double y) { - volatile double a = (double) x; - volatile double b = (double) y; + volatile double a = (double)x; + volatile double b = (double)y; -#if defined( __SSE2__ ) +#if defined(__SSE2__) // defeat x87 - __m128d va = _mm_set_sd( (double) a ); - __m128d vb = _mm_set_sd( (double) b ); - va = _mm_sub_sd( va, vb ); - _mm_store_sd( (double*) &a, va ); + __m128d va = _mm_set_sd((double)a); + __m128d vb = _mm_set_sd((double)b); + va = _mm_sub_sd(va, vb); + _mm_store_sd((double *)&a, va); #else a -= b; #endif - return (long double) a; + return (long double)a; } -long double reference_multiplyl( long double x, long double y) +long double reference_multiplyl(long double x, long double y) { - volatile double a = (double) x; - volatile double b = (double) y; + volatile double a = (double)x; + volatile double b = (double)y; -#if defined( __SSE2__ ) +#if defined(__SSE2__) // defeat x87 - __m128d va 
= _mm_set_sd( (double) a ); - __m128d vb = _mm_set_sd( (double) b ); - va = _mm_mul_sd( va, vb ); - _mm_store_sd( (double*) &a, va ); + __m128d va = _mm_set_sd((double)a); + __m128d vb = _mm_set_sd((double)b); + va = _mm_mul_sd(va, vb); + _mm_store_sd((double *)&a, va); #else a *= b; #endif - return (long double) a; + return (long double)a; } /*long double my_remquol( long double x, long double y, int *iptr ) @@ -2908,22 +3328,22 @@ long double reference_multiplyl( long double x, long double y) return remquo( (double) x, (double) y, iptr ); }*/ -long double reference_lgamma_rl( long double x, int *signp ) +long double reference_lgamma_rl(long double x, int *signp) { -// long double lgamma_val = (long double)reference_lgamma( (double)x ); -// *signp = signgam; + // long double lgamma_val = (long double)reference_lgamma( (double)x ); + // *signp = signgam; *signp = 0; return x; } -int reference_isequall( long double x, long double y){ return x == y; } -int reference_isfinitel( long double x){ return 0 != isfinite(x); } -int reference_isgreaterl( long double x, long double y){ return x > y; } -int reference_isgreaterequall( long double x, long double y){ return x >= y; } -int reference_isinfl( long double x){ return 0 != isinf(x); } -int reference_islessl( long double x, long double y){ return x < y; } -int reference_islessequall( long double x, long double y){ return x <= y; } +int reference_isequall(long double x, long double y) { return x == y; } +int reference_isfinitel(long double x) { return 0 != isfinite(x); } +int reference_isgreaterl(long double x, long double y) { return x > y; } +int reference_isgreaterequall(long double x, long double y) { return x >= y; } +int reference_isinfl(long double x) { return 0 != isinf(x); } +int reference_islessl(long double x, long double y) { return x < y; } +int reference_islessequall(long double x, long double y) { return x <= y; } #if defined(__INTEL_COMPILER) int reference_islessgreaterl(long double x, long double y) { @@ 
-2935,69 +3355,77 @@ int reference_islessgreaterl(long double x, long double y) return 0 != islessgreater(x, y); } #endif -int reference_isnanl( long double x){ return 0 != isnan( x ); } -int reference_isnormall( long double x){ return 0 != isnormal( (double) x ); } -int reference_isnotequall( long double x, long double y){ return x != y; } -int reference_isorderedl( long double x, long double y){ return x == x && y == y; } -int reference_isunorderedl( long double x, long double y){ return isnan(x) || isnan( y ); } -#if defined( __INTEL_COMPILER ) -int reference_signbitl( long double x){ return 0 != signbitl( x ); } +int reference_isnanl(long double x) { return 0 != isnan(x); } +int reference_isnormall(long double x) { return 0 != isnormal((double)x); } +int reference_isnotequall(long double x, long double y) { return x != y; } +int reference_isorderedl(long double x, long double y) +{ + return x == x && y == y; +} +int reference_isunorderedl(long double x, long double y) +{ + return isnan(x) || isnan(y); +} +#if defined(__INTEL_COMPILER) +int reference_signbitl(long double x) { return 0 != signbitl(x); } #else -int reference_signbitl( long double x){ return 0 != signbit( x ); } +int reference_signbitl(long double x) { return 0 != signbit(x); } #endif -long double reference_copysignl( long double x, long double y); -long double reference_roundl( long double x ); +long double reference_copysignl(long double x, long double y); +long double reference_roundl(long double x); long double reference_cbrtl(long double x); -long double reference_copysignl( long double x, long double y ) +long double reference_copysignl(long double x, long double y) { - // We hope that the long double to double conversion proceeds with sign fidelity, - // even for zeros and NaNs - union{ double d; cl_ulong u;}u; u.d = (double) y; + // We hope that the long double to double conversion proceeds with sign + // fidelity, even for zeros and NaNs + union { + double d; + cl_ulong u; + } u; + u.d = 
(double)y; x = reference_fabsl(x); - if( u.u >> 63 ) - x = -x; + if (u.u >> 63) x = -x; return x; } -long double reference_roundl( long double x ) +long double reference_roundl(long double x) { // Since we are just using this to verify double precision, we can // use the double precision copysign here #if defined(__MINGW32__) && defined(__x86_64__) long double absx = reference_fabsl(x); - if (absx < 0.5L) - return reference_copysignl(0.0L, x); + if (absx < 0.5L) return reference_copysignl(0.0L, x); #endif - return round( (double) x ); + return round((double)x); } -long double reference_truncl( long double x ) +long double reference_truncl(long double x) { // Since we are just using this to verify double precision, we can // use the double precision copysign here - return trunc( (double) x ); + return trunc((double)x); } static long double reference_scalblnl(long double x, long n); long double reference_cbrtl(long double x) { - double yhi = HEX_DBL( +, 1, 5555555555555, -, 2 ); - double ylo = HEX_DBL( +, 1, 558, -, 56 ); + double yhi = HEX_DBL(+, 1, 5555555555555, -, 2); + double ylo = HEX_DBL(+, 1, 558, -, 56); - double fabsx = reference_fabs( x ); + double fabsx = reference_fabs(x); - if( isnan(x) || fabsx == 1.0 || fabsx == 0.0 || isinf(x) ) - return x; + if (isnan(x) || fabsx == 1.0 || fabsx == 0.0 || isinf(x)) return x; double iy = 0.0; double log2x_hi, log2x_lo; - // extended precision log .... accurate to at least 64-bits + couple of guard bits + // extended precision log .... accurate to at least 64-bits + couple of + // guard bits __log2_ep(&log2x_hi, &log2x_lo, fabsx); double ylog2x_hi, ylog2x_lo; @@ -3009,20 +3437,24 @@ long double reference_cbrtl(long double x) MulDD(&ylog2x_hi, &ylog2x_lo, log2x_hi, log2x_lo, y_hi, y_lo); long double powxy; - if(isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200)) { - powxy = reference_signbit(ylog2x_hi) ? 
HEX_DBL( +, 0, 0, +, 0 ) : INFINITY; - } else { + if (isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200)) + { + powxy = + reference_signbit(ylog2x_hi) ? HEX_DBL(+, 0, 0, +, 0) : INFINITY; + } + else + { // separate integer + fractional part long int m = lrint(ylog2x_hi); AddDD(&ylog2x_hi, &ylog2x_lo, ylog2x_hi, ylog2x_lo, -m, 0.0); // revert to long double arithemtic - long double ylog2x = (long double) ylog2x_hi + (long double) ylog2x_lo; - powxy = reference_exp2l( ylog2x ); + long double ylog2x = (long double)ylog2x_hi + (long double)ylog2x_lo; + powxy = reference_exp2l(ylog2x); powxy = reference_scalblnl(powxy, m); } - return reference_copysignl( powxy, x ); + return reference_copysignl(powxy, x); } /* @@ -3064,24 +3496,24 @@ long double scalbnl( long double x, int i ) } */ -long double reference_rintl( long double x ) +long double reference_rintl(long double x) { #if defined(__PPC__) - // On PPC, long doubles are maintained as 2 doubles. Therefore, the combined - // mantissa can represent more than LDBL_MANT_DIG binary digits. - x = rintl(x); + // On PPC, long doubles are maintained as 2 doubles. Therefore, the combined + // mantissa can represent more than LDBL_MANT_DIG binary digits. 
+ x = rintl(x); #else - static long double magic[2] = { 0.0L, 0.0L}; + static long double magic[2] = { 0.0L, 0.0L }; - if( 0.0L == magic[0] ) + if (0.0L == magic[0]) { magic[0] = scalbnl(0.5L, LDBL_MANT_DIG); magic[1] = scalbnl(-0.5L, LDBL_MANT_DIG); } - if( reference_fabsl(x) < magic[0] && x != 0.0L ) + if (reference_fabsl(x) < magic[0] && x != 0.0L) { - long double m = magic[ x < 0 ]; + long double m = magic[x < 0]; x += m; x -= m; } @@ -3094,7 +3526,7 @@ long double reference_rintl( long double x ) static void __sqrt_ep(double *rhi, double *rlo, double xhi, double xlo) { // approximate reciprocal sqrt - double thi = 1.0 / sqrt( xhi ); + double thi = 1.0 / sqrt(xhi); double tlo = 0.0; // One newton iteration in double-double @@ -3108,34 +3540,31 @@ static void __sqrt_ep(double *rhi, double *rlo, double xhi, double xlo) MulDD(rhi, rlo, yhi, ylo, xhi, xlo); } -long double reference_acoshl( long double x ) +long double reference_acoshl(long double x) { -/* - * ==================================================== - * This function derived from fdlibm http://www.netlib.org - * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. - * - * Developed at SunSoft, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. - * ==================================================== - * - */ - if( isnan(x) || isinf(x)) - return x + fabsl(x); + /* + * ==================================================== + * This function derived from fdlibm http://www.netlib.org + * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunSoft, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. 
+ * ==================================================== + * + */ + if (isnan(x) || isinf(x)) return x + fabsl(x); - if( x < 1.0L ) - return cl_make_nan(); + if (x < 1.0L) return cl_make_nan(); - if( x == 1.0L ) - return 0.0L; + if (x == 1.0L) return 0.0L; - if( x > HEX_LDBL( +, 1, 0, +, 60 ) ) + if (x > HEX_LDBL(+, 1, 0, +, 60)) return reference_logl(x) + 0.693147180559945309417232121458176568L; - if( x > 2.0L ) - return reference_logl(2.0L * x - 1.0L / (x + sqrtl(x*x - 1.0L))); + if (x > 2.0L) + return reference_logl(2.0L * x - 1.0L / (x + sqrtl(x * x - 1.0L))); double hi, lo; MulD(&hi, &lo, x, x); @@ -3144,286 +3573,301 @@ long double reference_acoshl( long double x ) AddDD(&hi, &lo, hi, lo, x, 0.0); double correction = lo / hi; __log2_ep(&hi, &lo, hi); - double log2Hi = HEX_DBL( +, 1, 62e42fefa39ef, -, 1 ); - double log2Lo = HEX_DBL( +, 1, abc9e3b39803f, -, 56 ); + double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1); + double log2Lo = HEX_DBL(+, 1, abc9e3b39803f, -, 56); MulDD(&hi, &lo, hi, lo, log2Hi, log2Lo); AddDD(&hi, &lo, hi, lo, correction, 0.0); return hi + lo; } -long double reference_asinhl( long double x ) +long double reference_asinhl(long double x) { long double cutoff = 0.0L; - const long double ln2 = HEX_LDBL( +, b, 17217f7d1cf79ab, -, 4 ); + const long double ln2 = HEX_LDBL(+, b, 17217f7d1cf79ab, -, 4); - if( cutoff == 0.0L ) - cutoff = reference_ldexpl(1.0L, -LDBL_MANT_DIG); + if (cutoff == 0.0L) cutoff = reference_ldexpl(1.0L, -LDBL_MANT_DIG); - if( isnan(x) || isinf(x) ) - return x + x; + if (isnan(x) || isinf(x)) return x + x; long double absx = reference_fabsl(x); - if( absx < cutoff ) - return x; + if (absx < cutoff) return x; long double sign = reference_copysignl(1.0L, x); - if( absx <= 4.0/3.0 ) { - return sign * reference_log1pl( absx + x*x / (1.0 + sqrtl(1.0 + x*x))); + if (absx <= 4.0 / 3.0) + { + return sign + * reference_log1pl(absx + x * x / (1.0 + sqrtl(1.0 + x * x))); } - else if( absx <= HEX_LDBL( +, 1, 0, +, 27 ) ) { - return 
sign * reference_logl( 2.0L * absx + 1.0L / (sqrtl( x * x + 1.0 ) + absx)); + else if (absx <= HEX_LDBL(+, 1, 0, +, 27)) + { + return sign + * reference_logl(2.0L * absx + 1.0L / (sqrtl(x * x + 1.0) + absx)); } - else { - return sign * ( reference_logl( absx ) + ln2 ); + else + { + return sign * (reference_logl(absx) + ln2); } } -long double reference_atanhl( long double x ) +long double reference_atanhl(long double x) { -/* - * ==================================================== - * This function is from fdlibm: http://www.netlib.org - * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. - * - * Developed at SunSoft, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. - * ==================================================== - */ - if( isnan(x) ) - return x + x; + /* + * ==================================================== + * This function is from fdlibm: http://www.netlib.org + * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunSoft, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. 
+ * ==================================================== + */ + if (isnan(x)) return x + x; - long double signed_half = reference_copysignl( 0.5L, x ); + long double signed_half = reference_copysignl(0.5L, x); x = reference_fabsl(x); - if( x > 1.0L ) - return cl_make_nan(); + if (x > 1.0L) return cl_make_nan(); - if( x < 0.5L ) - return signed_half * reference_log1pl( 2.0L * ( x + x*x / (1-x) ) ); + if (x < 0.5L) + return signed_half * reference_log1pl(2.0L * (x + x * x / (1 - x))); - return signed_half * reference_log1pl(2.0L * x / (1-x)); + return signed_half * reference_log1pl(2.0L * x / (1 - x)); } -long double reference_exp2l( long double z) +long double reference_exp2l(long double z) { double_double x; int j; // Handle NaNs - if( isnan(z) ) - return z; + if (isnan(z)) return z; // init x x.hi = z; x.lo = z - x.hi; - //Deal with overflow and underflow for exp2(x) stage next - if( x.hi >= 1025 ) - return INFINITY; + // Deal with overflow and underflow for exp2(x) stage next + if (x.hi >= 1025) return INFINITY; - if( x.hi < -1075-24 ) - return +0.0; + if (x.hi < -1075 - 24) return +0.0; // find nearest integer to x - int i = (int) rint(x.hi); + int i = (int)rint(x.hi); // x now holds fractional part. The result would be then 2**i * exp2( x ) x.hi -= i; - // We could attempt to find a minimax polynomial for exp2(x) over the range x = [-0.5, 0.5]. - // However, this would converge very slowly near the extrema, where 0.5**n is not a lot different - // from 0.5**(n+1), thereby requiring something like a 20th order polynomial to get 53 + 24 bits - // of precision. Instead we further reduce the range to [-1/32, 1/32] by observing that + // We could attempt to find a minimax polynomial for exp2(x) over the range + // x = [-0.5, 0.5]. However, this would converge very slowly near the + // extrema, where 0.5**n is not a lot different from 0.5**(n+1), thereby + // requiring something like a 20th order polynomial to get 53 + 24 bits of + // precision. 
Instead we further reduce the range to [-1/32, 1/32] by + // observing that // // 2**(a+b) = 2**a * 2**b // - // We can thus build a table of 2**a values for a = n/16, n = [-8, 8], and reduce the range - // of x to [-1/32, 1/32] by subtracting away the nearest value of n/16 from x. - const double_double corrections[17] = - { - { HEX_DBL( +, 1, 6a09e667f3bcd, -, 1 ), HEX_DBL( -, 1, bdd3413b26456, -, 55 ) }, - { HEX_DBL( +, 1, 7a11473eb0187, -, 1 ), HEX_DBL( -, 1, 41577ee04992f, -, 56 ) }, - { HEX_DBL( +, 1, 8ace5422aa0db, -, 1 ), HEX_DBL( +, 1, 6e9f156864b27, -, 55 ) }, - { HEX_DBL( +, 1, 9c49182a3f09, -, 1 ), HEX_DBL( +, 1, c7c46b071f2be, -, 57 ) }, - { HEX_DBL( +, 1, ae89f995ad3ad, -, 1 ), HEX_DBL( +, 1, 7a1cd345dcc81, -, 55 ) }, - { HEX_DBL( +, 1, c199bdd85529c, -, 1 ), HEX_DBL( +, 1, 11065895048dd, -, 56 ) }, - { HEX_DBL( +, 1, d5818dcfba487, -, 1 ), HEX_DBL( +, 1, 2ed02d75b3707, -, 56 ) }, - { HEX_DBL( +, 1, ea4afa2a490da, -, 1 ), HEX_DBL( -, 1, e9c23179c2893, -, 55 ) }, - { HEX_DBL( +, 1, 0, +, 0 ), HEX_DBL( +, 0, 0, +, 0 ) }, - { HEX_DBL( +, 1, 0b5586cf9890f, +, 0 ), HEX_DBL( +, 1, 8a62e4adc610b, -, 54 ) }, - { HEX_DBL( +, 1, 172b83c7d517b, +, 0 ), HEX_DBL( -, 1, 19041b9d78a76, -, 55 ) }, - { HEX_DBL( +, 1, 2387a6e756238, +, 0 ), HEX_DBL( +, 1, 9b07eb6c70573, -, 54 ) }, - { HEX_DBL( +, 1, 306fe0a31b715, +, 0 ), HEX_DBL( +, 1, 6f46ad23182e4, -, 55 ) }, - { HEX_DBL( +, 1, 3dea64c123422, +, 0 ), HEX_DBL( +, 1, ada0911f09ebc, -, 55 ) }, - { HEX_DBL( +, 1, 4bfdad5362a27, +, 0 ), HEX_DBL( +, 1, d4397afec42e2, -, 56 ) }, - { HEX_DBL( +, 1, 5ab07dd485429, +, 0 ), HEX_DBL( +, 1, 6324c054647ad, -, 54 ) }, - { HEX_DBL( +, 1, 6a09e667f3bcd, +, 0 ), HEX_DBL( -, 1, bdd3413b26456, -, 54 ) } + // We can thus build a table of 2**a values for a = n/16, n = [-8, 8], and + // reduce the range of x to [-1/32, 1/32] by subtracting away the nearest + // value of n/16 from x. 
+ const double_double corrections[17] = { + { HEX_DBL(+, 1, 6a09e667f3bcd, -, 1), + HEX_DBL(-, 1, bdd3413b26456, -, 55) }, + { HEX_DBL(+, 1, 7a11473eb0187, -, 1), + HEX_DBL(-, 1, 41577ee04992f, -, 56) }, + { HEX_DBL(+, 1, 8ace5422aa0db, -, 1), + HEX_DBL(+, 1, 6e9f156864b27, -, 55) }, + { HEX_DBL(+, 1, 9c49182a3f09, -, 1), + HEX_DBL(+, 1, c7c46b071f2be, -, 57) }, + { HEX_DBL(+, 1, ae89f995ad3ad, -, 1), + HEX_DBL(+, 1, 7a1cd345dcc81, -, 55) }, + { HEX_DBL(+, 1, c199bdd85529c, -, 1), + HEX_DBL(+, 1, 11065895048dd, -, 56) }, + { HEX_DBL(+, 1, d5818dcfba487, -, 1), + HEX_DBL(+, 1, 2ed02d75b3707, -, 56) }, + { HEX_DBL(+, 1, ea4afa2a490da, -, 1), + HEX_DBL(-, 1, e9c23179c2893, -, 55) }, + { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) }, + { HEX_DBL(+, 1, 0b5586cf9890f, +, 0), + HEX_DBL(+, 1, 8a62e4adc610b, -, 54) }, + { HEX_DBL(+, 1, 172b83c7d517b, +, 0), + HEX_DBL(-, 1, 19041b9d78a76, -, 55) }, + { HEX_DBL(+, 1, 2387a6e756238, +, 0), + HEX_DBL(+, 1, 9b07eb6c70573, -, 54) }, + { HEX_DBL(+, 1, 306fe0a31b715, +, 0), + HEX_DBL(+, 1, 6f46ad23182e4, -, 55) }, + { HEX_DBL(+, 1, 3dea64c123422, +, 0), + HEX_DBL(+, 1, ada0911f09ebc, -, 55) }, + { HEX_DBL(+, 1, 4bfdad5362a27, +, 0), + HEX_DBL(+, 1, d4397afec42e2, -, 56) }, + { HEX_DBL(+, 1, 5ab07dd485429, +, 0), + HEX_DBL(+, 1, 6324c054647ad, -, 54) }, + { HEX_DBL(+, 1, 6a09e667f3bcd, +, 0), + HEX_DBL(-, 1, bdd3413b26456, -, 54) } }; - int index = (int) rint( x.hi * 16.0 ); - x.hi -= (double) index * 0.0625; + int index = (int)rint(x.hi * 16.0); + x.hi -= (double)index * 0.0625; // canonicalize x double temp = x.hi; x.hi += x.lo; x.lo -= x.hi - temp; - // Minimax polynomial for (exp2(x)-1)/x, over the range [-1/32, 1/32]. 
Max Error: 2 * 0x1.e112p-87 - const double_double c[] = { - {HEX_DBL( +, 1, 62e42fefa39ef, -, 1 ), HEX_DBL( +, 1, abc9e3ac1d244, -, 56 )}, - {HEX_DBL( +, 1, ebfbdff82c58f, -, 3 ), HEX_DBL( -, 1, 5e4987a631846, -, 57 )}, - {HEX_DBL( +, 1, c6b08d704a0c, -, 5 ), HEX_DBL( -, 1, d323200a05713, -, 59 )}, - {HEX_DBL( +, 1, 3b2ab6fba4e7a, -, 7 ), HEX_DBL( +, 1, c5ee8f8b9f0c1, -, 63 )}, - {HEX_DBL( +, 1, 5d87fe78a672a, -, 10 ), HEX_DBL( +, 1, 884e5e5cc7ecc, -, 64 )}, - {HEX_DBL( +, 1, 430912f7e8373, -, 13 ), HEX_DBL( +, 1, 4f1b59514a326, -, 67 )}, - {HEX_DBL( +, 1, ffcbfc5985e71, -, 17 ), HEX_DBL( -, 1, db7d6a0953b78, -, 71 )}, - {HEX_DBL( +, 1, 62c150eb16465, -, 20 ), HEX_DBL( +, 1, e0767c2d7abf5, -, 80 )}, - {HEX_DBL( +, 1, b52502b5e953, -, 24 ), HEX_DBL( +, 1, 6797523f944bc, -, 78 )} - }; - size_t count = sizeof( c ) / sizeof( c[0] ); + // Minimax polynomial for (exp2(x)-1)/x, over the range [-1/32, 1/32]. Max + // Error: 2 * 0x1.e112p-87 + const double_double c[] = { { HEX_DBL(+, 1, 62e42fefa39ef, -, 1), + HEX_DBL(+, 1, abc9e3ac1d244, -, 56) }, + { HEX_DBL(+, 1, ebfbdff82c58f, -, 3), + HEX_DBL(-, 1, 5e4987a631846, -, 57) }, + { HEX_DBL(+, 1, c6b08d704a0c, -, 5), + HEX_DBL(-, 1, d323200a05713, -, 59) }, + { HEX_DBL(+, 1, 3b2ab6fba4e7a, -, 7), + HEX_DBL(+, 1, c5ee8f8b9f0c1, -, 63) }, + { HEX_DBL(+, 1, 5d87fe78a672a, -, 10), + HEX_DBL(+, 1, 884e5e5cc7ecc, -, 64) }, + { HEX_DBL(+, 1, 430912f7e8373, -, 13), + HEX_DBL(+, 1, 4f1b59514a326, -, 67) }, + { HEX_DBL(+, 1, ffcbfc5985e71, -, 17), + HEX_DBL(-, 1, db7d6a0953b78, -, 71) }, + { HEX_DBL(+, 1, 62c150eb16465, -, 20), + HEX_DBL(+, 1, e0767c2d7abf5, -, 80) }, + { HEX_DBL(+, 1, b52502b5e953, -, 24), + HEX_DBL(+, 1, 6797523f944bc, -, 78) } }; + size_t count = sizeof(c) / sizeof(c[0]); // Do polynomial - double_double r = c[count-1]; - for( j = (int) count-2; j >= 0; j-- ) - r = add_dd( c[j], mul_dd( r, x ) ); + double_double r = c[count - 1]; + for (j = (int)count - 2; j >= 0; j--) r = add_dd(c[j], mul_dd(r, x)); // unwind 
approximation - r = mul_dd( r, x ); // before: r =(exp2(x)-1)/x; after: r = exp2(x) - 1 + r = mul_dd(r, x); // before: r =(exp2(x)-1)/x; after: r = exp2(x) - 1 // correct for [-0.5, 0.5] -> [-1/32, 1/32] reduction above // exp2(x) = (r + 1) * correction = r * correction + correction - r = mul_dd( r, corrections[index+8] ); - r = add_dd( r, corrections[index+8] ); + r = mul_dd(r, corrections[index + 8]); + r = add_dd(r, corrections[index + 8]); -// Format result for output: + // Format result for output: // Get mantissa - long double m = ((long double) r.hi + (long double) r.lo ); + long double m = ((long double)r.hi + (long double)r.lo); // Handle a pesky overflow cases when long double = double - if( i > 512 ) + if (i > 512) { - m *= HEX_DBL( +, 1, 0, +, 512 ); + m *= HEX_DBL(+, 1, 0, +, 512); i -= 512; } - else if( i < -512 ) + else if (i < -512) { - m *= HEX_DBL( +, 1, 0, -, 512 ); + m *= HEX_DBL(+, 1, 0, -, 512); i += 512; } - return m * ldexpl( 1.0L, i ); + return m * ldexpl(1.0L, i); } -long double reference_expm1l( long double x) +long double reference_expm1l(long double x) { -#if defined( _MSC_VER ) && ! defined( __INTEL_COMPILER ) - //unimplemented +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) + // unimplemented return x; #else - union { double f; cl_ulong u;} u; - u.f = (double) x; + union { + double f; + cl_ulong u; + } u; + u.f = (double)x; - if (reference_isnanl(x)) - return x; + if (reference_isnanl(x)) return x; - if ( x > 710 ) - return INFINITY; + if (x > 710) return INFINITY; long double y = expm1l(x); // Range of expm1l is -1.0L to +inf. Negative inf // on a few Linux platforms is clearly the wrong sign. - if (reference_isinfl(y)) - y = INFINITY; + if (reference_isinfl(y)) y = INFINITY; return y; #endif } -long double reference_fmaxl( long double x, long double y ) +long double reference_fmaxl(long double x, long double y) { - if( isnan(y) ) - return x; + if (isnan(y)) return x; return x >= y ? 
x : y; } -long double reference_fminl( long double x, long double y ) +long double reference_fminl(long double x, long double y) { - if( isnan(y) ) - return x; + if (isnan(y)) return x; return x <= y ? x : y; } -long double reference_hypotl( long double x, long double y ) +long double reference_hypotl(long double x, long double y) { - static const double tobig = HEX_DBL( +, 1, 0, +, 511 ); - static const double big = HEX_DBL( +, 1, 0, +, 513 ); - static const double rbig = HEX_DBL( +, 1, 0, -, 513 ); - static const double tosmall = HEX_DBL( +, 1, 0, -, 511 ); - static const double smalll = HEX_DBL( +, 1, 0, -, 607 ); - static const double rsmall = HEX_DBL( +, 1, 0, +, 607 ); + static const double tobig = HEX_DBL(+, 1, 0, +, 511); + static const double big = HEX_DBL(+, 1, 0, +, 513); + static const double rbig = HEX_DBL(+, 1, 0, -, 513); + static const double tosmall = HEX_DBL(+, 1, 0, -, 511); + static const double smalll = HEX_DBL(+, 1, 0, -, 607); + static const double rsmall = HEX_DBL(+, 1, 0, +, 607); long double max, min; - if( isinf(x) || isinf(y) ) - return INFINITY; + if (isinf(x) || isinf(y)) return INFINITY; - if( isnan(x) || isnan(y) ) - return x + y; + if (isnan(x) || isnan(y)) return x + y; x = reference_fabsl(x); y = reference_fabsl(y); - max = reference_fmaxl( x, y ); - min = reference_fminl( x, y ); + max = reference_fmaxl(x, y); + min = reference_fminl(x, y); - if( max > tobig ) + if (max > tobig) { max *= rbig; min *= rbig; - return big * sqrtl( max * max + min * min ); + return big * sqrtl(max * max + min * min); } - if( max < tosmall ) + if (max < tosmall) { max *= rsmall; min *= rsmall; - return smalll * sqrtl( max * max + min * min ); + return smalll * sqrtl(max * max + min * min); } - return sqrtl( x * x + y * y ); + return sqrtl(x * x + y * y); } -//long double reference_log2l( long double x ) +// long double reference_log2l( long double x ) //{ // return log( x ) * 1.44269504088896340735992468100189214L; //} -long double reference_log2l( 
long double x ) +long double reference_log2l(long double x) { - if( isnan(x) || x < 0.0 || x == -INFINITY) - return NAN; + if (isnan(x) || x < 0.0 || x == -INFINITY) return NAN; - if( x == 0.0f) - return -INFINITY; + if (x == 0.0f) return -INFINITY; - if( x == INFINITY ) - return INFINITY; + if (x == INFINITY) return INFINITY; double hi, lo; - __log2_ep( &hi, &lo, x); + __log2_ep(&hi, &lo, x); - return (long double) hi + (long double) lo; + return (long double)hi + (long double)lo; } -long double reference_log1pl( long double x) +long double reference_log1pl(long double x) { -#if defined( _MSC_VER ) && ! defined( __INTEL_COMPILER ) - //unimplemented +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) + // unimplemented return x; #elif defined(__PPC__) // log1pl on PPC inadvertantly returns NaN for very large values. Work @@ -3434,23 +3878,24 @@ long double reference_log1pl( long double x) #endif } -long double reference_logbl( long double x ) +long double reference_logbl(long double x) { // Since we are just using this to verify double precision, we can // use the double precision copysign here - union { double f; cl_ulong u;} u; - u.f = (double) x; + union { + double f; + cl_ulong u; + } u; + u.f = (double)x; cl_int exponent = (cl_uint)(u.u >> 52) & 0x7ff; - if( exponent == 0x7ff ) - return x * x; + if (exponent == 0x7ff) return x * x; - if( exponent == 0 ) - { // deal with denormals - u.f = x * HEX_DBL( +, 1, 0, +, 64 ); + if (exponent == 0) + { // deal with denormals + u.f = x * HEX_DBL(+, 1, 0, +, 64); exponent = (cl_int)(u.u >> 52) & 0x7ff; - if( exponent == 0 ) - return -INFINITY; + if (exponent == 0) return -INFINITY; return exponent - (1023 + 64); } @@ -3458,84 +3903,84 @@ long double reference_logbl( long double x ) return exponent - 1023; } -long double reference_maxmagl( long double x, long double y ) +long double reference_maxmagl(long double x, long double y) { long double fabsx = fabsl(x); long double fabsy = fabsl(y); - if( fabsx < fabsy ) - return 
y; + if (fabsx < fabsy) return y; - if( fabsy < fabsx ) - return x; + if (fabsy < fabsx) return x; return reference_fmaxl(x, y); } -long double reference_minmagl( long double x, long double y ) +long double reference_minmagl(long double x, long double y) { long double fabsx = fabsl(x); long double fabsy = fabsl(y); - if( fabsx > fabsy ) - return y; + if (fabsx > fabsy) return y; - if( fabsy > fabsx ) - return x; + if (fabsy > fabsx) return x; return reference_fminl(x, y); } -long double reference_nanl( cl_ulong x ) +long double reference_nanl(cl_ulong x) { - union{ cl_ulong u; cl_double f; }u; + union { + cl_ulong u; + cl_double f; + } u; u.u = x | 0x7ff8000000000000ULL; - return (long double) u.f; + return (long double)u.f; } -long double reference_reciprocall( long double x ) -{ - return 1.0L / x; -} +long double reference_reciprocall(long double x) { return 1.0L / x; } -long double reference_remainderl( long double x, long double y ); -long double reference_remainderl( long double x, long double y ) +long double reference_remainderl(long double x, long double y); +long double reference_remainderl(long double x, long double y) { int i; - return reference_remquol( x, y, &i ); + return reference_remquol(x, y, &i); } -long double reference_lgammal( long double x); -long double reference_lgammal( long double x) +long double reference_lgammal(long double x); +long double reference_lgammal(long double x) { // lgamma is currently not tested - return reference_lgamma( x ); + return reference_lgamma(x); } -static uint32_t two_over_pi[] = { 0x0, 0x28be60db, 0x24e44152, 0x27f09d5f, 0x11f534dd, 0x3036d8a5, 0x1993c439, 0x107f945, 0x23abdebb, 0x31586dc9, -0x6e3a424, 0x374b8019, 0x92eea09, 0x3464873f, 0x21deb1cb, 0x4a69cfb, 0x288235f5, 0xbaed121, 0xe99c702, 0x1ad17df9, -0x13991d6, 0xe60d4ce, 0x1f49c845, 0x3e2ef7e4, 0x283b1ff8, 0x25fff781, 0x1980fef2, 0x3c462d68, 0xa6d1f6d, 0xd9fb3c9, -0x3cb09b74, 0x3d18fd9a, 0x1e5fea2d, 0x1d49eeb1, 0x3ebe5f17, 0x2cf41ce7, 0x378a5292, 
0x3a9afed7, 0x3b11f8d5, 0x3421580c, -0x3046fc7b, 0x1aeafc33, 0x3bc209af, 0x10d876a7, 0x2391615e, 0x3986c219, 0x199855f1, 0x1281a102, 0xdffd880, 0x135cc9cc, -0x10606155 +static uint32_t two_over_pi[] = { + 0x0, 0x28be60db, 0x24e44152, 0x27f09d5f, 0x11f534dd, 0x3036d8a5, + 0x1993c439, 0x107f945, 0x23abdebb, 0x31586dc9, 0x6e3a424, 0x374b8019, + 0x92eea09, 0x3464873f, 0x21deb1cb, 0x4a69cfb, 0x288235f5, 0xbaed121, + 0xe99c702, 0x1ad17df9, 0x13991d6, 0xe60d4ce, 0x1f49c845, 0x3e2ef7e4, + 0x283b1ff8, 0x25fff781, 0x1980fef2, 0x3c462d68, 0xa6d1f6d, 0xd9fb3c9, + 0x3cb09b74, 0x3d18fd9a, 0x1e5fea2d, 0x1d49eeb1, 0x3ebe5f17, 0x2cf41ce7, + 0x378a5292, 0x3a9afed7, 0x3b11f8d5, 0x3421580c, 0x3046fc7b, 0x1aeafc33, + 0x3bc209af, 0x10d876a7, 0x2391615e, 0x3986c219, 0x199855f1, 0x1281a102, + 0xdffd880, 0x135cc9cc, 0x10606155 }; -static uint32_t pi_over_two[] = { 0x1, 0x2487ed51, 0x42d1846, 0x26263314, 0x1701b839, 0x28948127 }; +static uint32_t pi_over_two[] = { 0x1, 0x2487ed51, 0x42d1846, + 0x26263314, 0x1701b839, 0x28948127 }; -typedef union - { - uint64_t u; - double d; - }d_ui64_t; +typedef union { + uint64_t u; + double d; +} d_ui64_t; // radix or base of representation #define RADIX (30) #define DIGITS 6 -d_ui64_t two_pow_pradix = { (uint64_t) (1023 + RADIX) << 52 }; -d_ui64_t two_pow_mradix = { (uint64_t) (1023 - RADIX) << 52 }; -d_ui64_t two_pow_two_mradix = { (uint64_t) (1023-2*RADIX) << 52 }; +d_ui64_t two_pow_pradix = { (uint64_t)(1023 + RADIX) << 52 }; +d_ui64_t two_pow_mradix = { (uint64_t)(1023 - RADIX) << 52 }; +d_ui64_t two_pow_two_mradix = { (uint64_t)(1023 - 2 * RADIX) << 52 }; #define tp_pradix two_pow_pradix.d #define tp_mradix two_pow_mradix.d @@ -3544,11 +3989,12 @@ d_ui64_t two_pow_two_mradix = { (uint64_t) (1023-2*RADIX) << 52 }; // floating point number. 
// x = sign * [ sum_{i = 0 to 2} ( X[i] * 2^(index - i)*RADIX ) ] typedef struct - { - uint32_t X[3]; // three 32 bit integers are sufficient to represnt double in base_30 - int index; // exponent bias - int sign; // sign of double - }eprep_t; +{ + uint32_t X[3]; // three 32 bit integers are sufficient to represnt double in + // base_30 + int index; // exponent bias + int sign; // sign of double +} eprep_t; static eprep_t double_to_eprep(double x); @@ -3556,15 +4002,17 @@ static eprep_t double_to_eprep(double x) { eprep_t result; - result.sign = (signbit( x ) == 0) ? 1 : -1; - x = fabs( x ); + result.sign = (signbit(x) == 0) ? 1 : -1; + x = fabs(x); int index = 0; - while( x > tp_pradix ) { + while (x > tp_pradix) + { index++; x *= tp_mradix; } - while( x < 1 ) { + while (x < 1) + { index--; x *= tp_pradix; } @@ -3572,9 +4020,10 @@ static eprep_t double_to_eprep(double x) result.index = index; int i = 0; result.X[0] = result.X[1] = result.X[2] = 0; - while( x != 0.0 ) { - result.X[i] = (uint32_t) x; - x = (x - (double) result.X[i]) * tp_pradix; + while (x != 0.0) + { + result.X[i] = (uint32_t)x; + x = (x - (double)result.X[i]) * tp_pradix; i++; } return result; @@ -3660,102 +4109,120 @@ static eprep_t double_to_eprep(double x) return sgn*res; } */ -static double eprep_to_double( eprep_t epx ); +static double eprep_to_double(eprep_t epx); -static double eprep_to_double( eprep_t epx ) +static double eprep_to_double(eprep_t epx) { double res = 0.0; - res += ldexp((double) epx.X[0], (epx.index - 0)*RADIX); - res += ldexp((double) epx.X[1], (epx.index - 1)*RADIX); - res += ldexp((double) epx.X[2], (epx.index - 2)*RADIX); + res += ldexp((double)epx.X[0], (epx.index - 0) * RADIX); + res += ldexp((double)epx.X[1], (epx.index - 1) * RADIX); + res += ldexp((double)epx.X[2], (epx.index - 2) * RADIX); return copysign(res, epx.sign); } -static int payne_hanek( double *y, int *exception ); +static int payne_hanek(double *y, int *exception); -static int payne_hanek( double *y, 
int *exception ) +static int payne_hanek(double *y, int *exception) { double x = *y; // exception cases .. no reduction required - if( isnan( x ) || isinf( x ) || (fabs( x ) <= M_PI_4) ) { + if (isnan(x) || isinf(x) || (fabs(x) <= M_PI_4)) + { *exception = 1; return 0; } *exception = 0; - // After computation result[0] contains integer part while result[1]....result[DIGITS-1] - // contain fractional part. So we are doing computation with (DIGITS-1)*RADIX precision. - // Default DIGITS=6 and RADIX=30 so default precision is 150 bits. Kahan-McDonald algorithm - // shows that a double precision x, closest to pi/2 is 6381956970095103 x 2^797 which can - // cause 61 digits of cancellation in computation of f = x*2/pi - floor(x*2/pi) ... thus we need - // at least 114 bits (61 leading zeros + 53 bits of mentissa of f) of precision to accurately compute - // f in double precision. Since we are using 150 bits (still an overkill), we should be safe. Extra - // bits can act as guard bits for correct rounding. - uint64_t result[DIGITS+2]; + // After computation result[0] contains integer part while + // result[1]....result[DIGITS-1] contain fractional part. So we are doing + // computation with (DIGITS-1)*RADIX precision. Default DIGITS=6 and + // RADIX=30 so default precision is 150 bits. Kahan-McDonald algorithm shows + // that a double precision x, closest to pi/2 is 6381956970095103 x 2^797 + // which can cause 61 digits of cancellation in computation of f = x*2/pi - + // floor(x*2/pi) ... thus we need at least 114 bits (61 leading zeros + 53 + // bits of mentissa of f) of precision to accurately compute f in double + // precision. Since we are using 150 bits (still an overkill), we should be + // safe. Extra bits can act as guard bits for correct rounding. 
+ uint64_t result[DIGITS + 2]; // compute extended precision representation of x - eprep_t epx = double_to_eprep( x ); + eprep_t epx = double_to_eprep(x); int index = epx.index; int i, j; - // extended precision multiplication of 2/pi*x .... we will loose at max two RADIX=30 bit digits in - // the worst case - for(i = 0; i < (DIGITS+2); i++) { + // extended precision multiplication of 2/pi*x .... we will loose at max two + // RADIX=30 bit digits in the worst case + for (i = 0; i < (DIGITS + 2); i++) + { result[i] = 0; - result[i] += ((index + i - 0) >= 0) ? ((uint64_t) two_over_pi[index + i - 0] * (uint64_t) epx.X[0]) : 0; - result[i] += ((index + i - 1) >= 0) ? ((uint64_t) two_over_pi[index + i - 1] * (uint64_t) epx.X[1]) : 0; - result[i] += ((index + i - 2) >= 0) ? ((uint64_t) two_over_pi[index + i - 2] * (uint64_t) epx.X[2]) : 0; + result[i] += ((index + i - 0) >= 0) + ? ((uint64_t)two_over_pi[index + i - 0] * (uint64_t)epx.X[0]) + : 0; + result[i] += ((index + i - 1) >= 0) + ? ((uint64_t)two_over_pi[index + i - 1] * (uint64_t)epx.X[1]) + : 0; + result[i] += ((index + i - 2) >= 0) + ? ((uint64_t)two_over_pi[index + i - 2] * (uint64_t)epx.X[2]) + : 0; } // Carry propagation. uint64_t tmp; - for(i = DIGITS+2-1; i > 0; i--) { + for (i = DIGITS + 2 - 1; i > 0; i--) + { tmp = result[i] >> RADIX; result[i - 1] += tmp; result[i] -= (tmp << RADIX); } - // we dont ned to normalize the integer part since only last two bits of this will be used - // subsequently algorithm which remain unaltered by this normalization. - // tmp = result[0] >> RADIX; - // result[0] -= (tmp << RADIX); - unsigned int N = (unsigned int) result[0]; + // we dont ned to normalize the integer part since only last two bits of + // this will be used subsequently algorithm which remain unaltered by this + // normalization. tmp = result[0] >> RADIX; result[0] -= (tmp << RADIX); + unsigned int N = (unsigned int)result[0]; - // if the result is > pi/4, bring it to (-pi/4, pi/4] range. 
Note that testing if the final - // x_star = pi/2*(x*2/pi - k) > pi/4 is equivalent to testing, at this stage, if r[1] (the first fractional - // digit) is greater than (2^RADIX)/2 and substracting pi/4 from x_star to bring it to mentioned - // range is equivalent to substracting fractional part at this stage from one and changing the sign. + // if the result is > pi/4, bring it to (-pi/4, pi/4] range. Note that + // testing if the final x_star = pi/2*(x*2/pi - k) > pi/4 is equivalent to + // testing, at this stage, if r[1] (the first fractional digit) is greater + // than (2^RADIX)/2 and substracting pi/4 from x_star to bring it to + // mentioned range is equivalent to substracting fractional part at this + // stage from one and changing the sign. int sign = 1; - if(result[1] > (uint64_t)(1 << (RADIX - 1))) { - for(i = 1; i < (DIGITS + 2); i++) + if (result[1] > (uint64_t)(1 << (RADIX - 1))) + { + for (i = 1; i < (DIGITS + 2); i++) result[i] = (~((unsigned int)result[i]) & 0x3fffffff); N += 1; sign = -1; } - // Again as per Kahan-McDonald algorithim there may be 61 leading zeros in the worst case - // (when x is multiple of 2/pi very close to an integer) so we need to get rid of these zeros - // and adjust the index of final result. So in the worst case, precision of comupted result is - // 90 bits (150 bits original bits - 60 lost in cancellation). + // Again as per Kahan-McDonald algorithim there may be 61 leading zeros in + // the worst case (when x is multiple of 2/pi very close to an integer) so + // we need to get rid of these zeros and adjust the index of final result. + // So in the worst case, precision of comupted result is 90 bits (150 bits + // original bits - 60 lost in cancellation). 
int ind = 1; - for(i = 1; i < (DIGITS+2); i++) { - if(result[i] != 0) + for (i = 1; i < (DIGITS + 2); i++) + { + if (result[i] != 0) break; else ind++; } - uint64_t r[DIGITS-1]; - for(i = 0; i < (DIGITS-1); i++) { + uint64_t r[DIGITS - 1]; + for (i = 0; i < (DIGITS - 1); i++) + { r[i] = 0; - for(j = 0; j <= i; j++) { - r[i] += (result[ind+i-j] * (uint64_t) pi_over_two[j]); + for (j = 0; j <= i; j++) + { + r[i] += (result[ind + i - j] * (uint64_t)pi_over_two[j]); } } - for(i = (DIGITS-2); i > 0; i--) { + for (i = (DIGITS - 2); i > 0; i--) + { tmp = r[i] >> RADIX; r[i - 1] += tmp; r[i] -= (tmp << RADIX); @@ -3764,147 +4231,127 @@ static int payne_hanek( double *y, int *exception ) r[0] -= (tmp << RADIX); eprep_t epr; - epr.sign = epx.sign*sign; - if(tmp != 0) { + epr.sign = epx.sign * sign; + if (tmp != 0) + { epr.index = -ind + 1; - epr.X[0] = (uint32_t) tmp; - epr.X[1] = (uint32_t) r[0]; - epr.X[2] = (uint32_t) r[1]; + epr.X[0] = (uint32_t)tmp; + epr.X[1] = (uint32_t)r[0]; + epr.X[2] = (uint32_t)r[1]; } - else { + else + { epr.index = -ind; - epr.X[0] = (uint32_t) r[0]; - epr.X[1] = (uint32_t) r[1]; - epr.X[2] = (uint32_t) r[2]; + epr.X[0] = (uint32_t)r[0]; + epr.X[1] = (uint32_t)r[1]; + epr.X[2] = (uint32_t)r[2]; } - *y = eprep_to_double( epr ); - return epx.sign*N; + *y = eprep_to_double(epr); + return epx.sign * N; } double reference_relaxed_cos(double x) { - if(isnan(x)) - return NAN; - return (float)cos((float)x); + if (isnan(x)) return NAN; + return (float)cos((float)x); } double reference_cos(double x) { int exception; - int N = payne_hanek( &x, &exception ); - if( exception ) - return cos( x ); + int N = payne_hanek(&x, &exception); + if (exception) return cos(x); unsigned int c = N & 3; - switch ( c ) { - case 0: - return cos( x ); - case 1: - return -sin( x ); - case 2: - return -cos( x ); - case 3: - return sin( x ); + switch (c) + { + case 0: return cos(x); + case 1: return -sin(x); + case 2: return -cos(x); + case 3: return sin(x); } return 0.0; } 
-double reference_relaxed_sin(double x){ - return (float)sin((float)x); -} +double reference_relaxed_sin(double x) { return (float)sin((float)x); } double reference_sin(double x) { int exception; - int N = payne_hanek( &x, &exception ); - if( exception ) - return sin( x ); + int N = payne_hanek(&x, &exception); + if (exception) return sin(x); int c = N & 3; - switch ( c ) { - case 0: - return sin( x ); - case 1: - return cos( x ); - case 2: - return -sin( x ); - case 3: - return -cos( x ); + switch (c) + { + case 0: return sin(x); + case 1: return cos(x); + case 2: return -sin(x); + case 3: return -cos(x); } return 0.0; } -double reference_relaxed_sincos(double x, double * y){ - *y = reference_relaxed_cos(x); - return reference_relaxed_sin(x); +double reference_relaxed_sincos(double x, double *y) +{ + *y = reference_relaxed_cos(x); + return reference_relaxed_sin(x); } double reference_sincos(double x, double *y) { int exception; - int N = payne_hanek( &x, &exception ); - if( exception ) { - *y = cos( x ); - return sin( x ); + int N = payne_hanek(&x, &exception); + if (exception) + { + *y = cos(x); + return sin(x); } int c = N & 3; - switch ( c ) { - case 0: - *y = cos( x ); - return sin( x ); - case 1: - *y = -sin( x ); - return cos( x ); - case 2: - *y = -cos( x ); - return -sin( x ); - case 3: - *y = sin( x ); - return -cos( x ); + switch (c) + { + case 0: *y = cos(x); return sin(x); + case 1: *y = -sin(x); return cos(x); + case 2: *y = -cos(x); return -sin(x); + case 3: *y = sin(x); return -cos(x); } return 0.0; } -double reference_relaxed_tan(double x){ - return ((float) reference_relaxed_sin((float)x))/((float) reference_relaxed_cos((float)x)); +double reference_relaxed_tan(double x) +{ + return ((float)reference_relaxed_sin((float)x)) + / ((float)reference_relaxed_cos((float)x)); } double reference_tan(double x) { int exception; - int N = payne_hanek( &x, &exception ); - if( exception ) - return tan( x ); + int N = payne_hanek(&x, &exception); + if 
(exception) return tan(x); int c = N & 3; - switch ( c ) { - case 0: - return tan( x ); - case 1: - return -1.0 / tan( x ); - case 2: - return tan( x ); - case 3: - return -1.0 / tan( x ); + switch (c) + { + case 0: return tan(x); + case 1: return -1.0 / tan(x); + case 2: return tan(x); + case 3: return -1.0 / tan(x); } return 0.0; } long double reference_cosl(long double xx) { - double x = (double) xx; + double x = (double)xx; int exception; - int N = payne_hanek( &x, &exception ); - if( exception ) - return cosl( x ); + int N = payne_hanek(&x, &exception); + if (exception) return cosl(x); unsigned int c = N & 3; - switch ( c ) { - case 0: - return cosl( x ); - case 1: - return -sinl( x ); - case 2: - return -cosl( x ); - case 3: - return sinl( x ); + switch (c) + { + case 0: return cosl(x); + case 1: return -sinl(x); + case 2: return -cosl(x); + case 3: return sinl(x); } return 0.0; } @@ -3913,25 +4360,20 @@ long double reference_sinl(long double xx) { // we use system tanl after reduction which // can flush denorm input to zero so - //take care of it here. - if(reference_fabsl(xx) < HEX_DBL( +, 1, 0, -, 1022 )) - return xx; + // take care of it here. + if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022)) return xx; - double x = (double) xx; + double x = (double)xx; int exception; - int N = payne_hanek( &x, &exception ); - if( exception ) - return sinl( x ); + int N = payne_hanek(&x, &exception); + if (exception) return sinl(x); int c = N & 3; - switch ( c ) { - case 0: - return sinl( x ); - case 1: - return cosl( x ); - case 2: - return -sinl( x ); - case 3: - return -cosl( x ); + switch (c) + { + case 0: return sinl(x); + case 1: return cosl(x); + case 2: return -sinl(x); + case 3: return -cosl(x); } return 0.0; } @@ -3940,34 +4382,28 @@ long double reference_sincosl(long double xx, long double *y) { // we use system tanl after reduction which // can flush denorm input to zero so - //take care of it here. 
- if(reference_fabsl(xx) < HEX_DBL( +, 1, 0, -, 1022 )) + // take care of it here. + if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022)) { *y = cosl(xx); return xx; } - double x = (double) xx; + double x = (double)xx; int exception; - int N = payne_hanek( &x, &exception ); - if( exception ) { - *y = cosl( x ); - return sinl( x ); + int N = payne_hanek(&x, &exception); + if (exception) + { + *y = cosl(x); + return sinl(x); } int c = N & 3; - switch ( c ) { - case 0: - *y = cosl( x ); - return sinl( x ); - case 1: - *y = -sinl( x ); - return cosl( x ); - case 2: - *y = -cosl( x ); - return -sinl( x ); - case 3: - *y = sinl( x ); - return -cosl( x ); + switch (c) + { + case 0: *y = cosl(x); return sinl(x); + case 1: *y = -sinl(x); return cosl(x); + case 2: *y = -cosl(x); return -sinl(x); + case 3: *y = sinl(x); return -cosl(x); } return 0.0; } @@ -3976,205 +4412,337 @@ long double reference_tanl(long double xx) { // we use system tanl after reduction which // can flush denorm input to zero so - //take care of it here. - if(reference_fabsl(xx) < HEX_DBL( +, 1, 0, -, 1022 )) - return xx; + // take care of it here. 
+ if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022)) return xx; - double x = (double) xx; + double x = (double)xx; int exception; - int N = payne_hanek( &x, &exception ); - if( exception ) - return tanl( x ); + int N = payne_hanek(&x, &exception); + if (exception) return tanl(x); int c = N & 3; - switch ( c ) { - case 0: - return tanl( x ); - case 1: - return -1.0 / tanl( x ); - case 2: - return tanl( x ); - case 3: - return -1.0 / tanl( x ); + switch (c) + { + case 0: return tanl(x); + case 1: return -1.0 / tanl(x); + case 2: return tanl(x); + case 3: return -1.0 / tanl(x); } return 0.0; } static double __loglTable1[64][3] = { -{HEX_DBL( +, 1, 5390948f40fea, +, 0 ), HEX_DBL( -, 1, a152f142a, -, 2 ), HEX_DBL( +, 1, f93e27b43bd2c, -, 40 )}, -{HEX_DBL( +, 1, 5015015015015, +, 0 ), HEX_DBL( -, 1, 921800925, -, 2 ), HEX_DBL( +, 1, 162432a1b8df7, -, 41 )}, -{HEX_DBL( +, 1, 4cab88725af6e, +, 0 ), HEX_DBL( -, 1, 8304d90c18, -, 2 ), HEX_DBL( +, 1, 80bb749056fe7, -, 40 )}, -{HEX_DBL( +, 1, 49539e3b2d066, +, 0 ), HEX_DBL( -, 1, 7418acebc, -, 2 ), HEX_DBL( +, 1, ceac7f0607711, -, 43 )}, -{HEX_DBL( +, 1, 460cbc7f5cf9a, +, 0 ), HEX_DBL( -, 1, 6552b49988, -, 2 ), HEX_DBL( +, 1, d8913d0e89fa, -, 42 )}, -{HEX_DBL( +, 1, 42d6625d51f86, +, 0 ), HEX_DBL( -, 1, 56b22e6b58, -, 2 ), HEX_DBL( +, 1, c7eaf515033a1, -, 44 )}, -{HEX_DBL( +, 1, 3fb013fb013fb, +, 0 ), HEX_DBL( -, 1, 48365e696, -, 2 ), HEX_DBL( +, 1, 434adcde7edc7, -, 41 )}, -{HEX_DBL( +, 1, 3c995a47babe7, +, 0 ), HEX_DBL( -, 1, 39de8e156, -, 2 ), HEX_DBL( +, 1, 8246f8e527754, -, 40 )}, -{HEX_DBL( +, 1, 3991c2c187f63, +, 0 ), HEX_DBL( -, 1, 2baa0c34c, -, 2 ), HEX_DBL( +, 1, e1513c28e180d, -, 42 )}, -{HEX_DBL( +, 1, 3698df3de0747, +, 0 ), HEX_DBL( -, 1, 1d982c9d58, -, 2 ), HEX_DBL( +, 1, 63ea3fed4b8a2, -, 40 )}, -{HEX_DBL( +, 1, 33ae45b57bcb1, +, 0 ), HEX_DBL( -, 1, 0fa848045, -, 2 ), HEX_DBL( +, 1, 32ccbacf1779b, -, 40 )}, -{HEX_DBL( +, 1, 30d190130d19, +, 0 ), HEX_DBL( -, 1, 01d9bbcfa8, -, 2 ), HEX_DBL( +, 1, e2bfeb2b884aa, 
-, 42 )}, -{HEX_DBL( +, 1, 2e025c04b8097, +, 0 ), HEX_DBL( -, 1, e857d3d37, -, 3 ), HEX_DBL( +, 1, d9309b4d2ea85, -, 40 )}, -{HEX_DBL( +, 1, 2b404ad012b4, +, 0 ), HEX_DBL( -, 1, cd3c712d4, -, 3 ), HEX_DBL( +, 1, ddf360962d7ab, -, 40 )}, -{HEX_DBL( +, 1, 288b01288b012, +, 0 ), HEX_DBL( -, 1, b2602497e, -, 3 ), HEX_DBL( +, 1, 597f8a121640f, -, 40 )}, -{HEX_DBL( +, 1, 25e22708092f1, +, 0 ), HEX_DBL( -, 1, 97c1cb13d, -, 3 ), HEX_DBL( +, 1, 02807d15580dc, -, 40 )}, -{HEX_DBL( +, 1, 23456789abcdf, +, 0 ), HEX_DBL( -, 1, 7d60496d, -, 3 ), HEX_DBL( +, 1, 12ce913d7a827, -, 41 )}, -{HEX_DBL( +, 1, 20b470c67c0d8, +, 0 ), HEX_DBL( -, 1, 633a8bf44, -, 3 ), HEX_DBL( +, 1, 0648bca9c96bd, -, 40 )}, -{HEX_DBL( +, 1, 1e2ef3b3fb874, +, 0 ), HEX_DBL( -, 1, 494f863b9, -, 3 ), HEX_DBL( +, 1, 066fceb89b0eb, -, 42 )}, -{HEX_DBL( +, 1, 1bb4a4046ed29, +, 0 ), HEX_DBL( -, 1, 2f9e32d5c, -, 3 ), HEX_DBL( +, 1, 17b8b6c4f846b, -, 46 )}, -{HEX_DBL( +, 1, 19453808ca29c, +, 0 ), HEX_DBL( -, 1, 162593187, -, 3 ), HEX_DBL( +, 1, 2c83506452154, -, 42 )}, -{HEX_DBL( +, 1, 16e0689427378, +, 0 ), HEX_DBL( -, 1, f9c95dc1e, -, 4 ), HEX_DBL( +, 1, dd5d2183150f3, -, 41 )}, -{HEX_DBL( +, 1, 1485f0e0acd3b, +, 0 ), HEX_DBL( -, 1, c7b528b72, -, 4 ), HEX_DBL( +, 1, 0e43c4f4e619d, -, 40 )}, -{HEX_DBL( +, 1, 12358e75d3033, +, 0 ), HEX_DBL( -, 1, 960caf9ac, -, 4 ), HEX_DBL( +, 1, 20fbfd5902a1e, -, 42 )}, -{HEX_DBL( +, 1, 0fef010fef01, +, 0 ), HEX_DBL( -, 1, 64ce26c08, -, 4 ), HEX_DBL( +, 1, 8ebeefb4ac467, -, 40 )}, -{HEX_DBL( +, 1, 0db20a88f4695, +, 0 ), HEX_DBL( -, 1, 33f7cde16, -, 4 ), HEX_DBL( +, 1, 30b3312da7a7d, -, 40 )}, -{HEX_DBL( +, 1, 0b7e6ec259dc7, +, 0 ), HEX_DBL( -, 1, 0387efbcc, -, 4 ), HEX_DBL( +, 1, 796f1632949c3, -, 40 )}, -{HEX_DBL( +, 1, 0953f39010953, +, 0 ), HEX_DBL( -, 1, a6f9c378, -, 5 ), HEX_DBL( +, 1, 1687e151172cc, -, 40 )}, -{HEX_DBL( +, 1, 073260a47f7c6, +, 0 ), HEX_DBL( -, 1, 47aa07358, -, 5 ), HEX_DBL( +, 1, 1f87e4a9cc778, -, 42 )}, -{HEX_DBL( +, 1, 05197f7d73404, +, 0 ), HEX_DBL( -, 1, 
d23afc498, -, 6 ), HEX_DBL( +, 1, b183a6b628487, -, 40 )}, -{HEX_DBL( +, 1, 03091b51f5e1a, +, 0 ), HEX_DBL( -, 1, 16a21e21, -, 6 ), HEX_DBL( +, 1, 7d75c58973ce5, -, 40 )}, -{HEX_DBL( +, 1, 0, +, 0 ), HEX_DBL( +, 0, 0, +, 0 ), HEX_DBL( +, 0, 0, +, 0 )}, -{HEX_DBL( +, 1, 0, +, 0 ), HEX_DBL( +, 0, 0, +, 0 ), HEX_DBL( +, 0, 0, +, 0 )}, -{HEX_DBL( +, 1, f44659e4a4271, -, 1 ), HEX_DBL( +, 1, 11cd1d51, -, 5 ), HEX_DBL( +, 1, 9a0d857e2f4b2, -, 40 )}, -{HEX_DBL( +, 1, ecc07b301ecc, -, 1 ), HEX_DBL( +, 1, c4dfab908, -, 5 ), HEX_DBL( +, 1, 55b53fce557fd, -, 40 )}, -{HEX_DBL( +, 1, e573ac901e573, -, 1 ), HEX_DBL( +, 1, 3aa2fdd26, -, 4 ), HEX_DBL( +, 1, f1cb0c9532089, -, 40 )}, -{HEX_DBL( +, 1, de5d6e3f8868a, -, 1 ), HEX_DBL( +, 1, 918a16e46, -, 4 ), HEX_DBL( +, 1, 9af0dcd65a6e1, -, 43 )}, -{HEX_DBL( +, 1, d77b654b82c33, -, 1 ), HEX_DBL( +, 1, e72ec117e, -, 4 ), HEX_DBL( +, 1, a5b93c4ebe124, -, 40 )}, -{HEX_DBL( +, 1, d0cb58f6ec074, -, 1 ), HEX_DBL( +, 1, 1dcd19755, -, 3 ), HEX_DBL( +, 1, 5be50e71ddc6c, -, 42 )}, -{HEX_DBL( +, 1, ca4b3055ee191, -, 1 ), HEX_DBL( +, 1, 476a9f983, -, 3 ), HEX_DBL( +, 1, ee9a798719e7f, -, 40 )}, -{HEX_DBL( +, 1, c3f8f01c3f8f, -, 1 ), HEX_DBL( +, 1, 70742d4ef, -, 3 ), HEX_DBL( +, 1, 3ff1352c1219c, -, 46 )}, -{HEX_DBL( +, 1, bdd2b899406f7, -, 1 ), HEX_DBL( +, 1, 98edd077e, -, 3 ), HEX_DBL( +, 1, c383cd11362f4, -, 41 )}, -{HEX_DBL( +, 1, b7d6c3dda338b, -, 1 ), HEX_DBL( +, 1, c0db6cdd9, -, 3 ), HEX_DBL( +, 1, 37bd85b1a824e, -, 41 )}, -{HEX_DBL( +, 1, b2036406c80d9, -, 1 ), HEX_DBL( +, 1, e840be74e, -, 3 ), HEX_DBL( +, 1, a9334d525e1ec, -, 41 )}, -{HEX_DBL( +, 1, ac5701ac5701a, -, 1 ), HEX_DBL( +, 1, 0790adbb, -, 2 ), HEX_DBL( +, 1, 8060bfb6a491, -, 41 )}, -{HEX_DBL( +, 1, a6d01a6d01a6d, -, 1 ), HEX_DBL( +, 1, 1ac05b2918, -, 2 ), HEX_DBL( +, 1, c1c161471580a, -, 40 )}, -{HEX_DBL( +, 1, a16d3f97a4b01, -, 1 ), HEX_DBL( +, 1, 2db10fc4d8, -, 2 ), HEX_DBL( +, 1, ab1aa62214581, -, 42 )}, -{HEX_DBL( +, 1, 9c2d14ee4a101, -, 1 ), HEX_DBL( +, 1, 406463b1b, -, 2 
), HEX_DBL( +, 1, 12e95dbda6611, -, 44 )}, -{HEX_DBL( +, 1, 970e4f80cb872, -, 1 ), HEX_DBL( +, 1, 52dbdfc4c8, -, 2 ), HEX_DBL( +, 1, 6b53fee511af, -, 42 )}, -{HEX_DBL( +, 1, 920fb49d0e228, -, 1 ), HEX_DBL( +, 1, 6518fe467, -, 2 ), HEX_DBL( +, 1, eea7d7d7d1764, -, 40 )}, -{HEX_DBL( +, 1, 8d3018d3018d3, -, 1 ), HEX_DBL( +, 1, 771d2ba7e8, -, 2 ), HEX_DBL( +, 1, ecefa8d4fab97, -, 40 )}, -{HEX_DBL( +, 1, 886e5f0abb049, -, 1 ), HEX_DBL( +, 1, 88e9c72e08, -, 2 ), HEX_DBL( +, 1, 913ea3d33fd14, -, 41 )}, -{HEX_DBL( +, 1, 83c977ab2bedd, -, 1 ), HEX_DBL( +, 1, 9a802391e, -, 2 ), HEX_DBL( +, 1, 197e845877c94, -, 41 )}, -{HEX_DBL( +, 1, 7f405fd017f4, -, 1 ), HEX_DBL( +, 1, abe18797f, -, 2 ), HEX_DBL( +, 1, f4a52f8e8a81, -, 42 )}, -{HEX_DBL( +, 1, 7ad2208e0ecc3, -, 1 ), HEX_DBL( +, 1, bd0f2e9e78, -, 2 ), HEX_DBL( +, 1, 031f4336644cc, -, 42 )}, -{HEX_DBL( +, 1, 767dce434a9b1, -, 1 ), HEX_DBL( +, 1, ce0a4923a, -, 2 ), HEX_DBL( +, 1, 61f33c897020c, -, 40 )}, -{HEX_DBL( +, 1, 724287f46debc, -, 1 ), HEX_DBL( +, 1, ded3fd442, -, 2 ), HEX_DBL( +, 1, b2632e830632, -, 41 )}, -{HEX_DBL( +, 1, 6e1f76b4337c6, -, 1 ), HEX_DBL( +, 1, ef6d673288, -, 2 ), HEX_DBL( +, 1, 888ec245a0bf, -, 40 )}, -{HEX_DBL( +, 1, 6a13cd153729, -, 1 ), HEX_DBL( +, 1, ffd799a838, -, 2 ), HEX_DBL( +, 1, fe6f3b2f5fc8e, -, 40 )}, -{HEX_DBL( +, 1, 661ec6a5122f9, -, 1 ), HEX_DBL( +, 1, 0809cf27f4, -, 1 ), HEX_DBL( +, 1, 81eaa9ef284dd, -, 40 )}, -{HEX_DBL( +, 1, 623fa7701623f, -, 1 ), HEX_DBL( +, 1, 10113b153c, -, 1 ), HEX_DBL( +, 1, 1d7b07d6b1143, -, 42 )}, -{HEX_DBL( +, 1, 5e75bb8d015e7, -, 1 ), HEX_DBL( +, 1, 18028cf728, -, 1 ), HEX_DBL( +, 1, 76b100b1f6c6, -, 41 )}, -{HEX_DBL( +, 1, 5ac056b015ac, -, 1 ), HEX_DBL( +, 1, 1fde3d30e8, -, 1 ), HEX_DBL( +, 1, 26faeb9870945, -, 45 )}, -{HEX_DBL( +, 1, 571ed3c506b39, -, 1 ), HEX_DBL( +, 1, 27a4c0585c, -, 1 ), HEX_DBL( +, 1, 7f2c5344d762b, -, 42 )} + { HEX_DBL(+, 1, 5390948f40fea, +, 0), HEX_DBL(-, 1, a152f142a, -, 2), + HEX_DBL(+, 1, f93e27b43bd2c, -, 40) }, + { HEX_DBL(+, 1, 
5015015015015, +, 0), HEX_DBL(-, 1, 921800925, -, 2), + HEX_DBL(+, 1, 162432a1b8df7, -, 41) }, + { HEX_DBL(+, 1, 4cab88725af6e, +, 0), HEX_DBL(-, 1, 8304d90c18, -, 2), + HEX_DBL(+, 1, 80bb749056fe7, -, 40) }, + { HEX_DBL(+, 1, 49539e3b2d066, +, 0), HEX_DBL(-, 1, 7418acebc, -, 2), + HEX_DBL(+, 1, ceac7f0607711, -, 43) }, + { HEX_DBL(+, 1, 460cbc7f5cf9a, +, 0), HEX_DBL(-, 1, 6552b49988, -, 2), + HEX_DBL(+, 1, d8913d0e89fa, -, 42) }, + { HEX_DBL(+, 1, 42d6625d51f86, +, 0), HEX_DBL(-, 1, 56b22e6b58, -, 2), + HEX_DBL(+, 1, c7eaf515033a1, -, 44) }, + { HEX_DBL(+, 1, 3fb013fb013fb, +, 0), HEX_DBL(-, 1, 48365e696, -, 2), + HEX_DBL(+, 1, 434adcde7edc7, -, 41) }, + { HEX_DBL(+, 1, 3c995a47babe7, +, 0), HEX_DBL(-, 1, 39de8e156, -, 2), + HEX_DBL(+, 1, 8246f8e527754, -, 40) }, + { HEX_DBL(+, 1, 3991c2c187f63, +, 0), HEX_DBL(-, 1, 2baa0c34c, -, 2), + HEX_DBL(+, 1, e1513c28e180d, -, 42) }, + { HEX_DBL(+, 1, 3698df3de0747, +, 0), HEX_DBL(-, 1, 1d982c9d58, -, 2), + HEX_DBL(+, 1, 63ea3fed4b8a2, -, 40) }, + { HEX_DBL(+, 1, 33ae45b57bcb1, +, 0), HEX_DBL(-, 1, 0fa848045, -, 2), + HEX_DBL(+, 1, 32ccbacf1779b, -, 40) }, + { HEX_DBL(+, 1, 30d190130d19, +, 0), HEX_DBL(-, 1, 01d9bbcfa8, -, 2), + HEX_DBL(+, 1, e2bfeb2b884aa, -, 42) }, + { HEX_DBL(+, 1, 2e025c04b8097, +, 0), HEX_DBL(-, 1, e857d3d37, -, 3), + HEX_DBL(+, 1, d9309b4d2ea85, -, 40) }, + { HEX_DBL(+, 1, 2b404ad012b4, +, 0), HEX_DBL(-, 1, cd3c712d4, -, 3), + HEX_DBL(+, 1, ddf360962d7ab, -, 40) }, + { HEX_DBL(+, 1, 288b01288b012, +, 0), HEX_DBL(-, 1, b2602497e, -, 3), + HEX_DBL(+, 1, 597f8a121640f, -, 40) }, + { HEX_DBL(+, 1, 25e22708092f1, +, 0), HEX_DBL(-, 1, 97c1cb13d, -, 3), + HEX_DBL(+, 1, 02807d15580dc, -, 40) }, + { HEX_DBL(+, 1, 23456789abcdf, +, 0), HEX_DBL(-, 1, 7d60496d, -, 3), + HEX_DBL(+, 1, 12ce913d7a827, -, 41) }, + { HEX_DBL(+, 1, 20b470c67c0d8, +, 0), HEX_DBL(-, 1, 633a8bf44, -, 3), + HEX_DBL(+, 1, 0648bca9c96bd, -, 40) }, + { HEX_DBL(+, 1, 1e2ef3b3fb874, +, 0), HEX_DBL(-, 1, 494f863b9, -, 3), + HEX_DBL(+, 1, 
066fceb89b0eb, -, 42) }, + { HEX_DBL(+, 1, 1bb4a4046ed29, +, 0), HEX_DBL(-, 1, 2f9e32d5c, -, 3), + HEX_DBL(+, 1, 17b8b6c4f846b, -, 46) }, + { HEX_DBL(+, 1, 19453808ca29c, +, 0), HEX_DBL(-, 1, 162593187, -, 3), + HEX_DBL(+, 1, 2c83506452154, -, 42) }, + { HEX_DBL(+, 1, 16e0689427378, +, 0), HEX_DBL(-, 1, f9c95dc1e, -, 4), + HEX_DBL(+, 1, dd5d2183150f3, -, 41) }, + { HEX_DBL(+, 1, 1485f0e0acd3b, +, 0), HEX_DBL(-, 1, c7b528b72, -, 4), + HEX_DBL(+, 1, 0e43c4f4e619d, -, 40) }, + { HEX_DBL(+, 1, 12358e75d3033, +, 0), HEX_DBL(-, 1, 960caf9ac, -, 4), + HEX_DBL(+, 1, 20fbfd5902a1e, -, 42) }, + { HEX_DBL(+, 1, 0fef010fef01, +, 0), HEX_DBL(-, 1, 64ce26c08, -, 4), + HEX_DBL(+, 1, 8ebeefb4ac467, -, 40) }, + { HEX_DBL(+, 1, 0db20a88f4695, +, 0), HEX_DBL(-, 1, 33f7cde16, -, 4), + HEX_DBL(+, 1, 30b3312da7a7d, -, 40) }, + { HEX_DBL(+, 1, 0b7e6ec259dc7, +, 0), HEX_DBL(-, 1, 0387efbcc, -, 4), + HEX_DBL(+, 1, 796f1632949c3, -, 40) }, + { HEX_DBL(+, 1, 0953f39010953, +, 0), HEX_DBL(-, 1, a6f9c378, -, 5), + HEX_DBL(+, 1, 1687e151172cc, -, 40) }, + { HEX_DBL(+, 1, 073260a47f7c6, +, 0), HEX_DBL(-, 1, 47aa07358, -, 5), + HEX_DBL(+, 1, 1f87e4a9cc778, -, 42) }, + { HEX_DBL(+, 1, 05197f7d73404, +, 0), HEX_DBL(-, 1, d23afc498, -, 6), + HEX_DBL(+, 1, b183a6b628487, -, 40) }, + { HEX_DBL(+, 1, 03091b51f5e1a, +, 0), HEX_DBL(-, 1, 16a21e21, -, 6), + HEX_DBL(+, 1, 7d75c58973ce5, -, 40) }, + { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) }, + { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) }, + { HEX_DBL(+, 1, f44659e4a4271, -, 1), HEX_DBL(+, 1, 11cd1d51, -, 5), + HEX_DBL(+, 1, 9a0d857e2f4b2, -, 40) }, + { HEX_DBL(+, 1, ecc07b301ecc, -, 1), HEX_DBL(+, 1, c4dfab908, -, 5), + HEX_DBL(+, 1, 55b53fce557fd, -, 40) }, + { HEX_DBL(+, 1, e573ac901e573, -, 1), HEX_DBL(+, 1, 3aa2fdd26, -, 4), + HEX_DBL(+, 1, f1cb0c9532089, -, 40) }, + { HEX_DBL(+, 1, de5d6e3f8868a, -, 1), HEX_DBL(+, 1, 918a16e46, -, 4), + HEX_DBL(+, 1, 9af0dcd65a6e1, -, 43) }, + { HEX_DBL(+, 
1, d77b654b82c33, -, 1), HEX_DBL(+, 1, e72ec117e, -, 4), + HEX_DBL(+, 1, a5b93c4ebe124, -, 40) }, + { HEX_DBL(+, 1, d0cb58f6ec074, -, 1), HEX_DBL(+, 1, 1dcd19755, -, 3), + HEX_DBL(+, 1, 5be50e71ddc6c, -, 42) }, + { HEX_DBL(+, 1, ca4b3055ee191, -, 1), HEX_DBL(+, 1, 476a9f983, -, 3), + HEX_DBL(+, 1, ee9a798719e7f, -, 40) }, + { HEX_DBL(+, 1, c3f8f01c3f8f, -, 1), HEX_DBL(+, 1, 70742d4ef, -, 3), + HEX_DBL(+, 1, 3ff1352c1219c, -, 46) }, + { HEX_DBL(+, 1, bdd2b899406f7, -, 1), HEX_DBL(+, 1, 98edd077e, -, 3), + HEX_DBL(+, 1, c383cd11362f4, -, 41) }, + { HEX_DBL(+, 1, b7d6c3dda338b, -, 1), HEX_DBL(+, 1, c0db6cdd9, -, 3), + HEX_DBL(+, 1, 37bd85b1a824e, -, 41) }, + { HEX_DBL(+, 1, b2036406c80d9, -, 1), HEX_DBL(+, 1, e840be74e, -, 3), + HEX_DBL(+, 1, a9334d525e1ec, -, 41) }, + { HEX_DBL(+, 1, ac5701ac5701a, -, 1), HEX_DBL(+, 1, 0790adbb, -, 2), + HEX_DBL(+, 1, 8060bfb6a491, -, 41) }, + { HEX_DBL(+, 1, a6d01a6d01a6d, -, 1), HEX_DBL(+, 1, 1ac05b2918, -, 2), + HEX_DBL(+, 1, c1c161471580a, -, 40) }, + { HEX_DBL(+, 1, a16d3f97a4b01, -, 1), HEX_DBL(+, 1, 2db10fc4d8, -, 2), + HEX_DBL(+, 1, ab1aa62214581, -, 42) }, + { HEX_DBL(+, 1, 9c2d14ee4a101, -, 1), HEX_DBL(+, 1, 406463b1b, -, 2), + HEX_DBL(+, 1, 12e95dbda6611, -, 44) }, + { HEX_DBL(+, 1, 970e4f80cb872, -, 1), HEX_DBL(+, 1, 52dbdfc4c8, -, 2), + HEX_DBL(+, 1, 6b53fee511af, -, 42) }, + { HEX_DBL(+, 1, 920fb49d0e228, -, 1), HEX_DBL(+, 1, 6518fe467, -, 2), + HEX_DBL(+, 1, eea7d7d7d1764, -, 40) }, + { HEX_DBL(+, 1, 8d3018d3018d3, -, 1), HEX_DBL(+, 1, 771d2ba7e8, -, 2), + HEX_DBL(+, 1, ecefa8d4fab97, -, 40) }, + { HEX_DBL(+, 1, 886e5f0abb049, -, 1), HEX_DBL(+, 1, 88e9c72e08, -, 2), + HEX_DBL(+, 1, 913ea3d33fd14, -, 41) }, + { HEX_DBL(+, 1, 83c977ab2bedd, -, 1), HEX_DBL(+, 1, 9a802391e, -, 2), + HEX_DBL(+, 1, 197e845877c94, -, 41) }, + { HEX_DBL(+, 1, 7f405fd017f4, -, 1), HEX_DBL(+, 1, abe18797f, -, 2), + HEX_DBL(+, 1, f4a52f8e8a81, -, 42) }, + { HEX_DBL(+, 1, 7ad2208e0ecc3, -, 1), HEX_DBL(+, 1, bd0f2e9e78, -, 2), + HEX_DBL(+, 1, 
031f4336644cc, -, 42) }, + { HEX_DBL(+, 1, 767dce434a9b1, -, 1), HEX_DBL(+, 1, ce0a4923a, -, 2), + HEX_DBL(+, 1, 61f33c897020c, -, 40) }, + { HEX_DBL(+, 1, 724287f46debc, -, 1), HEX_DBL(+, 1, ded3fd442, -, 2), + HEX_DBL(+, 1, b2632e830632, -, 41) }, + { HEX_DBL(+, 1, 6e1f76b4337c6, -, 1), HEX_DBL(+, 1, ef6d673288, -, 2), + HEX_DBL(+, 1, 888ec245a0bf, -, 40) }, + { HEX_DBL(+, 1, 6a13cd153729, -, 1), HEX_DBL(+, 1, ffd799a838, -, 2), + HEX_DBL(+, 1, fe6f3b2f5fc8e, -, 40) }, + { HEX_DBL(+, 1, 661ec6a5122f9, -, 1), HEX_DBL(+, 1, 0809cf27f4, -, 1), + HEX_DBL(+, 1, 81eaa9ef284dd, -, 40) }, + { HEX_DBL(+, 1, 623fa7701623f, -, 1), HEX_DBL(+, 1, 10113b153c, -, 1), + HEX_DBL(+, 1, 1d7b07d6b1143, -, 42) }, + { HEX_DBL(+, 1, 5e75bb8d015e7, -, 1), HEX_DBL(+, 1, 18028cf728, -, 1), + HEX_DBL(+, 1, 76b100b1f6c6, -, 41) }, + { HEX_DBL(+, 1, 5ac056b015ac, -, 1), HEX_DBL(+, 1, 1fde3d30e8, -, 1), + HEX_DBL(+, 1, 26faeb9870945, -, 45) }, + { HEX_DBL(+, 1, 571ed3c506b39, -, 1), HEX_DBL(+, 1, 27a4c0585c, -, 1), + HEX_DBL(+, 1, 7f2c5344d762b, -, 42) } }; static double __loglTable2[64][3] = { -{HEX_DBL( +, 1, 01fbe7f0a1be6, +, 0 ), HEX_DBL( -, 1, 6cf6ddd26112a, -, 7 ), HEX_DBL( +, 1, 0725e5755e314, -, 60 )}, -{HEX_DBL( +, 1, 01eba93a97b12, +, 0 ), HEX_DBL( -, 1, 6155b1d99f603, -, 7 ), HEX_DBL( +, 1, 4bcea073117f4, -, 60 )}, -{HEX_DBL( +, 1, 01db6c9029cd1, +, 0 ), HEX_DBL( -, 1, 55b54153137ff, -, 7 ), HEX_DBL( +, 1, 21e8faccad0ec, -, 61 )}, -{HEX_DBL( +, 1, 01cb31f0f534c, +, 0 ), HEX_DBL( -, 1, 4a158c27245bd, -, 7 ), HEX_DBL( +, 1, 1a5b7bfbf35d3, -, 60 )}, -{HEX_DBL( +, 1, 01baf95c9723c, +, 0 ), HEX_DBL( -, 1, 3e76923e3d678, -, 7 ), HEX_DBL( +, 1, eee400eb5fe34, -, 62 )}, -{HEX_DBL( +, 1, 01aac2d2acee6, +, 0 ), HEX_DBL( -, 1, 32d85380ce776, -, 7 ), HEX_DBL( +, 1, cbf7a513937bd, -, 61 )}, -{HEX_DBL( +, 1, 019a8e52d401e, +, 0 ), HEX_DBL( -, 1, 273acfd74be72, -, 7 ), HEX_DBL( +, 1, 5c64599efa5e6, -, 60 )}, -{HEX_DBL( +, 1, 018a5bdca9e42, +, 0 ), HEX_DBL( -, 1, 1b9e072a2e65, -, 7 ), HEX_DBL( +, 
1, 364180e0a5d37, -, 60 )}, -{HEX_DBL( +, 1, 017a2b6fcc33e, +, 0 ), HEX_DBL( -, 1, 1001f961f3243, -, 7 ), HEX_DBL( +, 1, 63d795746f216, -, 60 )}, -{HEX_DBL( +, 1, 0169fd0bd8a8a, +, 0 ), HEX_DBL( -, 1, 0466a6671bca4, -, 7 ), HEX_DBL( +, 1, 4c99ff1907435, -, 60 )}, -{HEX_DBL( +, 1, 0159d0b06d129, +, 0 ), HEX_DBL( -, 1, f1981c445cd05, -, 8 ), HEX_DBL( +, 1, 4bfff6366b723, -, 62 )}, -{HEX_DBL( +, 1, 0149a65d275a6, +, 0 ), HEX_DBL( -, 1, da6460f76ab8c, -, 8 ), HEX_DBL( +, 1, 9c5404f47589c, -, 61 )}, -{HEX_DBL( +, 1, 01397e11a581b, +, 0 ), HEX_DBL( -, 1, c3321ab87f4ef, -, 8 ), HEX_DBL( +, 1, c0da537429cea, -, 61 )}, -{HEX_DBL( +, 1, 012957cd85a28, +, 0 ), HEX_DBL( -, 1, ac014958c112c, -, 8 ), HEX_DBL( +, 1, 000c2a1b595e3, -, 64 )}, -{HEX_DBL( +, 1, 0119339065ef7, +, 0 ), HEX_DBL( -, 1, 94d1eca95f67a, -, 8 ), HEX_DBL( +, 1, d8d20b0564d5, -, 61 )}, -{HEX_DBL( +, 1, 01091159e4b3d, +, 0 ), HEX_DBL( -, 1, 7da4047b92b3e, -, 8 ), HEX_DBL( +, 1, 6194a5d68cf2, -, 66 )}, -{HEX_DBL( +, 1, 00f8f129a0535, +, 0 ), HEX_DBL( -, 1, 667790a09bf77, -, 8 ), HEX_DBL( +, 1, ca230e0bea645, -, 61 )}, -{HEX_DBL( +, 1, 00e8d2ff374a1, +, 0 ), HEX_DBL( -, 1, 4f4c90e9c4ead, -, 8 ), HEX_DBL( +, 1, 1de3e7f350c1, -, 61 )}, -{HEX_DBL( +, 1, 00d8b6da482ce, +, 0 ), HEX_DBL( -, 1, 3823052860649, -, 8 ), HEX_DBL( +, 1, 5789b4c5891b8, -, 64 )}, -{HEX_DBL( +, 1, 00c89cba71a8c, +, 0 ), HEX_DBL( -, 1, 20faed2dc9a9e, -, 8 ), HEX_DBL( +, 1, 9e7c40f9839fd, -, 62 )}, -{HEX_DBL( +, 1, 00b8849f52834, +, 0 ), HEX_DBL( -, 1, 09d448cb65014, -, 8 ), HEX_DBL( +, 1, 387e3e9b6d02, -, 62 )}, -{HEX_DBL( +, 1, 00a86e88899a4, +, 0 ), HEX_DBL( -, 1, e55e2fa53ebf1, -, 9 ), HEX_DBL( +, 1, cdaa71fddfddf, -, 62 )}, -{HEX_DBL( +, 1, 00985a75b5e3f, +, 0 ), HEX_DBL( -, 1, b716b429dce0f, -, 9 ), HEX_DBL( +, 1, 2f2af081367bf, -, 63 )}, -{HEX_DBL( +, 1, 00884866766ee, +, 0 ), HEX_DBL( -, 1, 88d21ec7a16d7, -, 9 ), HEX_DBL( +, 1, fb95c228d6f16, -, 62 )}, -{HEX_DBL( +, 1, 0078385a6a61d, +, 0 ), HEX_DBL( -, 1, 5a906f219a9e8, -, 9 ), HEX_DBL( 
+, 1, 18aff10a89f29, -, 64 )}, -{HEX_DBL( +, 1, 00682a5130fbe, +, 0 ), HEX_DBL( -, 1, 2c51a4dae87f1, -, 9 ), HEX_DBL( +, 1, bcc7e33ddde3, -, 63 )}, -{HEX_DBL( +, 1, 00581e4a69944, +, 0 ), HEX_DBL( -, 1, fc2b7f2d782b1, -, 10 ), HEX_DBL( +, 1, fe3ef3300a9fa, -, 64 )}, -{HEX_DBL( +, 1, 00481445b39a8, +, 0 ), HEX_DBL( -, 1, 9fb97df0b0b83, -, 10 ), HEX_DBL( +, 1, 0d9a601f2f324, -, 65 )}, -{HEX_DBL( +, 1, 00380c42ae963, +, 0 ), HEX_DBL( -, 1, 434d4546227ae, -, 10 ), HEX_DBL( +, 1, 0b9b6a5868f33, -, 63 )}, -{HEX_DBL( +, 1, 00280640fa271, +, 0 ), HEX_DBL( -, 1, cdcda8e930c19, -, 11 ), HEX_DBL( +, 1, 3d424ab39f789, -, 64 )}, -{HEX_DBL( +, 1, 0018024036051, +, 0 ), HEX_DBL( -, 1, 150c558601261, -, 11 ), HEX_DBL( +, 1, 285bb90327a0f, -, 64 )}, -{HEX_DBL( +, 1, 0, +, 0 ), HEX_DBL( +, 0, 0, +, 0 ), HEX_DBL( +, 0, 0, +, 0 )}, -{HEX_DBL( +, 1, 0, +, 0 ), HEX_DBL( +, 0, 0, +, 0 ), HEX_DBL( +, 0, 0, +, 0 )}, -{HEX_DBL( +, 1, ffa011fca0a1e, -, 1 ), HEX_DBL( +, 1, 14e5640c4197b, -, 10 ), HEX_DBL( +, 1, 95728136ae401, -, 63 )}, -{HEX_DBL( +, 1, ff6031f064e07, -, 1 ), HEX_DBL( +, 1, cd61806bf532d, -, 10 ), HEX_DBL( +, 1, 568a4f35d8538, -, 63 )}, -{HEX_DBL( +, 1, ff2061d532b9c, -, 1 ), HEX_DBL( +, 1, 42e34af550eda, -, 9 ), HEX_DBL( +, 1, 8f69cee55fec, -, 62 )}, -{HEX_DBL( +, 1, fee0a1a513253, -, 1 ), HEX_DBL( +, 1, 9f0a5523902ea, -, 9 ), HEX_DBL( +, 1, daec734b11615, -, 63 )}, -{HEX_DBL( +, 1, fea0f15a12139, -, 1 ), HEX_DBL( +, 1, fb25e19f11b26, -, 9 ), HEX_DBL( +, 1, 8bafca62941da, -, 62 )}, -{HEX_DBL( +, 1, fe6150ee3e6d4, -, 1 ), HEX_DBL( +, 1, 2b9af9a28e282, -, 8 ), HEX_DBL( +, 1, 0fd3674e1dc5b, -, 61 )}, -{HEX_DBL( +, 1, fe21c05baa109, -, 1 ), HEX_DBL( +, 1, 599d4678f24b9, -, 8 ), HEX_DBL( +, 1, dafce1f09937b, -, 61 )}, -{HEX_DBL( +, 1, fde23f9c69cf9, -, 1 ), HEX_DBL( +, 1, 8799d8c046eb, -, 8 ), HEX_DBL( +, 1, ffa0ce0bdd217, -, 65 )}, -{HEX_DBL( +, 1, fda2ceaa956e8, -, 1 ), HEX_DBL( +, 1, b590b1e5951ee, -, 8 ), HEX_DBL( +, 1, 645a769232446, -, 62 )}, -{HEX_DBL( +, 1, fd636d8047a1f, 
-, 1 ), HEX_DBL( +, 1, e381d3555dbcf, -, 8 ), HEX_DBL( +, 1, 882320d368331, -, 61 )}, -{HEX_DBL( +, 1, fd241c179e0cc, -, 1 ), HEX_DBL( +, 1, 08b69f3dccde, -, 7 ), HEX_DBL( +, 1, 01ad5065aba9e, -, 61 )}, -{HEX_DBL( +, 1, fce4da6ab93e8, -, 1 ), HEX_DBL( +, 1, 1fa97a61dd298, -, 7 ), HEX_DBL( +, 1, 84cd1f931ae34, -, 60 )}, -{HEX_DBL( +, 1, fca5a873bcb19, -, 1 ), HEX_DBL( +, 1, 36997bcc54a3f, -, 7 ), HEX_DBL( +, 1, 1485e97eaee03, -, 60 )}, -{HEX_DBL( +, 1, fc66862ccec93, -, 1 ), HEX_DBL( +, 1, 4d86a43264a4f, -, 7 ), HEX_DBL( +, 1, c75e63370988b, -, 61 )}, -{HEX_DBL( +, 1, fc27739018cfe, -, 1 ), HEX_DBL( +, 1, 6470f448fb09d, -, 7 ), HEX_DBL( +, 1, d7361eeaed0a1, -, 65 )}, -{HEX_DBL( +, 1, fbe87097c6f5a, -, 1 ), HEX_DBL( +, 1, 7b586cc4c2523, -, 7 ), HEX_DBL( +, 1, b3df952cc473c, -, 61 )}, -{HEX_DBL( +, 1, fba97d3e084dd, -, 1 ), HEX_DBL( +, 1, 923d0e5a21e06, -, 7 ), HEX_DBL( +, 1, cf56c7b64ae5d, -, 62 )}, -{HEX_DBL( +, 1, fb6a997d0ecdc, -, 1 ), HEX_DBL( +, 1, a91ed9bd3df9a, -, 7 ), HEX_DBL( +, 1, b957bdcd89e43, -, 61 )}, -{HEX_DBL( +, 1, fb2bc54f0f4ab, -, 1 ), HEX_DBL( +, 1, bffdcfa1f7fbb, -, 7 ), HEX_DBL( +, 1, ea8cad9a21771, -, 62 )}, -{HEX_DBL( +, 1, faed00ae41783, -, 1 ), HEX_DBL( +, 1, d6d9f0bbee6f6, -, 7 ), HEX_DBL( +, 1, 5762a9af89c82, -, 60 )}, -{HEX_DBL( +, 1, faae4b94dfe64, -, 1 ), HEX_DBL( +, 1, edb33dbe7d335, -, 7 ), HEX_DBL( +, 1, 21e24fc245697, -, 62 )}, -{HEX_DBL( +, 1, fa6fa5fd27ff8, -, 1 ), HEX_DBL( +, 1, 0244dbae5ed05, -, 6 ), HEX_DBL( +, 1, 12ef51b967102, -, 60 )}, -{HEX_DBL( +, 1, fa310fe15a078, -, 1 ), HEX_DBL( +, 1, 0daeaf24c3529, -, 6 ), HEX_DBL( +, 1, 10d3cfca60b45, -, 59 )}, -{HEX_DBL( +, 1, f9f2893bb9192, -, 1 ), HEX_DBL( +, 1, 1917199bb66bc, -, 6 ), HEX_DBL( +, 1, 6cf6034c32e19, -, 60 )}, -{HEX_DBL( +, 1, f9b412068b247, -, 1 ), HEX_DBL( +, 1, 247e1b6c615d5, -, 6 ), HEX_DBL( +, 1, 42f0fffa229f7, -, 61 )}, -{HEX_DBL( +, 1, f975aa3c18ed6, -, 1 ), HEX_DBL( +, 1, 2fe3b4efcc5ad, -, 6 ), HEX_DBL( +, 1, 70106136a8919, -, 60 )}, -{HEX_DBL( +, 1, 
f93751d6ae09b, -, 1 ), HEX_DBL( +, 1, 3b47e67edea93, -, 6 ), HEX_DBL( +, 1, 38dd5a4f6959a, -, 59 )}, -{HEX_DBL( +, 1, f8f908d098df6, -, 1 ), HEX_DBL( +, 1, 46aab0725ea6c, -, 6 ), HEX_DBL( +, 1, 821fc1e799e01, -, 60 )}, -{HEX_DBL( +, 1, f8bacf242aa2c, -, 1 ), HEX_DBL( +, 1, 520c1322f1e4e, -, 6 ), HEX_DBL( +, 1, 129dcda3ad563, -, 60 )}, -{HEX_DBL( +, 1, f87ca4cbb755, -, 1 ), HEX_DBL( +, 1, 5d6c0ee91d2ab, -, 6 ), HEX_DBL( +, 1, c5b190c04606e, -, 62 )}, -{HEX_DBL( +, 1, f83e89c195c25, -, 1 ), HEX_DBL( +, 1, 68caa41d448c3, -, 6 ), HEX_DBL( +, 1, 4723441195ac9, -, 59 )} + { HEX_DBL(+, 1, 01fbe7f0a1be6, +, 0), HEX_DBL(-, 1, 6cf6ddd26112a, -, 7), + HEX_DBL(+, 1, 0725e5755e314, -, 60) }, + { HEX_DBL(+, 1, 01eba93a97b12, +, 0), HEX_DBL(-, 1, 6155b1d99f603, -, 7), + HEX_DBL(+, 1, 4bcea073117f4, -, 60) }, + { HEX_DBL(+, 1, 01db6c9029cd1, +, 0), HEX_DBL(-, 1, 55b54153137ff, -, 7), + HEX_DBL(+, 1, 21e8faccad0ec, -, 61) }, + { HEX_DBL(+, 1, 01cb31f0f534c, +, 0), HEX_DBL(-, 1, 4a158c27245bd, -, 7), + HEX_DBL(+, 1, 1a5b7bfbf35d3, -, 60) }, + { HEX_DBL(+, 1, 01baf95c9723c, +, 0), HEX_DBL(-, 1, 3e76923e3d678, -, 7), + HEX_DBL(+, 1, eee400eb5fe34, -, 62) }, + { HEX_DBL(+, 1, 01aac2d2acee6, +, 0), HEX_DBL(-, 1, 32d85380ce776, -, 7), + HEX_DBL(+, 1, cbf7a513937bd, -, 61) }, + { HEX_DBL(+, 1, 019a8e52d401e, +, 0), HEX_DBL(-, 1, 273acfd74be72, -, 7), + HEX_DBL(+, 1, 5c64599efa5e6, -, 60) }, + { HEX_DBL(+, 1, 018a5bdca9e42, +, 0), HEX_DBL(-, 1, 1b9e072a2e65, -, 7), + HEX_DBL(+, 1, 364180e0a5d37, -, 60) }, + { HEX_DBL(+, 1, 017a2b6fcc33e, +, 0), HEX_DBL(-, 1, 1001f961f3243, -, 7), + HEX_DBL(+, 1, 63d795746f216, -, 60) }, + { HEX_DBL(+, 1, 0169fd0bd8a8a, +, 0), HEX_DBL(-, 1, 0466a6671bca4, -, 7), + HEX_DBL(+, 1, 4c99ff1907435, -, 60) }, + { HEX_DBL(+, 1, 0159d0b06d129, +, 0), HEX_DBL(-, 1, f1981c445cd05, -, 8), + HEX_DBL(+, 1, 4bfff6366b723, -, 62) }, + { HEX_DBL(+, 1, 0149a65d275a6, +, 0), HEX_DBL(-, 1, da6460f76ab8c, -, 8), + HEX_DBL(+, 1, 9c5404f47589c, -, 61) }, + { HEX_DBL(+, 1, 
01397e11a581b, +, 0), HEX_DBL(-, 1, c3321ab87f4ef, -, 8), + HEX_DBL(+, 1, c0da537429cea, -, 61) }, + { HEX_DBL(+, 1, 012957cd85a28, +, 0), HEX_DBL(-, 1, ac014958c112c, -, 8), + HEX_DBL(+, 1, 000c2a1b595e3, -, 64) }, + { HEX_DBL(+, 1, 0119339065ef7, +, 0), HEX_DBL(-, 1, 94d1eca95f67a, -, 8), + HEX_DBL(+, 1, d8d20b0564d5, -, 61) }, + { HEX_DBL(+, 1, 01091159e4b3d, +, 0), HEX_DBL(-, 1, 7da4047b92b3e, -, 8), + HEX_DBL(+, 1, 6194a5d68cf2, -, 66) }, + { HEX_DBL(+, 1, 00f8f129a0535, +, 0), HEX_DBL(-, 1, 667790a09bf77, -, 8), + HEX_DBL(+, 1, ca230e0bea645, -, 61) }, + { HEX_DBL(+, 1, 00e8d2ff374a1, +, 0), HEX_DBL(-, 1, 4f4c90e9c4ead, -, 8), + HEX_DBL(+, 1, 1de3e7f350c1, -, 61) }, + { HEX_DBL(+, 1, 00d8b6da482ce, +, 0), HEX_DBL(-, 1, 3823052860649, -, 8), + HEX_DBL(+, 1, 5789b4c5891b8, -, 64) }, + { HEX_DBL(+, 1, 00c89cba71a8c, +, 0), HEX_DBL(-, 1, 20faed2dc9a9e, -, 8), + HEX_DBL(+, 1, 9e7c40f9839fd, -, 62) }, + { HEX_DBL(+, 1, 00b8849f52834, +, 0), HEX_DBL(-, 1, 09d448cb65014, -, 8), + HEX_DBL(+, 1, 387e3e9b6d02, -, 62) }, + { HEX_DBL(+, 1, 00a86e88899a4, +, 0), HEX_DBL(-, 1, e55e2fa53ebf1, -, 9), + HEX_DBL(+, 1, cdaa71fddfddf, -, 62) }, + { HEX_DBL(+, 1, 00985a75b5e3f, +, 0), HEX_DBL(-, 1, b716b429dce0f, -, 9), + HEX_DBL(+, 1, 2f2af081367bf, -, 63) }, + { HEX_DBL(+, 1, 00884866766ee, +, 0), HEX_DBL(-, 1, 88d21ec7a16d7, -, 9), + HEX_DBL(+, 1, fb95c228d6f16, -, 62) }, + { HEX_DBL(+, 1, 0078385a6a61d, +, 0), HEX_DBL(-, 1, 5a906f219a9e8, -, 9), + HEX_DBL(+, 1, 18aff10a89f29, -, 64) }, + { HEX_DBL(+, 1, 00682a5130fbe, +, 0), HEX_DBL(-, 1, 2c51a4dae87f1, -, 9), + HEX_DBL(+, 1, bcc7e33ddde3, -, 63) }, + { HEX_DBL(+, 1, 00581e4a69944, +, 0), HEX_DBL(-, 1, fc2b7f2d782b1, -, 10), + HEX_DBL(+, 1, fe3ef3300a9fa, -, 64) }, + { HEX_DBL(+, 1, 00481445b39a8, +, 0), HEX_DBL(-, 1, 9fb97df0b0b83, -, 10), + HEX_DBL(+, 1, 0d9a601f2f324, -, 65) }, + { HEX_DBL(+, 1, 00380c42ae963, +, 0), HEX_DBL(-, 1, 434d4546227ae, -, 10), + HEX_DBL(+, 1, 0b9b6a5868f33, -, 63) }, + { HEX_DBL(+, 1, 
00280640fa271, +, 0), HEX_DBL(-, 1, cdcda8e930c19, -, 11), + HEX_DBL(+, 1, 3d424ab39f789, -, 64) }, + { HEX_DBL(+, 1, 0018024036051, +, 0), HEX_DBL(-, 1, 150c558601261, -, 11), + HEX_DBL(+, 1, 285bb90327a0f, -, 64) }, + { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) }, + { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) }, + { HEX_DBL(+, 1, ffa011fca0a1e, -, 1), HEX_DBL(+, 1, 14e5640c4197b, -, 10), + HEX_DBL(+, 1, 95728136ae401, -, 63) }, + { HEX_DBL(+, 1, ff6031f064e07, -, 1), HEX_DBL(+, 1, cd61806bf532d, -, 10), + HEX_DBL(+, 1, 568a4f35d8538, -, 63) }, + { HEX_DBL(+, 1, ff2061d532b9c, -, 1), HEX_DBL(+, 1, 42e34af550eda, -, 9), + HEX_DBL(+, 1, 8f69cee55fec, -, 62) }, + { HEX_DBL(+, 1, fee0a1a513253, -, 1), HEX_DBL(+, 1, 9f0a5523902ea, -, 9), + HEX_DBL(+, 1, daec734b11615, -, 63) }, + { HEX_DBL(+, 1, fea0f15a12139, -, 1), HEX_DBL(+, 1, fb25e19f11b26, -, 9), + HEX_DBL(+, 1, 8bafca62941da, -, 62) }, + { HEX_DBL(+, 1, fe6150ee3e6d4, -, 1), HEX_DBL(+, 1, 2b9af9a28e282, -, 8), + HEX_DBL(+, 1, 0fd3674e1dc5b, -, 61) }, + { HEX_DBL(+, 1, fe21c05baa109, -, 1), HEX_DBL(+, 1, 599d4678f24b9, -, 8), + HEX_DBL(+, 1, dafce1f09937b, -, 61) }, + { HEX_DBL(+, 1, fde23f9c69cf9, -, 1), HEX_DBL(+, 1, 8799d8c046eb, -, 8), + HEX_DBL(+, 1, ffa0ce0bdd217, -, 65) }, + { HEX_DBL(+, 1, fda2ceaa956e8, -, 1), HEX_DBL(+, 1, b590b1e5951ee, -, 8), + HEX_DBL(+, 1, 645a769232446, -, 62) }, + { HEX_DBL(+, 1, fd636d8047a1f, -, 1), HEX_DBL(+, 1, e381d3555dbcf, -, 8), + HEX_DBL(+, 1, 882320d368331, -, 61) }, + { HEX_DBL(+, 1, fd241c179e0cc, -, 1), HEX_DBL(+, 1, 08b69f3dccde, -, 7), + HEX_DBL(+, 1, 01ad5065aba9e, -, 61) }, + { HEX_DBL(+, 1, fce4da6ab93e8, -, 1), HEX_DBL(+, 1, 1fa97a61dd298, -, 7), + HEX_DBL(+, 1, 84cd1f931ae34, -, 60) }, + { HEX_DBL(+, 1, fca5a873bcb19, -, 1), HEX_DBL(+, 1, 36997bcc54a3f, -, 7), + HEX_DBL(+, 1, 1485e97eaee03, -, 60) }, + { HEX_DBL(+, 1, fc66862ccec93, -, 1), HEX_DBL(+, 1, 4d86a43264a4f, -, 7), + HEX_DBL(+, 1, 
c75e63370988b, -, 61) }, + { HEX_DBL(+, 1, fc27739018cfe, -, 1), HEX_DBL(+, 1, 6470f448fb09d, -, 7), + HEX_DBL(+, 1, d7361eeaed0a1, -, 65) }, + { HEX_DBL(+, 1, fbe87097c6f5a, -, 1), HEX_DBL(+, 1, 7b586cc4c2523, -, 7), + HEX_DBL(+, 1, b3df952cc473c, -, 61) }, + { HEX_DBL(+, 1, fba97d3e084dd, -, 1), HEX_DBL(+, 1, 923d0e5a21e06, -, 7), + HEX_DBL(+, 1, cf56c7b64ae5d, -, 62) }, + { HEX_DBL(+, 1, fb6a997d0ecdc, -, 1), HEX_DBL(+, 1, a91ed9bd3df9a, -, 7), + HEX_DBL(+, 1, b957bdcd89e43, -, 61) }, + { HEX_DBL(+, 1, fb2bc54f0f4ab, -, 1), HEX_DBL(+, 1, bffdcfa1f7fbb, -, 7), + HEX_DBL(+, 1, ea8cad9a21771, -, 62) }, + { HEX_DBL(+, 1, faed00ae41783, -, 1), HEX_DBL(+, 1, d6d9f0bbee6f6, -, 7), + HEX_DBL(+, 1, 5762a9af89c82, -, 60) }, + { HEX_DBL(+, 1, faae4b94dfe64, -, 1), HEX_DBL(+, 1, edb33dbe7d335, -, 7), + HEX_DBL(+, 1, 21e24fc245697, -, 62) }, + { HEX_DBL(+, 1, fa6fa5fd27ff8, -, 1), HEX_DBL(+, 1, 0244dbae5ed05, -, 6), + HEX_DBL(+, 1, 12ef51b967102, -, 60) }, + { HEX_DBL(+, 1, fa310fe15a078, -, 1), HEX_DBL(+, 1, 0daeaf24c3529, -, 6), + HEX_DBL(+, 1, 10d3cfca60b45, -, 59) }, + { HEX_DBL(+, 1, f9f2893bb9192, -, 1), HEX_DBL(+, 1, 1917199bb66bc, -, 6), + HEX_DBL(+, 1, 6cf6034c32e19, -, 60) }, + { HEX_DBL(+, 1, f9b412068b247, -, 1), HEX_DBL(+, 1, 247e1b6c615d5, -, 6), + HEX_DBL(+, 1, 42f0fffa229f7, -, 61) }, + { HEX_DBL(+, 1, f975aa3c18ed6, -, 1), HEX_DBL(+, 1, 2fe3b4efcc5ad, -, 6), + HEX_DBL(+, 1, 70106136a8919, -, 60) }, + { HEX_DBL(+, 1, f93751d6ae09b, -, 1), HEX_DBL(+, 1, 3b47e67edea93, -, 6), + HEX_DBL(+, 1, 38dd5a4f6959a, -, 59) }, + { HEX_DBL(+, 1, f8f908d098df6, -, 1), HEX_DBL(+, 1, 46aab0725ea6c, -, 6), + HEX_DBL(+, 1, 821fc1e799e01, -, 60) }, + { HEX_DBL(+, 1, f8bacf242aa2c, -, 1), HEX_DBL(+, 1, 520c1322f1e4e, -, 6), + HEX_DBL(+, 1, 129dcda3ad563, -, 60) }, + { HEX_DBL(+, 1, f87ca4cbb755, -, 1), HEX_DBL(+, 1, 5d6c0ee91d2ab, -, 6), + HEX_DBL(+, 1, c5b190c04606e, -, 62) }, + { HEX_DBL(+, 1, f83e89c195c25, -, 1), HEX_DBL(+, 1, 68caa41d448c3, -, 6), + HEX_DBL(+, 1, 
4723441195ac9, -, 59) } }; static double __loglTable3[8][3] = { -{HEX_DBL( +, 1, 000e00c40ab89, +, 0 ), HEX_DBL( -, 1, 4332be0032168, -, 12 ), HEX_DBL( +, 1, a1003588d217a, -, 65 )}, -{HEX_DBL( +, 1, 000a006403e82, +, 0 ), HEX_DBL( -, 1, cdb2987366fcc, -, 13 ), HEX_DBL( +, 1, 5c86001294bbc, -, 67 )}, -{HEX_DBL( +, 1, 0006002400d8, +, 0 ), HEX_DBL( -, 1, 150297c90fa6f, -, 13 ), HEX_DBL( +, 1, 01fb4865fae32, -, 66 )}, -{HEX_DBL( +, 1, 0, +, 0 ), HEX_DBL( +, 0, 0, +, 0 ), HEX_DBL( +, 0, 0, +, 0 )}, -{HEX_DBL( +, 1, 0, +, 0 ), HEX_DBL( +, 0, 0, +, 0 ), HEX_DBL( +, 0, 0, +, 0 )}, -{HEX_DBL( +, 1, ffe8011ff280a, -, 1 ), HEX_DBL( +, 1, 14f8daf5e3d3b, -, 12 ), HEX_DBL( +, 1, 3c933b4b6b914, -, 68 )}, -{HEX_DBL( +, 1, ffd8031fc184e, -, 1 ), HEX_DBL( +, 1, cd978c38042bb, -, 12 ), HEX_DBL( +, 1, 10f8e642e66fd, -, 65 )}, -{HEX_DBL( +, 1, ffc8061f5492b, -, 1 ), HEX_DBL( +, 1, 43183c878274e, -, 11 ), HEX_DBL( +, 1, 5885dd1eb6582, -, 65 )} + { HEX_DBL(+, 1, 000e00c40ab89, +, 0), HEX_DBL(-, 1, 4332be0032168, -, 12), + HEX_DBL(+, 1, a1003588d217a, -, 65) }, + { HEX_DBL(+, 1, 000a006403e82, +, 0), HEX_DBL(-, 1, cdb2987366fcc, -, 13), + HEX_DBL(+, 1, 5c86001294bbc, -, 67) }, + { HEX_DBL(+, 1, 0006002400d8, +, 0), HEX_DBL(-, 1, 150297c90fa6f, -, 13), + HEX_DBL(+, 1, 01fb4865fae32, -, 66) }, + { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) }, + { HEX_DBL(+, 1, 0, +, 0), HEX_DBL(+, 0, 0, +, 0), HEX_DBL(+, 0, 0, +, 0) }, + { HEX_DBL(+, 1, ffe8011ff280a, -, 1), HEX_DBL(+, 1, 14f8daf5e3d3b, -, 12), + HEX_DBL(+, 1, 3c933b4b6b914, -, 68) }, + { HEX_DBL(+, 1, ffd8031fc184e, -, 1), HEX_DBL(+, 1, cd978c38042bb, -, 12), + HEX_DBL(+, 1, 10f8e642e66fd, -, 65) }, + { HEX_DBL(+, 1, ffc8061f5492b, -, 1), HEX_DBL(+, 1, 43183c878274e, -, 11), + HEX_DBL(+, 1, 5885dd1eb6582, -, 65) } }; static void __log2_ep(double *hi, double *lo, double x) { - union { uint64_t i; double d; } uu; + union { + uint64_t i; + double d; + } uu; int m; double f = reference_frexp(x, &m); // bring f in 
[0.75, 1.5) - if( f < 0.75 ) { + if (f < 0.75) + { f *= 2.0; m -= 1; } // index first table .... brings down to [1-2^-7, 1+2^6) uu.d = f; - int index = (int) (((uu.i + ((uint64_t) 1 << 51)) & 0x000fc00000000000ULL) >> 46); + int index = + (int)(((uu.i + ((uint64_t)1 << 51)) & 0x000fc00000000000ULL) >> 46); double r1 = __loglTable1[index][0]; double logr1hi = __loglTable1[index][1]; double logr1lo = __loglTable1[index][2]; - // since log1rhi has 39 bits of precision, we have 14 bit in hand ... since |m| <= 1023 - // which needs 10bits at max, we can directly add m to log1hi without spilling + // since log1rhi has 39 bits of precision, we have 14 bit in hand ... since + // |m| <= 1023 which needs 10bits at max, we can directly add m to log1hi + // without spilling logr1hi += m; - // argument reduction needs to be in double-double since reduced argument will form the - // leading term of polynomial approximation which sets the precision we eventually achieve + // argument reduction needs to be in double-double since reduced argument + // will form the leading term of polynomial approximation which sets the + // precision we eventually achieve double zhi, zlo; MulD(&zhi, &zlo, r1, uu.d); // second index table .... brings down to [1-2^-12, 1+2^-11) uu.d = zhi; - index = (int) (((uu.i + ((uint64_t) 1 << 46)) & 0x00007e0000000000ULL) >> 41); + index = (int)(((uu.i + ((uint64_t)1 << 46)) & 0x00007e0000000000ULL) >> 41); double r2 = __loglTable2[index][0]; double logr2hi = __loglTable2[index][1]; double logr2lo = __loglTable2[index][2]; @@ -4186,11 +4754,12 @@ static void __log2_ep(double *hi, double *lo, double x) // Actually reduction to 2^-11 would have been sufficient to calculate // second order term in polynomial in double rather than double-double, I // reduced it a bit more to make sure other systematic arithmetic errors - // are guarded against .... also this allow lower order product of leading polynomial - // term i.e. 
Ao_hi*z_lo + Ao_lo*z_hi to be done in double rather than double-double ... - // hence only term that needs to be done in double-double is Ao_hi*z_hi + // are guarded against .... also this allow lower order product of leading + // polynomial term i.e. Ao_hi*z_lo + Ao_lo*z_hi to be done in double rather + // than double-double ... hence only term that needs to be done in + // double-double is Ao_hi*z_hi uu.d = zhi; - index = (int) (((uu.i + ((uint64_t) 1 << 41)) & 0x0000038000000000ULL) >> 39); + index = (int)(((uu.i + ((uint64_t)1 << 41)) & 0x0000038000000000ULL) >> 39); double r3 = __loglTable3[index][0]; double logr3hi = __loglTable3[index][1]; double logr3lo = __loglTable3[index][2]; @@ -4202,34 +4771,36 @@ static void __log2_ep(double *hi, double *lo, double x) AddDD(&log2hi, &log2lo, logr1hi, logr1lo, logr2hi, logr2lo); AddDD(&log2hi, &log2lo, logr3hi, logr3lo, log2hi, log2lo); - // final argument reduction .... zhi will be in [1-2^-14, 1+2^-13) after this + // final argument reduction .... zhi will be in [1-2^-14, 1+2^-13) after + // this MulDD(&zhi, &zlo, zhi, zlo, r3, 0.0); - // we dont need to do full double-double substract here. substracting 1.0 for higher - // term is exact + // we dont need to do full double-double substract here. substracting 1.0 + // for higher term is exact zhi = zhi - 1.0; // normalize AddD(&zhi, &zlo, zhi, zlo); // polynomail fitting to compute log2(1 + z) ... 
forth order polynomial fit - // to log2(1 + z)/z gives minimax absolute error of O(2^-76) with z in [-2^-14, 2^-13] - // log2(1 + z)/z = Ao + A1*z + A2*z^2 + A3*z^3 + A4*z^4 + // to log2(1 + z)/z gives minimax absolute error of O(2^-76) with z in + // [-2^-14, 2^-13] log2(1 + z)/z = Ao + A1*z + A2*z^2 + A3*z^3 + A4*z^4 // => log2(1 + z) = Ao*z + A1*z^2 + A2*z^3 + A3*z^4 + A4*z^5 - // => log2(1 + z) = (Aohi + Aolo)*(zhi + zlo) + z^2*(A1 + A2*z + A3*z^2 + A4*z^3) - // since we are looking for at least 64 digits of precision and z in [-2^-14, 2^-13], final term - // can be done in double .... also Aolo*zhi + Aohi*zlo can be done in double .... - // Aohi*zhi needs to be done in double-double + // => log2(1 + z) = (Aohi + Aolo)*(zhi + zlo) + z^2*(A1 + A2*z + A3*z^2 + + // A4*z^3) since we are looking for at least 64 digits of precision and z in + // [-2^-14, 2^-13], final term can be done in double .... also Aolo*zhi + + // Aohi*zlo can be done in double .... Aohi*zhi needs to be done in + // double-double - double Aohi = HEX_DBL( +, 1, 71547652b82fe, +, 0 ); - double Aolo = HEX_DBL( +, 1, 777c9cbb675c, -, 56 ); + double Aohi = HEX_DBL(+, 1, 71547652b82fe, +, 0); + double Aolo = HEX_DBL(+, 1, 777c9cbb675c, -, 56); double y; - y = HEX_DBL( +, 1, 276d2736fade7, -, 2 ); - y = HEX_DBL( -, 1, 7154765782df1, -, 2 ) + y*zhi; - y = HEX_DBL( +, 1, ec709dc3a0f67, -, 2 ) + y*zhi; - y = HEX_DBL( -, 1, 71547652b82fe, -, 1 ) + y*zhi; - double zhisq = zhi*zhi; - y = y*zhisq; - y = y + zhi*Aolo; - y = y + zlo*Aohi; + y = HEX_DBL(+, 1, 276d2736fade7, -, 2); + y = HEX_DBL(-, 1, 7154765782df1, -, 2) + y * zhi; + y = HEX_DBL(+, 1, ec709dc3a0f67, -, 2) + y * zhi; + y = HEX_DBL(-, 1, 71547652b82fe, -, 1) + y * zhi; + double zhisq = zhi * zhi; + y = y * zhisq; + y = y + zhi * Aolo; + y = y + zlo * Aohi; MulD(&zhi, &zlo, Aohi, zhi); AddDD(&zhi, &zlo, zhi, zlo, y, 0.0); @@ -4239,7 +4810,7 @@ static void __log2_ep(double *hi, double *lo, double x) *lo = zlo; } -long double reference_powl( long 
double x, long double y ) +long double reference_powl(long double x, long double y) { @@ -4256,174 +4827,166 @@ long double reference_powl( long double x, long double y ) // causes errors. So we need to tread y as long double and convert it // to hi, lo doubles when performing y*log2(x). -// double x = (double) xx; -// double y = (double) yy; + // double x = (double) xx; + // double y = (double) yy; - static const double neg_epsilon = HEX_DBL( +, 1, 0, +, 53 ); + static const double neg_epsilon = HEX_DBL(+, 1, 0, +, 53); - //if x = 1, return x for any y, even NaN - if( x == 1.0 ) - return x; + // if x = 1, return x for any y, even NaN + if (x == 1.0) return x; - //if y == 0, return 1 for any x, even NaN - if( y == 0.0 ) - return 1.0L; + // if y == 0, return 1 for any x, even NaN + if (y == 0.0) return 1.0L; - //get NaNs out of the way - if( x != x || y != y ) - return x + y; + // get NaNs out of the way + if (x != x || y != y) return x + y; - //do the work required to sort out edge cases - double fabsy = reference_fabs( y ); - double fabsx = reference_fabs( x ); - double iy = reference_rint( fabsy ); //we do round to nearest here so that |fy| <= 0.5 - if( iy > fabsy )//convert nearbyint to floor + // do the work required to sort out edge cases + double fabsy = reference_fabs(y); + double fabsx = reference_fabs(x); + double iy = reference_rint( + fabsy); // we do round to nearest here so that |fy| <= 0.5 + if (iy > fabsy) // convert nearbyint to floor iy -= 1.0; int isOddInt = 0; - if( fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon ) - isOddInt = (int) (iy - 2.0 * rint( 0.5 * iy )); //might be 0, -1, or 1 + if (fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon) + isOddInt = (int)(iy - 2.0 * rint(0.5 * iy)); // might be 0, -1, or 1 - ///test a few more edge cases - //deal with x == 0 cases - if( x == 0.0 ) + /// test a few more edge cases + // deal with x == 0 cases + if (x == 0.0) { - if( ! 
isOddInt ) - x = 0.0; + if (!isOddInt) x = 0.0; - if( y < 0 ) - x = 1.0/ x; + if (y < 0) x = 1.0 / x; return x; } - //x == +-Inf cases - if( isinf(fabsx) ) + // x == +-Inf cases + if (isinf(fabsx)) { - if( x < 0 ) + if (x < 0) { - if( isOddInt ) + if (isOddInt) { - if( y < 0 ) + if (y < 0) return -0.0; else return -INFINITY; } else { - if( y < 0 ) + if (y < 0) return 0.0; else return INFINITY; } } - if( y < 0 ) - return 0; + if (y < 0) return 0; return INFINITY; } - //y = +-inf cases - if( isinf(fabsy) ) + // y = +-inf cases + if (isinf(fabsy)) { - if( x == -1 ) - return 1; + if (x == -1) return 1; - if( y < 0 ) + if (y < 0) { - if( fabsx < 1 ) - return INFINITY; + if (fabsx < 1) return INFINITY; return 0; } - if( fabsx < 1 ) - return 0; + if (fabsx < 1) return 0; return INFINITY; } // x < 0 and y non integer case - if( x < 0 && iy != fabsy ) + if (x < 0 && iy != fabsy) { - //return nan; + // return nan; return cl_make_nan(); } - //speedy resolution of sqrt and reciprocal sqrt - if( fabsy == 0.5 ) + // speedy resolution of sqrt and reciprocal sqrt + if (fabsy == 0.5) { - long double xl = sqrtl( x ); - if( y < 0 ) - xl = 1.0/ xl; + long double xl = sqrtl(x); + if (y < 0) xl = 1.0 / xl; return xl; } double log2x_hi, log2x_lo; - // extended precision log .... accurate to at least 64-bits + couple of guard bits + // extended precision log .... 
accurate to at least 64-bits + couple of + // guard bits __log2_ep(&log2x_hi, &log2x_lo, fabsx); double ylog2x_hi, ylog2x_lo; - double y_hi = (double) y; - double y_lo = (double) ( y - (long double) y_hi); + double y_hi = (double)y; + double y_lo = (double)(y - (long double)y_hi); // compute product of y*log2(x) // scale to avoid overflow in double-double multiplication - if( reference_fabs( y ) > HEX_DBL( +, 1, 0, +, 970 ) ) { + if (reference_fabs(y) > HEX_DBL(+, 1, 0, +, 970)) + { y_hi = reference_ldexp(y_hi, -53); y_lo = reference_ldexp(y_lo, -53); } MulDD(&ylog2x_hi, &ylog2x_lo, log2x_hi, log2x_lo, y_hi, y_lo); - if( fabs( y ) > HEX_DBL( +, 1, 0, +, 970 ) ) { + if (fabs(y) > HEX_DBL(+, 1, 0, +, 970)) + { ylog2x_hi = reference_ldexp(ylog2x_hi, 53); ylog2x_lo = reference_ldexp(ylog2x_lo, 53); } long double powxy; - if(isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200)) { - powxy = reference_signbit(ylog2x_hi) ? HEX_DBL( +, 0, 0, +, 0 ) : INFINITY; - } else { + if (isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200)) + { + powxy = + reference_signbit(ylog2x_hi) ? 
HEX_DBL(+, 0, 0, +, 0) : INFINITY; + } + else + { // separate integer + fractional part long int m = lrint(ylog2x_hi); AddDD(&ylog2x_hi, &ylog2x_lo, ylog2x_hi, ylog2x_lo, -m, 0.0); // revert to long double arithemtic - long double ylog2x = (long double) ylog2x_hi + (long double) ylog2x_lo; - long double tmp = reference_exp2l( ylog2x ); + long double ylog2x = (long double)ylog2x_hi + (long double)ylog2x_lo; + long double tmp = reference_exp2l(ylog2x); powxy = reference_scalblnl(tmp, m); } // if y is odd integer and x is negative, reverse sign - if( isOddInt & reference_signbit(x)) - powxy = -powxy; + if (isOddInt & reference_signbit(x)) powxy = -powxy; return powxy; } double reference_nextafter(double xx, double yy) { - float x = (float) xx; - float y = (float) yy; + float x = (float)xx; + float y = (float)yy; // take care of nans - if( x != x ) - return x; + if (x != x) return x; - if( y != y ) - return y; + if (y != y) return y; - if( x == y ) - return y; + if (x == y) return y; int32f_t a, b; - a.f = x; - b.f = y; + a.f = x; + b.f = y; - if( a.i & 0x80000000 ) - a.i = 0x80000000 - a.i; - if(b.i & 0x80000000 ) - b.i = 0x80000000 - b.i; + if (a.i & 0x80000000) a.i = 0x80000000 - a.i; + if (b.i & 0x80000000) b.i = 0x80000000 - b.i; a.i += (a.i < b.i) ? 1 : -1; - a.i = (a.i < 0) ? (cl_int) 0x80000000 - a.i : a.i; + a.i = (a.i < 0) ? 
(cl_int)0x80000000 - a.i : a.i; return a.f; } @@ -4431,33 +4994,28 @@ double reference_nextafter(double xx, double yy) long double reference_nextafterl(long double xx, long double yy) { - double x = (double) xx; - double y = (double) yy; + double x = (double)xx; + double y = (double)yy; // take care of nans - if( x != x ) - return x; + if (x != x) return x; - if( y != y ) - return y; + if (y != y) return y; int64d_t a, b; - a.d = x; - b.d = y; + a.d = x; + b.d = y; int64_t tmp = 0x8000000000000000LL; - if( a.l & tmp ) - a.l = tmp - a.l; - if(b.l & tmp ) - b.l = tmp - b.l; + if (a.l & tmp) a.l = tmp - a.l; + if (b.l & tmp) b.l = tmp - b.l; - // edge case. if (x == y) or (x = 0.0f and y = -0.0f) or (x = -0.0f and y = 0.0f) - // test needs to be done using integer rep because - // subnormals may be flushed to zero on some platforms - if( a.l == b.l ) - return y; + // edge case. if (x == y) or (x = 0.0f and y = -0.0f) or (x = -0.0f and y = + // 0.0f) test needs to be done using integer rep because subnormals may be + // flushed to zero on some platforms + if (a.l == b.l) return y; a.l += (a.l < b.l) ? 1 : -1; a.l = (a.l < 0) ? tmp - a.l : a.l; @@ -4467,112 +5025,110 @@ long double reference_nextafterl(long double xx, long double yy) double reference_fdim(double xx, double yy) { - float x = (float) xx; - float y = (float) yy; + float x = (float)xx; + float y = (float)yy; - if( x != x ) - return x; + if (x != x) return x; - if( y != y ) - return y; + if (y != y) return y; - float r = ( x > y ) ? (float) reference_subtract( x, y) : 0.0f; + float r = (x > y) ? (float)reference_subtract(x, y) : 0.0f; return r; - } long double reference_fdiml(long double xx, long double yy) { - double x = (double) xx; - double y = (double) yy; + double x = (double)xx; + double y = (double)yy; - if( x != x ) - return x; + if (x != x) return x; - if( y != y ) - return y; + if (y != y) return y; - double r = ( x > y ) ? (double) reference_subtractl(x, y) : 0.0; + double r = (x > y) ? 
(double)reference_subtractl(x, y) : 0.0; return r; } double reference_remquo(double xd, double yd, int *n) { - float xx = (float) xd; - float yy = (float) yd; + float xx = (float)xd; + float yy = (float)yd; - if( isnan(xx) || isnan(yy) || - fabsf(xx) == INFINITY || - yy == 0.0 ) + if (isnan(xx) || isnan(yy) || fabsf(xx) == INFINITY || yy == 0.0) { *n = 0; return cl_make_nan(); } - if( fabsf(yy) == INFINITY || xx == 0.0f ) { + if (fabsf(yy) == INFINITY || xx == 0.0f) + { *n = 0; return xd; } - if( fabsf(xx) == fabsf(yy) ) { + if (fabsf(xx) == fabsf(yy)) + { *n = (xx == yy) ? 1 : -1; - return reference_signbit( xx ) ? -0.0 : 0.0; + return reference_signbit(xx) ? -0.0 : 0.0; } - int signx = reference_signbit( xx ) ? -1 : 1; - int signy = reference_signbit( yy ) ? -1 : 1; + int signx = reference_signbit(xx) ? -1 : 1; + int signy = reference_signbit(yy) ? -1 : 1; int signn = (signx == signy) ? 1 : -1; float x = fabsf(xx); float y = fabsf(yy); int ex, ey; - ex = reference_ilogb( x ); - ey = reference_ilogb( y ); + ex = reference_ilogb(x); + ey = reference_ilogb(y); float xr = x; float yr = y; uint32_t q = 0; - if(ex-ey >= -1) { + if (ex - ey >= -1) + { - yr = (float) reference_ldexp( y, -ey ); - xr = (float) reference_ldexp( x, -ex ); + yr = (float)reference_ldexp(y, -ey); + xr = (float)reference_ldexp(x, -ex); - if(ex-ey >= 0) { + if (ex - ey >= 0) + { int i; - for(i = ex-ey; i > 0; i--) { + for (i = ex - ey; i > 0; i--) + { q <<= 1; - if(xr >= yr) { + if (xr >= yr) + { xr -= yr; q += 1; } xr += xr; } q <<= 1; - if( xr > yr ) { + if (xr > yr) + { xr -= yr; q += 1; } } - else //ex-ey = -1 - xr = reference_ldexp(xr, ex-ey); + else // ex-ey = -1 + xr = reference_ldexp(xr, ex - ey); } - if( (yr < 2.0f*xr) || ( (yr == 2.0f*xr) && (q & 0x00000001) ) ) { + if ((yr < 2.0f * xr) || ((yr == 2.0f * xr) && (q & 0x00000001))) + { xr -= yr; q += 1; } - if(ex-ey >= -1) - xr = reference_ldexp(xr, ey); + if (ex - ey >= -1) xr = reference_ldexp(xr, ey); int qout = q & 0x0000007f; - if( 
signn < 0) - qout = -qout; - if( xx < 0.0 ) - xr = -xr; + if (signn < 0) qout = -qout; + if (xx < 0.0) xr = -xr; *n = qout; @@ -4582,79 +5138,82 @@ double reference_remquo(double xd, double yd, int *n) long double reference_remquol(long double xd, long double yd, int *n) { - double xx = (double) xd; - double yy = (double) yd; + double xx = (double)xd; + double yy = (double)yd; - if( isnan(xx) || isnan(yy) || - fabs(xx) == INFINITY || - yy == 0.0 ) + if (isnan(xx) || isnan(yy) || fabs(xx) == INFINITY || yy == 0.0) { *n = 0; return cl_make_nan(); } - if( reference_fabs(yy) == INFINITY || xx == 0.0 ) { + if (reference_fabs(yy) == INFINITY || xx == 0.0) + { *n = 0; return xd; } - if( reference_fabs(xx) == reference_fabs(yy) ) { + if (reference_fabs(xx) == reference_fabs(yy)) + { *n = (xx == yy) ? 1 : -1; - return reference_signbit( xx ) ? -0.0 : 0.0; + return reference_signbit(xx) ? -0.0 : 0.0; } - int signx = reference_signbit( xx ) ? -1 : 1; - int signy = reference_signbit( yy ) ? -1 : 1; + int signx = reference_signbit(xx) ? -1 : 1; + int signy = reference_signbit(yy) ? -1 : 1; int signn = (signx == signy) ? 
1 : -1; double x = reference_fabs(xx); double y = reference_fabs(yy); int ex, ey; - ex = reference_ilogbl( x ); - ey = reference_ilogbl( y ); + ex = reference_ilogbl(x); + ey = reference_ilogbl(y); double xr = x; double yr = y; uint32_t q = 0; - if(ex-ey >= -1) { + if (ex - ey >= -1) + { - yr = reference_ldexp( y, -ey ); - xr = reference_ldexp( x, -ex ); + yr = reference_ldexp(y, -ey); + xr = reference_ldexp(x, -ex); int i; - if(ex-ey >= 0) { + if (ex - ey >= 0) + { - for(i = ex-ey; i > 0; i--) { + for (i = ex - ey; i > 0; i--) + { q <<= 1; - if(xr >= yr) { + if (xr >= yr) + { xr -= yr; q += 1; } xr += xr; } q <<= 1; - if( xr > yr ) { + if (xr > yr) + { xr -= yr; q += 1; } } else - xr = reference_ldexp(xr, ex-ey); + xr = reference_ldexp(xr, ex - ey); } - if( (yr < 2.0*xr) || ( (yr == 2.0*xr) && (q & 0x00000001) ) ) { + if ((yr < 2.0 * xr) || ((yr == 2.0 * xr) && (q & 0x00000001))) + { xr -= yr; q += 1; } - if(ex-ey >= -1) - xr = reference_ldexp(xr, ey); + if (ex - ey >= -1) xr = reference_ldexp(xr, ey); int qout = q & 0x0000007f; - if( signn < 0) - qout = -qout; - if( xx < 0.0 ) - xr = -xr; + if (signn < 0) qout = -qout; + if (xx < 0.0) xr = -xr; *n = qout; return xr; @@ -4662,27 +5221,27 @@ long double reference_remquol(long double xd, long double yd, int *n) static double reference_scalbn(double x, int n) { - if(reference_isinf(x) || reference_isnan(x) || x == 0.0) - return x; + if (reference_isinf(x) || reference_isnan(x) || x == 0.0) return x; int bias = 1023; - union { double d; cl_long l; } u; - u.d = (double) x; + union { + double d; + cl_long l; + } u; + u.d = (double)x; int e = (int)((u.l & 0x7ff0000000000000LL) >> 52); - if(e == 0) + if (e == 0) { u.l |= ((cl_long)1023 << 52); u.d -= 1.0; e = (int)((u.l & 0x7ff0000000000000LL) >> 52) - 1022; } e += n; - if(e >= 2047 || n >= 2098 ) - return reference_copysign(INFINITY, x); - if(e < -51 || n <-2097 ) - return reference_copysign(0.0, x); - if(e <= 0) + if (e >= 2047 || n >= 2098) return 
reference_copysign(INFINITY, x); + if (e < -51 || n < -2097) return reference_copysign(0.0, x); + if (e <= 0) { - bias += (e-1); + bias += (e - 1); e = 1; } u.l &= 0x800fffffffffffffLL; @@ -4695,26 +5254,26 @@ static double reference_scalbn(double x, int n) static long double reference_scalblnl(long double x, long n) { #if defined(__i386__) || defined(__x86_64__) // INTEL - union - { + union { long double d; - struct{ cl_ulong m; cl_ushort sexp;}u; - }u; + struct + { + cl_ulong m; + cl_ushort sexp; + } u; + } u; u.u.m = CL_LONG_MIN; - if ( reference_isinf(x) ) - return x; + if (reference_isinf(x)) return x; - if( x == 0.0L || n < -2200) - return reference_copysignl( 0.0L, x ); + if (x == 0.0L || n < -2200) return reference_copysignl(0.0L, x); - if( n > 2200 ) - return reference_copysignl( INFINITY, x ); + if (n > 2200) return reference_copysignl(INFINITY, x); - if( n < 0 ) + if (n < 0) { u.u.sexp = 0x3fff - 1022; - while( n <= -1022 ) + while (n <= -1022) { x *= u.d; n += 1022; @@ -4724,10 +5283,10 @@ static long double reference_scalblnl(long double x, long n) return x; } - if( n > 0 ) + if (n > 0) { u.u.sexp = 0x3fff + 1023; - while( n >= 1023 ) + while (n >= 1023) { x *= u.d; n -= 1023; @@ -4742,27 +5301,27 @@ static long double reference_scalblnl(long double x, long n) #elif defined(__arm__) // ARM .. 
sizeof(long double) == sizeof(double) #if __DBL_MAX_EXP__ >= __LDBL_MAX_EXP__ - if(reference_isinfl(x) || reference_isnanl(x)) - return x; + if (reference_isinfl(x) || reference_isnanl(x)) return x; int bias = 1023; - union { double d; cl_long l; } u; - u.d = (double) x; + union { + double d; + cl_long l; + } u; + u.d = (double)x; int e = (int)((u.l & 0x7ff0000000000000LL) >> 52); - if(e == 0) + if (e == 0) { u.l |= ((cl_long)1023 << 52); u.d -= 1.0; e = (int)((u.l & 0x7ff0000000000000LL) >> 52) - 1022; } e += n; - if(e >= 2047) - return reference_copysignl(INFINITY, x); - if(e < -51) - return reference_copysignl(0.0, x); - if(e <= 0) + if (e >= 2047) return reference_copysignl(INFINITY, x); + if (e < -51) return reference_copysignl(0.0, x); + if (e <= 0) { - bias += (e-1); + bias += (e - 1); e = 1; } u.l &= 0x800fffffffffffffLL; @@ -4772,284 +5331,259 @@ static long double reference_scalblnl(long double x, long n) return x * u.d; #endif -#else // PPC +#else // PPC return scalblnl(x, n); #endif } -double reference_relaxed_exp( double x ) -{ - return reference_exp(x); -} +double reference_relaxed_exp(double x) { return reference_exp(x); } double reference_exp(double x) { - return reference_exp2( x * HEX_DBL( +, 1, 71547652b82fe, +, 0 ) ); + return reference_exp2(x * HEX_DBL(+, 1, 71547652b82fe, +, 0)); } long double reference_expl(long double x) { #if defined(__PPC__) - long double scale, bias; + long double scale, bias; - // The PPC double long version of expl fails to produce denorm results - // and instead generates a 0.0. 
Compensate for this limitation by - // computing expl as: - // expl(x + 40) * expl(-40) - // Likewise, overflows can prematurely produce an infinity, so we - // compute expl as: - // expl(x - 40) * expl(40) - scale = 1.0L; - bias = 0.0L; - if (x < -708.0L) { - bias = 40.0; - scale = expl(-40.0L); - } else if (x > 708.0L) { - bias = -40.0L; - scale = expl(40.0L); - } - return expl(x + bias) * scale; + // The PPC double long version of expl fails to produce denorm results + // and instead generates a 0.0. Compensate for this limitation by + // computing expl as: + // expl(x + 40) * expl(-40) + // Likewise, overflows can prematurely produce an infinity, so we + // compute expl as: + // expl(x - 40) * expl(40) + scale = 1.0L; + bias = 0.0L; + if (x < -708.0L) + { + bias = 40.0; + scale = expl(-40.0L); + } + else if (x > 708.0L) + { + bias = -40.0L; + scale = expl(40.0L); + } + return expl(x + bias) * scale; #else - return expl( x ); + return expl(x); #endif } -double reference_sinh(double x) -{ - return sinh(x); -} +double reference_sinh(double x) { return sinh(x); } -long double reference_sinhl(long double x) -{ - return sinhl(x); -} +long double reference_sinhl(long double x) { return sinhl(x); } double reference_fmod(double x, double y) { - if( x == 0.0 && fabs(y) > 0.0 ) - return x; + if (x == 0.0 && fabs(y) > 0.0) return x; - if( fabs(x) == INFINITY || y == 0 ) - return cl_make_nan(); + if (fabs(x) == INFINITY || y == 0) return cl_make_nan(); - if( fabs(y) == INFINITY ) // we know x is finite from above + if (fabs(y) == INFINITY) // we know x is finite from above return x; #if defined(_MSC_VER) && defined(_M_X64) - return fmod( x, y ); + return fmod(x, y); #else - return fmodf( (float) x, (float) y ); + return fmodf((float)x, (float)y); #endif } long double reference_fmodl(long double x, long double y) { - if( x == 0.0L && fabsl(y) > 0.0L ) + if (x == 0.0L && fabsl(y) > 0.0L) return x; + + if (fabsl(x) == INFINITY || y == 0.0L) return cl_make_nan(); + + if 
(fabsl(y) == INFINITY) // we know x is finite from above return x; - if( fabsl(x) == INFINITY || y == 0.0L ) - return cl_make_nan(); - - if( fabsl(y) == INFINITY ) // we know x is finite from above - return x; - - return fmod( (double) x, (double) y ); + return fmod((double)x, (double)y); } double reference_modf(double x, double *n) { - if(isnan(x)) { + if (isnan(x)) + { *n = cl_make_nan(); return cl_make_nan(); } float nr; - float yr = modff((float) x, &nr); + float yr = modff((float)x, &nr); *n = nr; return yr; } long double reference_modfl(long double x, long double *n) { - if(isnan(x)) { + if (isnan(x)) + { *n = cl_make_nan(); return cl_make_nan(); } double nr; - double yr = modf((double) x, &nr); + double yr = modf((double)x, &nr); *n = nr; return yr; } -long double reference_fractl(long double x, long double *ip ) +long double reference_fractl(long double x, long double *ip) { - if(isnan(x)) { + if (isnan(x)) + { *ip = cl_make_nan(); return cl_make_nan(); } double i; - double f = modf((double) x, &i ); - if( f < 0.0 ) + double f = modf((double)x, &i); + if (f < 0.0) { f = 1.0 + f; i -= 1.0; - if( f == 1.0 ) - f = HEX_DBL( +, 1, fffffffffffff, -, 1 ); + if (f == 1.0) f = HEX_DBL(+, 1, fffffffffffff, -, 1); } *ip = i; return f; } -long double reference_fabsl(long double x) -{ - return fabsl( x ); -} +long double reference_fabsl(long double x) { return fabsl(x); } -double reference_relaxed_log( double x ) +double reference_relaxed_log(double x) { - return (float)reference_log((float)x); + return (float)reference_log((float)x); } double reference_log(double x) { - if( x == 0.0 ) - return -INFINITY; + if (x == 0.0) return -INFINITY; - if( x < 0.0 ) - return cl_make_nan(); + if (x < 0.0) return cl_make_nan(); - if( isinf(x) ) - return INFINITY; + if (isinf(x)) return INFINITY; - double log2Hi = HEX_DBL( +, 1, 62e42fefa39ef, -, 1 ); + double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1); double logxHi, logxLo; __log2_ep(&logxHi, &logxLo, x); - return logxHi*log2Hi; + 
return logxHi * log2Hi; } long double reference_logl(long double x) { - if( x == 0.0 ) - return -INFINITY; + if (x == 0.0) return -INFINITY; - if( x < 0.0 ) - return cl_make_nan(); + if (x < 0.0) return cl_make_nan(); - if( isinf(x) ) - return INFINITY; + if (isinf(x)) return INFINITY; - double log2Hi = HEX_DBL( +, 1, 62e42fefa39ef, -, 1 ); - double log2Lo = HEX_DBL( +, 1, abc9e3b39803f, -, 56 ); + double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1); + double log2Lo = HEX_DBL(+, 1, abc9e3b39803f, -, 56); double logxHi, logxLo; __log2_ep(&logxHi, &logxLo, x); - //double rhi, rlo; - //MulDD(&rhi, &rlo, logxHi, logxLo, log2Hi, log2Lo); - //return (long double) rhi + (long double) rlo; + // double rhi, rlo; + // MulDD(&rhi, &rlo, logxHi, logxLo, log2Hi, log2Lo); + // return (long double) rhi + (long double) rlo; - long double lg2 = (long double) log2Hi + (long double) log2Lo; - long double logx = (long double) logxHi + (long double) logxLo; - return logx*lg2; + long double lg2 = (long double)log2Hi + (long double)log2Lo; + long double logx = (long double)logxHi + (long double)logxLo; + return logx * lg2; } -double reference_relaxed_pow( double x, double y) { - return (float)reference_exp2( ((float)y) * (float)reference_log2((float)x)); -} - -double reference_pow( double x, double y ) +double reference_relaxed_pow(double x, double y) { - static const double neg_epsilon = HEX_DBL( +, 1, 0, +, 53 ); + return (float)reference_exp2(((float)y) * (float)reference_log2((float)x)); +} - //if x = 1, return x for any y, even NaN - if( x == 1.0 ) - return x; +double reference_pow(double x, double y) +{ + static const double neg_epsilon = HEX_DBL(+, 1, 0, +, 53); - //if y == 0, return 1 for any x, even NaN - if( y == 0.0 ) - return 1.0; + // if x = 1, return x for any y, even NaN + if (x == 1.0) return x; - //get NaNs out of the way - if( x != x || y != y ) - return x + y; + // if y == 0, return 1 for any x, even NaN + if (y == 0.0) return 1.0; - //do the work required to sort out 
edge cases - double fabsy = reference_fabs( y ); - double fabsx = reference_fabs( x ); - double iy = reference_rint( fabsy ); //we do round to nearest here so that |fy| <= 0.5 - if( iy > fabsy )//convert nearbyint to floor + // get NaNs out of the way + if (x != x || y != y) return x + y; + + // do the work required to sort out edge cases + double fabsy = reference_fabs(y); + double fabsx = reference_fabs(x); + double iy = reference_rint( + fabsy); // we do round to nearest here so that |fy| <= 0.5 + if (iy > fabsy) // convert nearbyint to floor iy -= 1.0; int isOddInt = 0; - if( fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon ) - isOddInt = (int) (iy - 2.0 * rint( 0.5 * iy )); //might be 0, -1, or 1 + if (fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon) + isOddInt = (int)(iy - 2.0 * rint(0.5 * iy)); // might be 0, -1, or 1 - ///test a few more edge cases - //deal with x == 0 cases - if( x == 0.0 ) + /// test a few more edge cases + // deal with x == 0 cases + if (x == 0.0) { - if( ! 
isOddInt ) - x = 0.0; + if (!isOddInt) x = 0.0; - if( y < 0 ) - x = 1.0/ x; + if (y < 0) x = 1.0 / x; return x; } - //x == +-Inf cases - if( isinf(fabsx) ) + // x == +-Inf cases + if (isinf(fabsx)) { - if( x < 0 ) + if (x < 0) { - if( isOddInt ) + if (isOddInt) { - if( y < 0 ) + if (y < 0) return -0.0; else return -INFINITY; } else { - if( y < 0 ) + if (y < 0) return 0.0; else return INFINITY; } } - if( y < 0 ) - return 0; + if (y < 0) return 0; return INFINITY; } - //y = +-inf cases - if( isinf(fabsy) ) + // y = +-inf cases + if (isinf(fabsy)) { - if( x == -1 ) - return 1; + if (x == -1) return 1; - if( y < 0 ) + if (y < 0) { - if( fabsx < 1 ) - return INFINITY; + if (fabsx < 1) return INFINITY; return 0; } - if( fabsx < 1 ) - return 0; + if (fabsx < 1) return 0; return INFINITY; } // x < 0 and y non integer case - if( x < 0 && iy != fabsy ) + if (x < 0 && iy != fabsy) { - //return nan; + // return nan; return cl_make_nan(); } - //speedy resolution of sqrt and reciprocal sqrt - if( fabsy == 0.5 ) + // speedy resolution of sqrt and reciprocal sqrt + if (fabsy == 0.5) { - long double xl = reference_sqrt( x ); - if( y < 0 ) - xl = 1.0/ xl; + long double xl = reference_sqrt(x); + if (y < 0) xl = 1.0 / xl; return xl; } @@ -5060,73 +5594,55 @@ double reference_pow( double x, double y ) return isOddInt ? reference_copysignd(result, x) : result; } -double reference_sqrt(double x) -{ - return sqrt(x); -} +double reference_sqrt(double x) { return sqrt(x); } -double reference_floor(double x) -{ - return floorf((float) x); -} +double reference_floor(double x) { return floorf((float)x); } double reference_ldexp(double value, int exponent) { #ifdef __MINGW32__ -/* - * ==================================================== - * This function is from fdlibm: http://www.netlib.org - * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. - * - * Developed at SunSoft, a Sun Microsystems, Inc. business. 
- * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. - * ==================================================== - */ - if(!finite(value)||value==0.0) return value; - return scalbn(value,exponent); + /* + * ==================================================== + * This function is from fdlibm: http://www.netlib.org + * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunSoft, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + */ + if (!finite(value) || value == 0.0) return value; + return scalbn(value, exponent); #else return reference_scalbn(value, exponent); #endif } -long double reference_ldexpl(long double x, int n) -{ - return ldexpl( x, n); -} +long double reference_ldexpl(long double x, int n) { return ldexpl(x, n); } -long double reference_coshl(long double x) -{ - return coshl(x); -} +long double reference_coshl(long double x) { return coshl(x); } -double reference_ceil(double x) -{ - return ceilf((float) x); -} +double reference_ceil(double x) { return ceilf((float)x); } long double reference_ceill(long double x) { - if( x == 0.0 || reference_isinfl(x) || reference_isnanl(x) ) - return x; + if (x == 0.0 || reference_isinfl(x) || reference_isnanl(x)) return x; long double absx = reference_fabsl(x); - if( absx >= HEX_LDBL( +, 1, 0, +, 52 ) ) - return x; + if (absx >= HEX_LDBL(+, 1, 0, +, 52)) return x; - if( absx < 1.0 ) + if (absx < 1.0) { - if( x < 0.0 ) + if (x < 0.0) return 0.0; else return 1.0; } - long double r = (long double) ((cl_long) x); + long double r = (long double)((cl_long)x); - if( x > 0.0 && r < x ) - r += 1.0; + if (x > 0.0 && r < x) r += 1.0; return r; } @@ -5137,45 +5653,53 @@ long double reference_acosl(long double x) long double x2 = x * x; int 
i; - //Prepare a head + tail representation of PI in long double. A good compiler should get rid of all of this work. - static const cl_ulong pi_bits[2] = { 0x3243F6A8885A308DULL, 0x313198A2E0370734ULL}; // first 126 bits of pi http://www.super-computing.org/pi-hexa_current.html + // Prepare a head + tail representation of PI in long double. A good + // compiler should get rid of all of this work. + static const cl_ulong pi_bits[2] = { + 0x3243F6A8885A308DULL, 0x313198A2E0370734ULL + }; // first 126 bits of pi + // http://www.super-computing.org/pi-hexa_current.html long double head, tail, temp; #if __LDBL_MANT_DIG__ >= 64 // long double has 64-bits of precision or greater - temp = (long double) pi_bits[0] * 0x1.0p64L; - head = temp + (long double) pi_bits[1]; - temp -= head; // rounding err rounding pi_bits[1] into head - tail = (long double) pi_bits[1] + temp; - head *= HEX_LDBL( +, 1, 0, -, 125 ); - tail *= HEX_LDBL( +, 1, 0, -, 125 ); + temp = (long double)pi_bits[0] * 0x1.0p64L; + head = temp + (long double)pi_bits[1]; + temp -= head; // rounding err rounding pi_bits[1] into head + tail = (long double)pi_bits[1] + temp; + head *= HEX_LDBL(+, 1, 0, -, 125); + tail *= HEX_LDBL(+, 1, 0, -, 125); #else - head = (long double) pi_bits[0]; - tail = (long double) ((cl_long) pi_bits[0] - (cl_long) head ); // residual part of pi_bits[0] after rounding - tail = tail * HEX_LDBL( +, 1, 0, +, 64 ) + (long double) pi_bits[1]; - head *= HEX_LDBL( +, 1, 0, -, 61 ); - tail *= HEX_LDBL( +, 1, 0, -, 125 ); + head = (long double)pi_bits[0]; + tail = + (long double)((cl_long)pi_bits[0] + - (cl_long) + head); // residual part of pi_bits[0] after rounding + tail = tail * HEX_LDBL(+, 1, 0, +, 64) + (long double)pi_bits[1]; + head *= HEX_LDBL(+, 1, 0, -, 61); + tail *= HEX_LDBL(+, 1, 0, -, 125); #endif // oversize values and NaNs go to NaN - if( ! 
(x2 <= 1.0) ) - return sqrtl(1.0L - x2 ); + if (!(x2 <= 1.0)) return sqrtl(1.0L - x2); // // deal with large |x|: // sqrt( 1 - x**2) - // acos(|x| > sqrt(0.5)) = 2 * atan( z ); z = -------------------- ; z in [0, sqrt(0.5)/(1+sqrt(0.5) = .4142135...] + // acos(|x| > sqrt(0.5)) = 2 * atan( z ); z = -------------------- ; + // z in [0, sqrt(0.5)/(1+sqrt(0.5) = .4142135...] // 1 + x - if( x2 > 0.5 ) + if (x2 > 0.5) { // we handle the x < 0 case as pi - acos(|x|) - long double sign = reference_copysignl( 1.0L, x ); - long double fabsx = reference_fabsl( x ); - head -= head * sign; // x > 0 ? 0 : pi.hi - tail -= tail * sign; // x > 0 ? 0 : pi.low + long double sign = reference_copysignl(1.0L, x); + long double fabsx = reference_fabsl(x); + head -= head * sign; // x > 0 ? 0 : pi.hi + tail -= tail * sign; // x > 0 ? 0 : pi.low - // z = sqrt( 1-x**2 ) / (1+x) = sqrt( (1-x)(1+x) / (1+x)**2 ) = sqrt( (1-x)/(1+x) ) - long double z2 = (1.0L - fabsx) / (1.0L + fabsx); // z**2 + // z = sqrt( 1-x**2 ) / (1+x) = sqrt( (1-x)(1+x) / (1+x)**2 ) = sqrt( + // (1-x)/(1+x) ) + long double z2 = (1.0L - fabsx) / (1.0L + fabsx); // z**2 long double z = sign * sqrtl(z2); // atan(sqrt(q)) @@ -5185,29 +5709,41 @@ long double reference_acosl(long double x) // Define q = r*r, and solve for atan(r): // // atan(r) = (p(r) + 1) * r = rp(r) + r - static long double atan_coeffs[] = { HEX_LDBL( -, b, 3f52e0c278293b3, -, 67 ), HEX_LDBL( -, a, aaaaaaaaaaa95b8, -, 5 ), - HEX_LDBL( +, c, ccccccccc992407, -, 6 ), HEX_LDBL( -, 9, 24924923024398, -, 6 ), - HEX_LDBL( +, e, 38e38d6f92c98f3, -, 7 ), HEX_LDBL( -, b, a2e89bfb8393ec6, -, 7 ), - HEX_LDBL( +, 9, d89a9f574d412cb, -, 7 ), HEX_LDBL( -, 8, 88580517884c547, -, 7 ), - HEX_LDBL( +, f, 0ab6756abdad408, -, 8 ), HEX_LDBL( -, d, 56a5b07a2f15b49, -, 8 ), - HEX_LDBL( +, b, 72ab587e46d80b2, -, 8 ), HEX_LDBL( -, 8, 62ea24bb5b2e636, -, 8 ), - HEX_LDBL( +, e, d67c16582123937, -, 10 ) }; // minimax fit over [ 0x1.0p-52, 0.18] Max error: 0x1.67ea5c184e5d9p-64 + static 
long double atan_coeffs[] = { + HEX_LDBL(-, b, 3f52e0c278293b3, -, 67), + HEX_LDBL(-, a, aaaaaaaaaaa95b8, -, 5), + HEX_LDBL(+, c, ccccccccc992407, -, 6), + HEX_LDBL(-, 9, 24924923024398, -, 6), + HEX_LDBL(+, e, 38e38d6f92c98f3, -, 7), + HEX_LDBL(-, b, a2e89bfb8393ec6, -, 7), + HEX_LDBL(+, 9, d89a9f574d412cb, -, 7), + HEX_LDBL(-, 8, 88580517884c547, -, 7), + HEX_LDBL(+, f, 0ab6756abdad408, -, 8), + HEX_LDBL(-, d, 56a5b07a2f15b49, -, 8), + HEX_LDBL(+, b, 72ab587e46d80b2, -, 8), + HEX_LDBL(-, 8, 62ea24bb5b2e636, -, 8), + HEX_LDBL(+, e, d67c16582123937, -, 10) + }; // minimax fit over [ 0x1.0p-52, 0.18] Max error: + // 0x1.67ea5c184e5d9p-64 // Calculate y = p(r) - const size_t atan_coeff_count = sizeof( atan_coeffs ) / sizeof( atan_coeffs[0] ); - long double y = atan_coeffs[ atan_coeff_count - 1]; - for( i = (int)atan_coeff_count - 2; i >= 0; i-- ) + const size_t atan_coeff_count = + sizeof(atan_coeffs) / sizeof(atan_coeffs[0]); + long double y = atan_coeffs[atan_coeff_count - 1]; + for (i = (int)atan_coeff_count - 2; i >= 0; i--) y = atan_coeffs[i] + y * z2; - z *= 2.0L; // fold in 2.0 for 2.0 * atan(z) - y *= z; // rp(r) + z *= 2.0L; // fold in 2.0 for 2.0 * atan(z) + y *= z; // rp(r) return head + ((y + tail) + z); } // do |x| <= sqrt(0.5) here - // acos( sqrt(z) ) - PI/2 - // Piecewise minimax polynomial fits for p(z) = 1 + ------------------------; + // acos( sqrt(z) ) - + // PI/2 + // Piecewise minimax polynomial fits for p(z) = 1 + + // ------------------------; // sqrt(z) // // Define z = x*x, and solve for acos(x) over x in x >= 0: @@ -5215,52 +5751,88 @@ long double reference_acosl(long double x) // acos( sqrt(z) ) = acos(x) = x*(p(z)-1) + PI/2 = xp(x**2) - x + PI/2 // const long double coeffs[4][14] = { - { HEX_LDBL( -, a, fa7382e1f347974, -, 10 ), HEX_LDBL( -, b, 4d5a992de1ac4da, -, 6 ), - HEX_LDBL( -, a, c526184bd558c17, -, 7 ), HEX_LDBL( -, d, 9ed9b0346ec092a, -, 8 ), - HEX_LDBL( -, 9, dca410c1f04b1f, -, 8 ), HEX_LDBL( -, f, 76e411ba9581ee5, -, 9 ), - 
HEX_LDBL( -, c, c71b00479541d8e, -, 9 ), HEX_LDBL( -, a, f527a3f9745c9de, -, 9 ), - HEX_LDBL( -, 9, a93060051f48d14, -, 9 ), HEX_LDBL( -, 8, b3d39ad70e06021, -, 9 ), - HEX_LDBL( -, f, f2ab95ab84f79c, -, 10 ), HEX_LDBL( -, e, d1af5f5301ccfe4, -, 10 ), - HEX_LDBL( -, e, 1b53ba562f0f74a, -, 10 ), HEX_LDBL( -, d, 6a3851330e15526, -, 10 ) }, // x - 0.0625 in [ -0x1.fffffffffp-5, 0x1.0p-4 ] Error: 0x1.97839bf07024p-76 + { HEX_LDBL(-, a, fa7382e1f347974, -, 10), + HEX_LDBL(-, b, 4d5a992de1ac4da, -, 6), + HEX_LDBL(-, a, c526184bd558c17, -, 7), + HEX_LDBL(-, d, 9ed9b0346ec092a, -, 8), + HEX_LDBL(-, 9, dca410c1f04b1f, -, 8), + HEX_LDBL(-, f, 76e411ba9581ee5, -, 9), + HEX_LDBL(-, c, c71b00479541d8e, -, 9), + HEX_LDBL(-, a, f527a3f9745c9de, -, 9), + HEX_LDBL(-, 9, a93060051f48d14, -, 9), + HEX_LDBL(-, 8, b3d39ad70e06021, -, 9), + HEX_LDBL(-, f, f2ab95ab84f79c, -, 10), + HEX_LDBL(-, e, d1af5f5301ccfe4, -, 10), + HEX_LDBL(-, e, 1b53ba562f0f74a, -, 10), + HEX_LDBL(-, d, 6a3851330e15526, -, + 10) }, // x - 0.0625 in [ -0x1.fffffffffp-5, 0x1.0p-4 ] + // Error: 0x1.97839bf07024p-76 - { HEX_LDBL( -, 8, c2f1d638e4c1b48, -, 8 ), HEX_LDBL( -, c, d47ac903c311c2c, -, 6 ), - HEX_LDBL( -, d, e020b2dabd5606a, -, 7 ), HEX_LDBL( -, a, 086fafac220f16b, -, 7 ), - HEX_LDBL( -, 8, 55b5efaf6b86c3e, -, 7 ), HEX_LDBL( -, f, 05c9774fed2f571, -, 8 ), - HEX_LDBL( -, e, 484a93f7f0fc772, -, 8 ), HEX_LDBL( -, e, 1a32baef01626e4, -, 8 ), - HEX_LDBL( -, e, 528e525b5c9c73d, -, 8 ), HEX_LDBL( -, e, ddd5d27ad49b2c8, -, 8 ), - HEX_LDBL( -, f, b3259e7ae10c6f, -, 8 ), HEX_LDBL( -, 8, 68998170d5b19b7, -, 7 ), - HEX_LDBL( -, 9, 4468907f007727, -, 7 ), HEX_LDBL( -, a, 2ad5e4906a8e7b3, -, 7 ) },// x - 0.1875 in [ -0x1.0p-4, 0x1.0p-4 ] Error: 0x1.647af70073457p-73 + { HEX_LDBL(-, 8, c2f1d638e4c1b48, -, 8), + HEX_LDBL(-, c, d47ac903c311c2c, -, 6), + HEX_LDBL(-, d, e020b2dabd5606a, -, 7), + HEX_LDBL(-, a, 086fafac220f16b, -, 7), + HEX_LDBL(-, 8, 55b5efaf6b86c3e, -, 7), + HEX_LDBL(-, f, 05c9774fed2f571, -, 8), + 
HEX_LDBL(-, e, 484a93f7f0fc772, -, 8), + HEX_LDBL(-, e, 1a32baef01626e4, -, 8), + HEX_LDBL(-, e, 528e525b5c9c73d, -, 8), + HEX_LDBL(-, e, ddd5d27ad49b2c8, -, 8), + HEX_LDBL(-, f, b3259e7ae10c6f, -, 8), + HEX_LDBL(-, 8, 68998170d5b19b7, -, 7), + HEX_LDBL(-, 9, 4468907f007727, -, 7), + HEX_LDBL(-, a, 2ad5e4906a8e7b3, -, + 7) }, // x - 0.1875 in [ -0x1.0p-4, 0x1.0p-4 ] Error: + // 0x1.647af70073457p-73 - { HEX_LDBL( -, f, a76585ad399e7ac, -, 8 ), HEX_LDBL( -, e, d665b7dd504ca7c, -, 6 ), - HEX_LDBL( -, 9, 4c7c2402bd4bc33, -, 6 ), HEX_LDBL( -, f, ba76b69074ff71c, -, 7 ), - HEX_LDBL( -, f, 58117784bdb6d5f, -, 7 ), HEX_LDBL( -, 8, 22ddd8eef53227d, -, 6 ), - HEX_LDBL( -, 9, 1d1d3b57a63cdb4, -, 6 ), HEX_LDBL( -, a, 9c4bdc40cca848, -, 6 ), - HEX_LDBL( -, c, b673b12794edb24, -, 6 ), HEX_LDBL( -, f, 9290a06e31575bf, -, 6 ), - HEX_LDBL( -, 9, b4929c16aeb3d1f, -, 5 ), HEX_LDBL( -, c, 461e725765a7581, -, 5 ), - HEX_LDBL( -, 8, 0a59654c98d9207, -, 4 ), HEX_LDBL( -, a, 6de6cbd96c80562, -, 4 ) }, // x - 0.3125 in [ -0x1.0p-4, 0x1.0p-4 ] Error: 0x1.b0246c304ce1ap-70 + { HEX_LDBL(-, f, a76585ad399e7ac, -, 8), + HEX_LDBL(-, e, d665b7dd504ca7c, -, 6), + HEX_LDBL(-, 9, 4c7c2402bd4bc33, -, 6), + HEX_LDBL(-, f, ba76b69074ff71c, -, 7), + HEX_LDBL(-, f, 58117784bdb6d5f, -, 7), + HEX_LDBL(-, 8, 22ddd8eef53227d, -, 6), + HEX_LDBL(-, 9, 1d1d3b57a63cdb4, -, 6), + HEX_LDBL(-, a, 9c4bdc40cca848, -, 6), + HEX_LDBL(-, c, b673b12794edb24, -, 6), + HEX_LDBL(-, f, 9290a06e31575bf, -, 6), + HEX_LDBL(-, 9, b4929c16aeb3d1f, -, 5), + HEX_LDBL(-, c, 461e725765a7581, -, 5), + HEX_LDBL(-, 8, 0a59654c98d9207, -, 4), + HEX_LDBL(-, a, 6de6cbd96c80562, -, + 4) }, // x - 0.3125 in [ -0x1.0p-4, 0x1.0p-4 ] Error: + // 0x1.b0246c304ce1ap-70 - { HEX_LDBL( -, b, dca8b0359f96342, -, 7 ), HEX_LDBL( -, 8, cd2522fcde9823, -, 5 ), - HEX_LDBL( -, d, 2af9397b27ff74d, -, 6 ), HEX_LDBL( -, d, 723f2c2c2409811, -, 6 ), - HEX_LDBL( -, f, ea8f8481ecc3cd1, -, 6 ), HEX_LDBL( -, a, 43fd8a7a646b0b2, -, 5 ), - HEX_LDBL( -, e, 
01b0bf63a4e8d76, -, 5 ), HEX_LDBL( -, 9, f0b7096a2a7b4d, -, 4 ), - HEX_LDBL( -, e, 872e7c5a627ab4c, -, 4 ), HEX_LDBL( -, a, dbd760a1882da48, -, 3 ), - HEX_LDBL( -, 8, 424e4dea31dd273, -, 2 ), HEX_LDBL( -, c, c05d7730963e793, -, 2 ), - HEX_LDBL( -, a, 523d97197cd124a, -, 1 ), HEX_LDBL( -, 8, 307ba943978aaee, +, 0 ) } // x - 0.4375 in [ -0x1.0p-4, 0x1.0p-4 ] Error: 0x1.9ecff73da69c9p-66 - }; + { HEX_LDBL(-, b, dca8b0359f96342, -, 7), + HEX_LDBL(-, 8, cd2522fcde9823, -, 5), + HEX_LDBL(-, d, 2af9397b27ff74d, -, 6), + HEX_LDBL(-, d, 723f2c2c2409811, -, 6), + HEX_LDBL(-, f, ea8f8481ecc3cd1, -, 6), + HEX_LDBL(-, a, 43fd8a7a646b0b2, -, 5), + HEX_LDBL(-, e, 01b0bf63a4e8d76, -, 5), + HEX_LDBL(-, 9, f0b7096a2a7b4d, -, 4), + HEX_LDBL(-, e, 872e7c5a627ab4c, -, 4), + HEX_LDBL(-, a, dbd760a1882da48, -, 3), + HEX_LDBL(-, 8, 424e4dea31dd273, -, 2), + HEX_LDBL(-, c, c05d7730963e793, -, 2), + HEX_LDBL(-, a, 523d97197cd124a, -, 1), + HEX_LDBL(-, 8, 307ba943978aaee, +, + 0) } // x - 0.4375 in [ -0x1.0p-4, 0x1.0p-4 ] Error: + // 0x1.9ecff73da69c9p-66 + }; const long double offsets[4] = { 0.0625, 0.1875, 0.3125, 0.4375 }; - const size_t coeff_count = sizeof( coeffs[0] ) / sizeof( coeffs[0][0] ); + const size_t coeff_count = sizeof(coeffs[0]) / sizeof(coeffs[0][0]); - // reduce the incoming values a bit so that they are in the range [-0x1.0p-4, 0x1.0p-4] + // reduce the incoming values a bit so that they are in the range + // [-0x1.0p-4, 0x1.0p-4] const long double *c; i = x2 * 8.0L; c = coeffs[i]; - x2 -= offsets[i]; // exact + x2 -= offsets[i]; // exact // calcualte p(x2) - long double y = c[ coeff_count - 1]; - for( i = (int)coeff_count - 2; i >= 0; i-- ) - y = c[i] + y * x2; + long double y = c[coeff_count - 1]; + for (i = (int)coeff_count - 2; i >= 0; i--) y = c[i] + y * x2; // xp(x2) y *= x; @@ -5273,58 +5845,50 @@ double reference_relaxed_acos(double x) { return reference_acos(x); } double reference_log10(double x) { - if( x == 0.0 ) - return -INFINITY; + if (x == 0.0) return 
-INFINITY; - if( x < 0.0 ) - return cl_make_nan(); + if (x < 0.0) return cl_make_nan(); - if( isinf(x) ) - return INFINITY; + if (isinf(x)) return INFINITY; - double log2Hi = HEX_DBL( +, 1, 34413509f79fe, -, 2 ); + double log2Hi = HEX_DBL(+, 1, 34413509f79fe, -, 2); double logxHi, logxLo; __log2_ep(&logxHi, &logxLo, x); - return logxHi*log2Hi; + return logxHi * log2Hi; } double reference_relaxed_log10(double x) { return reference_log10(x); } long double reference_log10l(long double x) { - if( x == 0.0 ) - return -INFINITY; + if (x == 0.0) return -INFINITY; - if( x < 0.0 ) - return cl_make_nan(); + if (x < 0.0) return cl_make_nan(); - if( isinf(x) ) - return INFINITY; + if (isinf(x)) return INFINITY; - double log2Hi = HEX_DBL( +, 1, 34413509f79fe, -, 2 ); - double log2Lo = HEX_DBL( +, 1, e623e2566b02d, -, 55 ); + double log2Hi = HEX_DBL(+, 1, 34413509f79fe, -, 2); + double log2Lo = HEX_DBL(+, 1, e623e2566b02d, -, 55); double logxHi, logxLo; __log2_ep(&logxHi, &logxLo, x); - //double rhi, rlo; - //MulDD(&rhi, &rlo, logxHi, logxLo, log2Hi, log2Lo); - //return (long double) rhi + (long double) rlo; + // double rhi, rlo; + // MulDD(&rhi, &rlo, logxHi, logxLo, log2Hi, log2Lo); + // return (long double) rhi + (long double) rlo; - long double lg2 = (long double) log2Hi + (long double) log2Lo; - long double logx = (long double) logxHi + (long double) logxLo; - return logx*lg2; + long double lg2 = (long double)log2Hi + (long double)log2Lo; + long double logx = (long double)logxHi + (long double)logxLo; + return logx * lg2; } -double reference_acos(double x) -{ - return acos( x ); -} +double reference_acos(double x) { return acos(x); } double reference_atan2(double x, double y) { #if defined(_WIN32) // fix edge cases for Windows - if (isinf(x) && isinf(y)) { + if (isinf(x) && isinf(y)) + { double retval = (y > 0) ? M_PI_4 : 3.f * M_PI_4; return (x > 0) ? 
retval : -retval; } @@ -5336,7 +5900,8 @@ long double reference_atan2l(long double x, long double y) { #if defined(_WIN32) // fix edge cases for Windows - if (isinf(x) && isinf(y)) { + if (isinf(x) && isinf(y)) + { long double retval = (y > 0) ? M_PI_4 : 3.f * M_PI_4; return (x > 0) ? retval : -retval; } @@ -5346,7 +5911,7 @@ long double reference_atan2l(long double x, long double y) double reference_frexp(double a, int *exp) { - if(isnan(a) || isinf(a) || a == 0.0) + if (isnan(a) || isinf(a) || a == 0.0) { *exp = 0; return a; @@ -5364,7 +5929,7 @@ double reference_frexp(double a, int *exp) u.l &= 0x7fffffffffffffffULL; int bias = -1022; - if((u.l & 0x7ff0000000000000ULL) == 0) + if ((u.l & 0x7ff0000000000000ULL) == 0) { double d = u.l; u.d = d; @@ -5383,13 +5948,13 @@ double reference_frexp(double a, int *exp) long double reference_frexpl(long double a, int *exp) { - if(isnan(a) || isinf(a) || a == 0.0) + if (isnan(a) || isinf(a) || a == 0.0) { *exp = 0; return a; } - if(sizeof(long double) == sizeof(double)) + if (sizeof(long double) == sizeof(double)) { return reference_frexp(a, exp); } @@ -5400,92 +5965,64 @@ long double reference_frexpl(long double a, int *exp) } -double reference_atan(double x) -{ - return atan( x ); -} +double reference_atan(double x) { return atan(x); } -long double reference_atanl(long double x) -{ - return atanl( x ); -} +long double reference_atanl(long double x) { return atanl(x); } -long double reference_asinl(long double x) -{ - return asinl( x ); -} +long double reference_asinl(long double x) { return asinl(x); } -double reference_asin(double x) -{ - return asin( x ); -} +double reference_asin(double x) { return asin(x); } double reference_relaxed_asin(double x) { return reference_asin(x); } -double reference_fabs(double x) -{ - return fabs( x); -} +double reference_fabs(double x) { return fabs(x); } -double reference_cosh(double x) -{ - return cosh( x ); -} +double reference_cosh(double x) { return cosh(x); } long double 
reference_sqrtl(long double x) { -#if defined( __SSE2__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64))) - __m128d result128 = _mm_set_sd((double) x); +#if defined(__SSE2__) \ + || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) + __m128d result128 = _mm_set_sd((double)x); result128 = _mm_sqrt_sd(result128, result128); return _mm_cvtsd_f64(result128); #else volatile double dx = x; - return sqrt( dx ); + return sqrt(dx); #endif } -long double reference_tanhl(long double x) -{ - return tanhl( x ); -} +long double reference_tanhl(long double x) { return tanhl(x); } long double reference_floorl(long double x) { - if( x == 0.0 || reference_isinfl(x) || reference_isnanl(x) ) - return x; + if (x == 0.0 || reference_isinfl(x) || reference_isnanl(x)) return x; long double absx = reference_fabsl(x); - if( absx >= HEX_LDBL( +, 1, 0, +, 52 ) ) - return x; + if (absx >= HEX_LDBL(+, 1, 0, +, 52)) return x; - if( absx < 1.0 ) + if (absx < 1.0) { - if( x < 0.0 ) + if (x < 0.0) return -1.0; else return 0.0; } - long double r = (long double) ((cl_long) x); + long double r = (long double)((cl_long)x); - if( x < 0.0 && r > x ) - r -= 1.0; + if (x < 0.0 && r > x) r -= 1.0; return r; } -double reference_tanh(double x) -{ - return tanh( x ); -} +double reference_tanh(double x) { return tanh(x); } -long double reference_assignmentl( long double x ){ return x; } +long double reference_assignmentl(long double x) { return x; } -int reference_notl( long double x ) +int reference_notl(long double x) { int r = !x; return r; } - - diff --git a/test_conformance/math_brute_force/reference_math.h b/test_conformance/math_brute_force/reference_math.h index 7c751f68..78b24510 100644 --- a/test_conformance/math_brute_force/reference_math.h +++ b/test_conformance/math_brute_force/reference_math.h @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. 
-// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -16,223 +16,221 @@ #ifndef REFERENCE_MATH_H #define REFERENCE_MATH_H -#if defined( __APPLE__ ) - #include +#if defined(__APPLE__) +#include #else - #include +#include #endif // -- for testing float -- -double reference_sinh( double x ); -double reference_sqrt( double x ); -double reference_tanh( double x ); -double reference_acos( double ); -double reference_asin( double ); -double reference_atan( double ); -double reference_atan2( double, double ); -double reference_ceil( double ); -double reference_cosh( double ); -double reference_exp( double ); -double reference_fabs( double ); -double reference_acospi( double ); -double reference_asinpi( double ); -double reference_atanpi( double ); -double reference_atan2pi( double, double ); -double reference_cospi( double ); -double reference_divide( double, double ); -double reference_fract( double, double * ); -float reference_fma( float, float, float, int ); -double reference_mad( double, double, double ); -double reference_nextafter(double, double ); -double reference_recip( double ); -double reference_rootn( double, int ); -double reference_rsqrt( double ); -double reference_sincos( double, double * ); -double reference_sinpi( double ); -double reference_tanpi( double ); +double reference_sinh(double x); +double reference_sqrt(double x); +double reference_tanh(double x); +double reference_acos(double); +double reference_asin(double); +double reference_atan(double); +double reference_atan2(double, double); +double reference_ceil(double); +double reference_cosh(double); +double reference_exp(double); +double reference_fabs(double); +double reference_acospi(double); +double reference_asinpi(double); +double reference_atanpi(double); +double reference_atan2pi(double, double); +double reference_cospi(double); +double 
reference_divide(double, double); +double reference_fract(double, double*); +float reference_fma(float, float, float, int); +double reference_mad(double, double, double); +double reference_nextafter(double, double); +double reference_recip(double); +double reference_rootn(double, int); +double reference_rsqrt(double); +double reference_sincos(double, double*); +double reference_sinpi(double); +double reference_tanpi(double); double reference_pow(double x, double y); -double reference_pown( double, int ); -double reference_powr( double, double ); -double reference_cos( double ); -double reference_sin( double ); -double reference_tan( double ); -double reference_log( double ); -double reference_log10( double ); -double reference_modf( double, double *n ); +double reference_pown(double, int); +double reference_powr(double, double); +double reference_cos(double); +double reference_sin(double); +double reference_tan(double); +double reference_log(double); +double reference_log10(double); +double reference_modf(double, double* n); -double reference_fdim( double, double ); -double reference_add( double, double ); -double reference_subtract( double, double ); -double reference_divide( double, double ); -double reference_multiply( double, double ); -double reference_remquo( double, double, int* ); -double reference_lgamma_r( double, int* ); +double reference_fdim(double, double); +double reference_add(double, double); +double reference_subtract(double, double); +double reference_divide(double, double); +double reference_multiply(double, double); +double reference_remquo(double, double, int*); +double reference_lgamma_r(double, int*); -int reference_isequal( double, double ); -int reference_isfinite( double ); -int reference_isgreater( double, double ); -int reference_isgreaterequal( double, double ); -int reference_isinf( double ); -int reference_isless( double, double ); -int reference_islessequal( double, double ); -int reference_islessgreater( double, double ); -int 
reference_isnan( double ); -int reference_isnormal( double ); -int reference_isnotequal( double, double ); -int reference_isordered( double, double ); -int reference_isunordered( double, double ); -int reference_signbit( float ); +int reference_isequal(double, double); +int reference_isfinite(double); +int reference_isgreater(double, double); +int reference_isgreaterequal(double, double); +int reference_isinf(double); +int reference_isless(double, double); +int reference_islessequal(double, double); +int reference_islessgreater(double, double); +int reference_isnan(double); +int reference_isnormal(double); +int reference_isnotequal(double, double); +int reference_isordered(double, double); +int reference_isunordered(double, double); +int reference_signbit(float); -double reference_acosh( double x ); -double reference_asinh( double x ); -double reference_atanh( double x ); +double reference_acosh(double x); +double reference_asinh(double x); +double reference_atanh(double x); double reference_cbrt(double x); -float reference_copysign( float x, float y); -double reference_copysignd( double x, double y); -double reference_exp10( double ); -double reference_exp2( double x ); -double reference_expm1( double x ); -double reference_fmax( double x, double y ); -double reference_fmin( double x, double y ); -double reference_hypot( double x, double y ); -double reference_lgamma( double x); -int reference_ilogb( double ); -double reference_log2( double x ); -double reference_log1p( double x ); -double reference_logb( double x ); -double reference_maxmag( double x, double y ); -double reference_minmag( double x, double y ); -double reference_nan( cl_uint x ); -double reference_reciprocal( double x ); -double reference_remainder( double x, double y ); -double reference_rint( double x ); -double reference_round( double x ); -double reference_trunc( double x ); -double reference_floor( double x ); -double reference_fmod( double x, double y ); -double reference_frexp( double x, 
int *n ); -double reference_ldexp( double x, int n ); +float reference_copysign(float x, float y); +double reference_copysignd(double x, double y); +double reference_exp10(double); +double reference_exp2(double x); +double reference_expm1(double x); +double reference_fmax(double x, double y); +double reference_fmin(double x, double y); +double reference_hypot(double x, double y); +double reference_lgamma(double x); +int reference_ilogb(double); +double reference_log2(double x); +double reference_log1p(double x); +double reference_logb(double x); +double reference_maxmag(double x, double y); +double reference_minmag(double x, double y); +double reference_nan(cl_uint x); +double reference_reciprocal(double x); +double reference_remainder(double x, double y); +double reference_rint(double x); +double reference_round(double x); +double reference_trunc(double x); +double reference_floor(double x); +double reference_fmod(double x, double y); +double reference_frexp(double x, int* n); +double reference_ldexp(double x, int n); -double reference_assignment( double x ); -int reference_not( double x ); +double reference_assignment(double x); +int reference_not(double x); // -- for testing fast-relaxed double reference_relaxed_acos(double); double reference_relaxed_asin(double); double reference_relaxed_atan(double); -double reference_relaxed_mad( double, double, double ); -double reference_relaxed_divide( double x, double y ); -double reference_relaxed_sin( double x ); +double reference_relaxed_mad(double, double, double); +double reference_relaxed_divide(double x, double y); +double reference_relaxed_sin(double x); double reference_relaxed_sinpi(double x); -double reference_relaxed_cos( double x ); +double reference_relaxed_cos(double x); double reference_relaxed_cospi(double x); -double reference_relaxed_sincos( double x, double * y); -double reference_relaxed_tan( double x ); -double reference_relaxed_exp( double x ); -double reference_relaxed_exp2( double x ); -double 
reference_relaxed_exp10( double x ); -double reference_relaxed_log( double x ); -double reference_relaxed_log2( double x ); +double reference_relaxed_sincos(double x, double* y); +double reference_relaxed_tan(double x); +double reference_relaxed_exp(double x); +double reference_relaxed_exp2(double x); +double reference_relaxed_exp10(double x); +double reference_relaxed_log(double x); +double reference_relaxed_log2(double x); double reference_relaxed_log10(double x); -double reference_relaxed_pow( double x, double y); -double reference_relaxed_reciprocal( double x ); +double reference_relaxed_pow(double x, double y); +double reference_relaxed_reciprocal(double x); // -- for testing double -- -long double reference_sinhl( long double x ); -long double reference_sqrtl( long double x ); -long double reference_tanhl( long double x ); -long double reference_acosl( long double ); -long double reference_asinl( long double ); -long double reference_atanl( long double ); -long double reference_atan2l( long double, long double ); -long double reference_ceill( long double ); -long double reference_coshl( long double ); -long double reference_expl( long double ); -long double reference_fabsl( long double ); -long double reference_acospil( long double ); -long double reference_asinpil( long double ); -long double reference_atanpil( long double ); -long double reference_atan2pil( long double, long double ); -long double reference_cospil( long double ); -long double reference_dividel( long double, long double ); -long double reference_fractl( long double, long double * ); -long double reference_fmal( long double, long double, long double ); -long double reference_madl( long double, long double, long double ); -long double reference_nextafterl(long double, long double ); -long double reference_recipl( long double ); -long double reference_rootnl( long double, int ); -long double reference_rsqrtl( long double ); -long double reference_sincosl( long double, long double * ); -long 
double reference_sinpil( long double ); -long double reference_tanpil( long double ); +long double reference_sinhl(long double x); +long double reference_sqrtl(long double x); +long double reference_tanhl(long double x); +long double reference_acosl(long double); +long double reference_asinl(long double); +long double reference_atanl(long double); +long double reference_atan2l(long double, long double); +long double reference_ceill(long double); +long double reference_coshl(long double); +long double reference_expl(long double); +long double reference_fabsl(long double); +long double reference_acospil(long double); +long double reference_asinpil(long double); +long double reference_atanpil(long double); +long double reference_atan2pil(long double, long double); +long double reference_cospil(long double); +long double reference_dividel(long double, long double); +long double reference_fractl(long double, long double*); +long double reference_fmal(long double, long double, long double); +long double reference_madl(long double, long double, long double); +long double reference_nextafterl(long double, long double); +long double reference_recipl(long double); +long double reference_rootnl(long double, int); +long double reference_rsqrtl(long double); +long double reference_sincosl(long double, long double*); +long double reference_sinpil(long double); +long double reference_tanpil(long double); long double reference_powl(long double x, long double y); -long double reference_pownl( long double, int ); -long double reference_powrl( long double, long double ); -long double reference_cosl( long double ); -long double reference_sinl(long double ); -long double reference_tanl( long double ); -long double reference_logl( long double ); -long double reference_log10l( long double ); -long double reference_modfl( long double, long double *n ); +long double reference_pownl(long double, int); +long double reference_powrl(long double, long double); +long double reference_cosl(long 
double); +long double reference_sinl(long double); +long double reference_tanl(long double); +long double reference_logl(long double); +long double reference_log10l(long double); +long double reference_modfl(long double, long double* n); -long double reference_fdiml( long double, long double ); -long double reference_addl( long double, long double ); -long double reference_subtractl( long double, long double ); -long double reference_dividel( long double, long double ); -long double reference_multiplyl( long double, long double ); -long double reference_remquol( long double, long double, int* ); -long double reference_lgamma_rl( long double, int* ); +long double reference_fdiml(long double, long double); +long double reference_addl(long double, long double); +long double reference_subtractl(long double, long double); +long double reference_dividel(long double, long double); +long double reference_multiplyl(long double, long double); +long double reference_remquol(long double, long double, int*); +long double reference_lgamma_rl(long double, int*); -int reference_isequall( long double, long double ); -int reference_isfinitel( long double ); -int reference_isgreaterl( long double, long double ); -int reference_isgreaterequall( long double, long double ); -int reference_isinfl( long double ); -int reference_islessl( long double, long double ); -int reference_islessequall( long double, long double ); -int reference_islessgreaterl( long double, long double ); -int reference_isnanl( long double ); -int reference_isnormall( long double ); -int reference_isnotequall( long double, long double ); -int reference_isorderedl( long double, long double ); -int reference_isunorderedl( long double, long double ); -int reference_signbitl( long double ); +int reference_isequall(long double, long double); +int reference_isfinitel(long double); +int reference_isgreaterl(long double, long double); +int reference_isgreaterequall(long double, long double); +int reference_isinfl(long 
double); +int reference_islessl(long double, long double); +int reference_islessequall(long double, long double); +int reference_islessgreaterl(long double, long double); +int reference_isnanl(long double); +int reference_isnormall(long double); +int reference_isnotequall(long double, long double); +int reference_isorderedl(long double, long double); +int reference_isunorderedl(long double, long double); +int reference_signbitl(long double); -long double reference_acoshl( long double x ); -long double reference_asinhl( long double x ); -long double reference_atanhl( long double x ); +long double reference_acoshl(long double x); +long double reference_asinhl(long double x); +long double reference_atanhl(long double x); long double reference_cbrtl(long double x); -long double reference_copysignl( long double x, long double y); -long double reference_exp10l( long double ); -long double reference_exp2l( long double x ); -long double reference_expm1l( long double x ); -long double reference_fmaxl( long double x, long double y ); -long double reference_fminl( long double x, long double y ); -long double reference_hypotl( long double x, long double y ); -long double reference_lgammal( long double x); -int reference_ilogbl( long double ); -long double reference_log2l( long double x ); -long double reference_log1pl( long double x ); -long double reference_logbl( long double x ); -long double reference_maxmagl( long double x, long double y ); -long double reference_minmagl( long double x, long double y ); -long double reference_nanl( cl_ulong x ); -long double reference_reciprocall( long double x ); -long double reference_remainderl( long double x, long double y ); -long double reference_rintl( long double x ); -long double reference_roundl( long double x ); -long double reference_truncl( long double x ); -long double reference_floorl( long double x ); -long double reference_fmodl( long double x, long double y ); -long double reference_frexpl( long double x, int *n ); -long 
double reference_ldexpl( long double x, int n ); +long double reference_copysignl(long double x, long double y); +long double reference_exp10l(long double); +long double reference_exp2l(long double x); +long double reference_expm1l(long double x); +long double reference_fmaxl(long double x, long double y); +long double reference_fminl(long double x, long double y); +long double reference_hypotl(long double x, long double y); +long double reference_lgammal(long double x); +int reference_ilogbl(long double); +long double reference_log2l(long double x); +long double reference_log1pl(long double x); +long double reference_logbl(long double x); +long double reference_maxmagl(long double x, long double y); +long double reference_minmagl(long double x, long double y); +long double reference_nanl(cl_ulong x); +long double reference_reciprocall(long double x); +long double reference_remainderl(long double x, long double y); +long double reference_rintl(long double x); +long double reference_roundl(long double x); +long double reference_truncl(long double x); +long double reference_floorl(long double x); +long double reference_fmodl(long double x, long double y); +long double reference_frexpl(long double x, int* n); +long double reference_ldexpl(long double x, int n); -long double reference_assignmentl( long double x ); -int reference_notl( long double x ); +long double reference_assignmentl(long double x); +int reference_notl(long double x); #endif - - diff --git a/test_conformance/math_brute_force/ternary.cpp b/test_conformance/math_brute_force/ternary.cpp index fd97a95d..448a7c3d 100644 --- a/test_conformance/math_brute_force/ternary.cpp +++ b/test_conformance/math_brute_force/ternary.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -35,15 +35,29 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p, bool relaxedMode) { - const char *c[] = { - "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in1, __global float", sizeNames[vectorSize], "* in2, __global float", sizeNames[vectorSize], "* in3 )\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in1[i], in2[i], in3[i] );\n" - "}\n" - }; + const char *c[] = { "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float", + sizeNames[vectorSize], + "* out, __global float", + sizeNames[vectorSize], + "* in1, __global float", + sizeNames[vectorSize], + "* in2, __global float", + sizeNames[vectorSize], + "* in3 )\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i], in3[i] );\n" + "}\n" }; - const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* in, __global float* in2 , __global float* in3)\n" + const char *c3[] = { + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float* out, __global float* in, __global float* in2 , " + "__global float* in3)\n" "{\n" " size_t i = get_global_id(0);\n" " if( i + 1 < get_global_size(0) )\n" @@ -51,12 +65,16 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, " float3 f0 = vload3( 0, in + 3 * i );\n" " float3 f1 = vload3( 0, in2 + 3 * i );\n" " float3 f2 = vload3( 0, in3 + 3 * i );\n" - " f0 = ", name, "( f0, f1, f2 );\n" + " f0 = ", + name, + "( f0, f1, f2 );\n" " vstore3( f0, 0, out + 3*i );\n" " }\n" " else\n" " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two buffer size \n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two " + "buffer size \n" " float3 f0, f1, f2;\n" " switch( parity )\n" " {\n" @@ -71,7 +89,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, " f2 = (float3)( in3[3*i], in3[3*i+1], NAN ); \n" " break;\n" " }\n" - " f0 = ", name, "( f0, f1, f2 );\n" + " f0 = ", + name, + "( f0, f1, f2 );\n" " switch( parity )\n" " {\n" " case 0:\n" @@ -86,16 +106,17 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } @@ -103,17 +124,31 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, cl_program *p, bool relaxedMode) { - const char *c[] = { - "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in1, __global double", sizeNames[vectorSize], "* in2, __global double", sizeNames[vectorSize], "* in3 )\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in1[i], in2[i], in3[i] );\n" - "}\n" - }; + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double", + sizeNames[vectorSize], + "* out, 
__global double", + sizeNames[vectorSize], + "* in1, __global double", + sizeNames[vectorSize], + "* in2, __global double", + sizeNames[vectorSize], + "* in3 )\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i], in3[i] );\n" + "}\n" }; - const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global double* in, __global double* in2 , __global double* in3)\n" + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double* out, __global double* in, __global double* in2 , " + "__global double* in3)\n" "{\n" " size_t i = get_global_id(0);\n" " if( i + 1 < get_global_size(0) )\n" @@ -121,12 +156,16 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, " double3 d0 = vload3( 0, in + 3 * i );\n" " double3 d1 = vload3( 0, in2 + 3 * i );\n" " double3 d2 = vload3( 0, in3 + 3 * i );\n" - " d0 = ", name, "( d0, d1, d2 );\n" + " d0 = ", + name, + "( d0, d1, d2 );\n" " vstore3( d0, 0, out + 3*i );\n" " }\n" " else\n" " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" " double3 d0, d1, d2;\n" " switch( parity )\n" " {\n" @@ -141,7 +180,9 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, " d2 = (double3)( in3[3*i], in3[3*i+1], NAN ); \n" " break;\n" " }\n" - " d0 = ", name, "( d0, d1, d2 );\n" + " d0 = ", + name, + "( d0, d1, d2 );\n" " switch( parity )\n" " {\n" " case 0:\n" @@ -156,42 +197,47 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } typedef struct BuildKernelInfo { - cl_uint offset; // the first vector size to build - cl_kernel *kernels; - cl_program *programs; - const char *nameInCode; + cl_uint offset; // the first vector size to build + cl_kernel *kernels; + cl_program *programs; + const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-}BuildKernelInfo; +} BuildKernelInfo; -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernel(info->nameInCode, i, info->kernels + i, info->programs + i, info->relaxedMode); } -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernelDouble(info->nameInCode, i, info->kernels + i, info->programs + i, info->relaxedMode); @@ -200,18 +246,85 @@ static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, vo // A table of more difficult cases to get right static const float specialValuesFloat[] = { - -NAN, -INFINITY, -FLT_MAX, MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39), MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38), - -3.0f, MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), -2.5f, MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), -2.0f, MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), -1.75f, -1.5f, -1.25f, MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24), 
MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), MAKE_HEX_FLOAT(-0x1.003p0f, -0x1003000L, -24), -MAKE_HEX_FLOAT(0x1.001p0f, 0x1001000L, -24), -1.0f, MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25), - MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), -FLT_MIN, MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150), - MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), -0.0f, + -NAN, + -INFINITY, + -FLT_MAX, + MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), + MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), + MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39), + MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), + MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), + MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38), + -3.0f, + MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), + -2.5f, + MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), + -2.0f, + MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), + -1.75f, + -1.5f, + -1.25f, + MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24), + MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), + MAKE_HEX_FLOAT(-0x1.003p0f, -0x1003000L, -24), + -MAKE_HEX_FLOAT(0x1.001p0f, 0x1001000L, -24), + -1.0f, + MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25), + MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), + -FLT_MIN, + MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), + MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), + MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), + MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), + MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), + MAKE_HEX_FLOAT(-0x0.00000ap-126f, 
-0x000000aL, -150), + MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), + MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), + MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), + MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), + -0.0f, - +NAN, +INFINITY, +FLT_MAX, MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38), - +3.0f, MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), 2.5f, MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),+2.0f, MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), 1.75f, 1.5f, 1.25f, MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), MAKE_HEX_FLOAT(0x1.003p0f, 0x1003000L, -24), +MAKE_HEX_FLOAT(0x1.001p0f, 0x1001000L, -24), +1.0f, MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25), - MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), +FLT_MIN, MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150), - MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), +0.0f + +NAN, + +INFINITY, + +FLT_MAX, + MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), + MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), + MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), + MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), + MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), + MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38), + +3.0f, + MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), + 
2.5f, + MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23), + +2.0f, + MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), + 1.75f, + 1.5f, + 1.25f, + MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), + MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), + MAKE_HEX_FLOAT(0x1.003p0f, 0x1003000L, -24), + +MAKE_HEX_FLOAT(0x1.001p0f, 0x1001000L, -24), + +1.0f, + MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25), + MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), + +FLT_MIN, + MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), + MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), + MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), + MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), + MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), + MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150), + MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), + MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), + MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), + MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), + +0.0f }; -static size_t specialValuesFloatCount = sizeof( specialValuesFloat ) / sizeof( specialValuesFloat[0] ); +static size_t specialValuesFloatCount = + sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]); int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode) @@ -219,23 +332,23 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode) uint64_t i; uint32_t j, k; int error; - cl_program programs[ VECTOR_SIZE_COUNT ]; - cl_kernel kernels[ VECTOR_SIZE_COUNT ]; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError = 0.0f; int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); float maxErrorVal = 0.0f; float maxErrorVal2 = 0.0f; float maxErrorVal3 = 0.0f; - size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE; + size_t bufferSize = (gWimpyMode) ? 
gWimpyBufferSize : BUFFER_SIZE; uint64_t step = getTestStep(sizeof(float), bufferSize); - int skipNanInf = (0 == strcmp( "fma", f->nameInCode )) && ! gInfNanSupport; - cl_uchar overflow[BUFFER_SIZE / sizeof( float )]; + int skipNanInf = (0 == strcmp("fma", f->nameInCode)) && !gInfNanSupport; + cl_uchar overflow[BUFFER_SIZE / sizeof(float)]; float float_ulps; logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); - if( gIsEmbedded ) + if (gIsEmbedded) float_ulps = f->float_embedded_ulps; else float_ulps = f->float_ulps; @@ -243,469 +356,570 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode) // Init the kernels BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_FloatFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) return error; /* for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) - if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) ) - return error; + if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, programs + + i) ) ) return error; */ - for( i = 0; i < (1ULL<<32); i += step ) + for (i = 0; i < (1ULL << 32); i += step) { - //Init input array + // Init input array uint32_t *p = (uint32_t *)gIn; uint32_t *p2 = (uint32_t *)gIn2; uint32_t *p3 = (uint32_t *)gIn3; j = 0; - if( i == 0 ) + if (i == 0) { // test edge cases float *fp = (float *)gIn; float *fp2 = (float *)gIn2; float *fp3 = (float *)gIn3; - uint32_t x, y, z; x = y = z = 0; - for( ; j < bufferSize / sizeof( float ); j++ ) + uint32_t x, y, z; + x = y = z = 0; + for (; j < bufferSize / sizeof(float); j++) { fp[j] = specialValuesFloat[x]; fp2[j] = specialValuesFloat[y]; fp3[j] = specialValuesFloat[z]; - if( ++x >= specialValuesFloatCount ) + if (++x >= specialValuesFloatCount) { x = 0; - if( ++y >= 
specialValuesFloatCount ) + if (++y >= specialValuesFloatCount) { y = 0; - if( ++z >= specialValuesFloatCount ) - break; + if (++z >= specialValuesFloatCount) break; } } } - if( j == bufferSize / sizeof( float ) ) - vlog_error( "Test Error: not all special cases tested!\n" ); + if (j == bufferSize / sizeof(float)) + vlog_error("Test Error: not all special cases tested!\n"); } - for( ; j < bufferSize / sizeof( float ); j++ ) + for (; j < bufferSize / sizeof(float); j++) { p[j] = genrand_int32(d); p2[j] = genrand_int32(d); p3[j] = genrand_int32(d); } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + bufferSize, gIn2, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, + bufferSize, gIn3, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); return error; } // write garbage into output arrays - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { uint32_t pattern = 0xffffdead; memset_pattern4(gOut[j], &pattern, bufferSize); - if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, 
NULL, NULL) )) + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); goto exit; } } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - size_t vectorSize = sizeof( cl_float ) * sizeValues[j]; - size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; // bufferSize / vectorSize rounded up - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; } - - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + size_t vectorSize = sizeof(cl_float) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2), + &gInBuffer2))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3), + &gInBuffer3))) + { + LogBuildError(programs[j]); + goto exit; + } + + if 
((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(gQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); - //Calculate the correctly rounded reference result + // Calculate the correctly rounded reference result float *r = (float *)gOut_Ref; float *s = (float *)gIn; float *s2 = (float *)gIn2; float *s3 = (float *)gIn3; - if( skipNanInf ) + if (skipNanInf) { - for( j = 0; j < bufferSize / sizeof( float ); j++ ) + for (j = 0; j < bufferSize / sizeof(float); j++) { feclearexcept(FE_OVERFLOW); - r[j] = (float) f->func.f_fma( s[j], s2[j], s3[j], CORRECTLY_ROUNDED ); - overflow[j] = FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)); + r[j] = + (float)f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED); + overflow[j] = + FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)); } } else { - for( j = 0; j < bufferSize / sizeof( float ); j++ ) - r[j] = (float) f->func.f_fma( s[j], s2[j], s3[j], CORRECTLY_ROUNDED ); + for (j = 0; j < bufferSize / sizeof(float); j++) + r[j] = + (float)f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED); } // Read the data back - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) ) + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "ReadArray failed %d\n", error ); + vlog_error("ReadArray failed %d\n", error); goto exit; } } - if( gSkipCorrectnessTesting ) - break; + if (gSkipCorrectnessTesting) break; - //Verify data + // Verify data uint32_t *t = (uint32_t *)gOut_Ref; - for( j = 0; j < bufferSize / sizeof( float ); j++ ) + for (j = 0; j < bufferSize / sizeof(float); j++) 
{ - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { uint32_t *q = (uint32_t *)(gOut[k]); // If we aren't getting the correctly rounded result - if( t[j] != q[j] ) + if (t[j] != q[j]) { float err; int fail; - float test = ((float*) q)[j]; - float correct = f->func.f_fma( s[j], s2[j], s3[j], CORRECTLY_ROUNDED ); + float test = ((float *)q)[j]; + float correct = + f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED); - // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow - if( skipNanInf ) + // Per section 10 paragraph 6, accept any result if an input + // or output is a infinity or NaN or overflow + if (skipNanInf) { - if( overflow[j] || - IsFloatInfinity(correct) || IsFloatNaN(correct) || - IsFloatInfinity(s[j]) || IsFloatNaN(s[j]) || - IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j]) || - IsFloatInfinity(s3[j]) || IsFloatNaN(s3[j]) ) + if (overflow[j] || IsFloatInfinity(correct) + || IsFloatNaN(correct) || IsFloatInfinity(s[j]) + || IsFloatNaN(s[j]) || IsFloatInfinity(s2[j]) + || IsFloatNaN(s2[j]) || IsFloatInfinity(s3[j]) + || IsFloatNaN(s3[j])) continue; } - err = Ulp_Error( test, correct ); - fail = ! 
(fabsf(err) <= float_ulps); + err = Ulp_Error(test, correct); + fail = !(fabsf(err) <= float_ulps); - if( fail && ftz ) + if (fail && ftz) { float correct2, err2; // retry per section 6.5.3.2 with flushing on - if( 0.0f == test && 0.0f == f->func.f_fma( s[j], s2[j], s3[j], FLUSHED ) ) + if (0.0f == test + && 0.0f + == f->func.f_fma(s[j], s2[j], s3[j], FLUSHED)) { fail = 0; err = 0.0f; } // retry per section 6.5.3.3 - if( fail && IsFloatSubnormal( s[j] ) ) + if (fail && IsFloatSubnormal(s[j])) { // look at me, float err3, correct3; - if( skipNanInf ) - feclearexcept( FE_OVERFLOW ); + if (skipNanInf) feclearexcept(FE_OVERFLOW); - correct2 = f->func.f_fma( 0.0f, s2[j], s3[j], CORRECTLY_ROUNDED ); - correct3 = f->func.f_fma( -0.0f, s2[j], s3[j], CORRECTLY_ROUNDED ); + correct2 = f->func.f_fma(0.0f, s2[j], s3[j], + CORRECTLY_ROUNDED); + correct3 = f->func.f_fma(-0.0f, s2[j], s3[j], + CORRECTLY_ROUNDED); - if( skipNanInf ) + if (skipNanInf) { - if( fetestexcept( FE_OVERFLOW ) ) - continue; + if (fetestexcept(FE_OVERFLOW)) continue; - // Note: no double rounding here. Reference functions calculate in single precision. - if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) || - IsFloatInfinity(correct3) || IsFloatNaN(correct3) ) + // Note: no double rounding here. Reference + // functions calculate in single precision. 
+ if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3)) continue; } - err2 = Ulp_Error( test, correct2 ); - err3 = Ulp_Error( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= float_ulps)) && (!(fabsf(err3) <= float_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; + err2 = Ulp_Error(test, correct2); + err3 = Ulp_Error(test, correct3); + fail = fail + && ((!(fabsf(err2) <= float_ulps)) + && (!(fabsf(err3) <= float_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if( 0.0f == test && - ( 0.0f == f->func.f_fma( 0.0f, s2[j], s3[j], FLUSHED ) || - 0.0f == f->func.f_fma( -0.0f, s2[j], s3[j], FLUSHED ) ) - ) + if (0.0f == test + && (0.0f + == f->func.f_fma(0.0f, s2[j], s3[j], + FLUSHED) + || 0.0f + == f->func.f_fma(-0.0f, s2[j], s3[j], + FLUSHED))) { fail = 0; err = 0.0f; } - //try with first two args as zero - if( IsFloatSubnormal( s2[j] ) ) + // try with first two args as zero + if (IsFloatSubnormal(s2[j])) { // its fun to have fun, double correct4, correct5; float err4, err5; - if( skipNanInf ) - feclearexcept( FE_OVERFLOW ); + if (skipNanInf) feclearexcept(FE_OVERFLOW); - correct2 = f->func.f_fma( 0.0f, 0.0f, s3[j], CORRECTLY_ROUNDED ); - correct3 = f->func.f_fma( -0.0f, 0.0f, s3[j], CORRECTLY_ROUNDED ); - correct4 = f->func.f_fma( 0.0f, -0.0f, s3[j], CORRECTLY_ROUNDED ); - correct5 = f->func.f_fma( -0.0f, -0.0f, s3[j], CORRECTLY_ROUNDED ); + correct2 = f->func.f_fma(0.0f, 0.0f, s3[j], + CORRECTLY_ROUNDED); + correct3 = f->func.f_fma(-0.0f, 0.0f, s3[j], + CORRECTLY_ROUNDED); + correct4 = f->func.f_fma(0.0f, -0.0f, s3[j], + CORRECTLY_ROUNDED); + correct5 = f->func.f_fma(-0.0f, -0.0f, s3[j], + CORRECTLY_ROUNDED); - // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow - if( !gInfNanSupport ) + // Per section 10 paragraph 
6, accept any result + // if an input or output is a infinity or NaN or + // overflow + if (!gInfNanSupport) { - if( fetestexcept(FE_OVERFLOW) ) - continue; + if (fetestexcept(FE_OVERFLOW)) continue; - // Note: no double rounding here. Reference functions calculate in single precision. - if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) || - IsFloatInfinity(correct3) || IsFloatNaN(correct3) || - IsFloatInfinity(correct4) || IsFloatNaN(correct4) || - IsFloatInfinity(correct5) || IsFloatNaN(correct5) ) + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3) + || IsFloatInfinity(correct4) + || IsFloatNaN(correct4) + || IsFloatInfinity(correct5) + || IsFloatNaN(correct5)) continue; } - err2 = Ulp_Error( test, correct2 ); - err3 = Ulp_Error( test, correct3 ); - err4 = Ulp_Error( test, correct4 ); - err5 = Ulp_Error( test, correct5 ); - fail = fail && ((!(fabsf(err2) <= float_ulps)) && (!(fabsf(err3) <= float_ulps)) && - (!(fabsf(err4) <= float_ulps)) && (!(fabsf(err5) <= float_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( fabsf( err4 ) < fabsf(err ) ) - err = err4; - if( fabsf( err5 ) < fabsf(err ) ) - err = err5; + err2 = Ulp_Error(test, correct2); + err3 = Ulp_Error(test, correct3); + err4 = Ulp_Error(test, correct4); + err5 = Ulp_Error(test, correct5); + fail = fail + && ((!(fabsf(err2) <= float_ulps)) + && (!(fabsf(err3) <= float_ulps)) + && (!(fabsf(err4) <= float_ulps)) + && (!(fabsf(err5) <= float_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; // retry per section 6.5.3.4 - if( 0.0f == test && - ( 0.0f == f->func.f_fma( 0.0f, 0.0f, s3[j], FLUSHED ) || - 0.0f == f->func.f_fma( -0.0f, 0.0f, s3[j], FLUSHED ) || 
- 0.0f == f->func.f_fma( 0.0f, -0.0f, s3[j], FLUSHED ) || - 0.0f == f->func.f_fma( -0.0f, -0.0f, s3[j], FLUSHED ) ) - ) + if (0.0f == test + && (0.0f + == f->func.f_fma(0.0f, 0.0f, s3[j], + FLUSHED) + || 0.0f + == f->func.f_fma(-0.0f, 0.0f, s3[j], + FLUSHED) + || 0.0f + == f->func.f_fma(0.0f, -0.0f, s3[j], + FLUSHED) + || 0.0f + == f->func.f_fma(-0.0f, -0.0f, + s3[j], FLUSHED))) { fail = 0; err = 0.0f; } - if( IsFloatSubnormal( s3[j] ) ) + if (IsFloatSubnormal(s3[j])) { - if( test == 0.0f ) // 0*0+0 is 0 + if (test == 0.0f) // 0*0+0 is 0 { fail = 0; err = 0.0f; } } } - else if( IsFloatSubnormal( s3[j] ) ) + else if (IsFloatSubnormal(s3[j])) { double correct4, correct5; float err4, err5; - if( skipNanInf ) - feclearexcept( FE_OVERFLOW ); + if (skipNanInf) feclearexcept(FE_OVERFLOW); - correct2 = f->func.f_fma( 0.0f, s2[j], 0.0f, CORRECTLY_ROUNDED ); - correct3 = f->func.f_fma( -0.0f, s2[j], 0.0f, CORRECTLY_ROUNDED ); - correct4 = f->func.f_fma( 0.0f, s2[j], -0.0f, CORRECTLY_ROUNDED ); - correct5 = f->func.f_fma( -0.0f, s2[j], -0.0f, CORRECTLY_ROUNDED ); + correct2 = f->func.f_fma(0.0f, s2[j], 0.0f, + CORRECTLY_ROUNDED); + correct3 = f->func.f_fma(-0.0f, s2[j], 0.0f, + CORRECTLY_ROUNDED); + correct4 = f->func.f_fma(0.0f, s2[j], -0.0f, + CORRECTLY_ROUNDED); + correct5 = f->func.f_fma(-0.0f, s2[j], -0.0f, + CORRECTLY_ROUNDED); - // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow - if( !gInfNanSupport ) + // Per section 10 paragraph 6, accept any result + // if an input or output is a infinity or NaN or + // overflow + if (!gInfNanSupport) { - if( fetestexcept(FE_OVERFLOW) ) - continue; + if (fetestexcept(FE_OVERFLOW)) continue; - // Note: no double rounding here. Reference functions calculate in single precision. 
- if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) || - IsFloatInfinity(correct3) || IsFloatNaN(correct3) || - IsFloatInfinity(correct4) || IsFloatNaN(correct4) || - IsFloatInfinity(correct5) || IsFloatNaN(correct5) ) + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3) + || IsFloatInfinity(correct4) + || IsFloatNaN(correct4) + || IsFloatInfinity(correct5) + || IsFloatNaN(correct5)) continue; } - err2 = Ulp_Error( test, correct2 ); - err3 = Ulp_Error( test, correct3 ); - err4 = Ulp_Error( test, correct4 ); - err5 = Ulp_Error( test, correct5 ); - fail = fail && ((!(fabsf(err2) <= float_ulps)) && (!(fabsf(err3) <= float_ulps)) && - (!(fabsf(err4) <= float_ulps)) && (!(fabsf(err5) <= float_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( fabsf( err4 ) < fabsf(err ) ) - err = err4; - if( fabsf( err5 ) < fabsf(err ) ) - err = err5; + err2 = Ulp_Error(test, correct2); + err3 = Ulp_Error(test, correct3); + err4 = Ulp_Error(test, correct4); + err5 = Ulp_Error(test, correct5); + fail = fail + && ((!(fabsf(err2) <= float_ulps)) + && (!(fabsf(err3) <= float_ulps)) + && (!(fabsf(err4) <= float_ulps)) + && (!(fabsf(err5) <= float_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; // retry per section 6.5.3.4 - if( 0.0f == test && - ( 0.0f == f->func.f_fma( 0.0f, s2[j], 0.0f, FLUSHED ) || - 0.0f == f->func.f_fma(-0.0f, s2[j], 0.0f, FLUSHED ) || - 0.0f == f->func.f_fma( 0.0f, s2[j],-0.0f, FLUSHED ) || - 0.0f == f->func.f_fma(-0.0f, s2[j],-0.0f, FLUSHED ) ) - ) + if (0.0f == test + && (0.0f + == f->func.f_fma(0.0f, s2[j], 0.0f, + FLUSHED) + || 0.0f + == f->func.f_fma(-0.0f, s2[j], 0.0f, + FLUSHED) + || 0.0f + == 
f->func.f_fma(0.0f, s2[j], -0.0f, + FLUSHED) + || 0.0f + == f->func.f_fma(-0.0f, s2[j], + -0.0f, FLUSHED))) { fail = 0; err = 0.0f; } } } - else if( fail && IsFloatSubnormal( s2[j] ) ) + else if (fail && IsFloatSubnormal(s2[j])) { double correct2, correct3; float err2, err3; - if( skipNanInf ) - feclearexcept( FE_OVERFLOW ); + if (skipNanInf) feclearexcept(FE_OVERFLOW); - correct2 = f->func.f_fma( s[j], 0.0f, s3[j], CORRECTLY_ROUNDED ); - correct3 = f->func.f_fma( s[j], -0.0f, s3[j], CORRECTLY_ROUNDED ); + correct2 = f->func.f_fma(s[j], 0.0f, s3[j], + CORRECTLY_ROUNDED); + correct3 = f->func.f_fma(s[j], -0.0f, s3[j], + CORRECTLY_ROUNDED); - if( skipNanInf ) + if (skipNanInf) { - if( fetestexcept( FE_OVERFLOW ) ) - continue; + if (fetestexcept(FE_OVERFLOW)) continue; - // Note: no double rounding here. Reference functions calculate in single precision. - if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) || - IsFloatInfinity(correct3) || IsFloatNaN(correct3) ) + // Note: no double rounding here. Reference + // functions calculate in single precision. 
+ if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3)) continue; } - err2 = Ulp_Error( test, correct2 ); - err3 = Ulp_Error( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= float_ulps)) && (!(fabsf(err3) <= float_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; + err2 = Ulp_Error(test, correct2); + err3 = Ulp_Error(test, correct3); + fail = fail + && ((!(fabsf(err2) <= float_ulps)) + && (!(fabsf(err3) <= float_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if( 0.0f == test && - ( 0.0f == f->func.f_fma( s[j], 0.0f, s3[j], FLUSHED ) || - 0.0f == f->func.f_fma( s[j], -0.0f, s3[j], FLUSHED ) ) - ) + if (0.0f == test + && (0.0f + == f->func.f_fma(s[j], 0.0f, s3[j], + FLUSHED) + || 0.0f + == f->func.f_fma(s[j], -0.0f, s3[j], + FLUSHED))) { fail = 0; err = 0.0f; } - //try with second two args as zero - if( IsFloatSubnormal( s3[j] ) ) + // try with second two args as zero + if (IsFloatSubnormal(s3[j])) { double correct4, correct5; float err4, err5; - if( skipNanInf ) - feclearexcept( FE_OVERFLOW ); + if (skipNanInf) feclearexcept(FE_OVERFLOW); - correct2 = f->func.f_fma( s[j], 0.0f, 0.0f, CORRECTLY_ROUNDED ); - correct3 = f->func.f_fma( s[j], -0.0f, 0.0f, CORRECTLY_ROUNDED ); - correct4 = f->func.f_fma( s[j], 0.0f, -0.0f, CORRECTLY_ROUNDED ); - correct5 = f->func.f_fma( s[j], -0.0f, -0.0f, CORRECTLY_ROUNDED ); + correct2 = f->func.f_fma(s[j], 0.0f, 0.0f, + CORRECTLY_ROUNDED); + correct3 = f->func.f_fma(s[j], -0.0f, 0.0f, + CORRECTLY_ROUNDED); + correct4 = f->func.f_fma(s[j], 0.0f, -0.0f, + CORRECTLY_ROUNDED); + correct5 = f->func.f_fma(s[j], -0.0f, -0.0f, + CORRECTLY_ROUNDED); - // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow - if( !gInfNanSupport ) + // Per section 10 paragraph 6, accept any result + // if an 
input or output is a infinity or NaN or + // overflow + if (!gInfNanSupport) { - if( fetestexcept(FE_OVERFLOW) ) - continue; + if (fetestexcept(FE_OVERFLOW)) continue; - // Note: no double rounding here. Reference functions calculate in single precision. - if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) || - IsFloatInfinity(correct3) || IsFloatNaN(correct3) || - IsFloatInfinity(correct4) || IsFloatNaN(correct4) || - IsFloatInfinity(correct5) || IsFloatNaN(correct5) ) + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3) + || IsFloatInfinity(correct4) + || IsFloatNaN(correct4) + || IsFloatInfinity(correct5) + || IsFloatNaN(correct5)) continue; } - err2 = Ulp_Error( test, correct2 ); - err3 = Ulp_Error( test, correct3 ); - err4 = Ulp_Error( test, correct4 ); - err5 = Ulp_Error( test, correct5 ); - fail = fail && ((!(fabsf(err2) <= float_ulps)) && (!(fabsf(err3) <= float_ulps)) && - (!(fabsf(err4) <= float_ulps)) && (!(fabsf(err5) <= float_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( fabsf( err4 ) < fabsf(err ) ) - err = err4; - if( fabsf( err5 ) < fabsf(err ) ) - err = err5; + err2 = Ulp_Error(test, correct2); + err3 = Ulp_Error(test, correct3); + err4 = Ulp_Error(test, correct4); + err5 = Ulp_Error(test, correct5); + fail = fail + && ((!(fabsf(err2) <= float_ulps)) + && (!(fabsf(err3) <= float_ulps)) + && (!(fabsf(err4) <= float_ulps)) + && (!(fabsf(err5) <= float_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; // retry per section 6.5.3.4 - if( 0.0f == test && - ( 0.0f == f->func.f_fma( s[j], 0.0f, 0.0f, FLUSHED ) || - 0.0f == f->func.f_fma( s[j],-0.0f, 0.0f, FLUSHED ) || - 0.0f == f->func.f_fma( s[j], 
0.0f,-0.0f, FLUSHED ) || - 0.0f == f->func.f_fma( s[j],-0.0f,-0.0f, FLUSHED ) ) - ) + if (0.0f == test + && (0.0f + == f->func.f_fma(s[j], 0.0f, 0.0f, + FLUSHED) + || 0.0f + == f->func.f_fma(s[j], -0.0f, 0.0f, + FLUSHED) + || 0.0f + == f->func.f_fma(s[j], 0.0f, -0.0f, + FLUSHED) + || 0.0f + == f->func.f_fma(s[j], -0.0f, -0.0f, + FLUSHED))) { fail = 0; err = 0.0f; } } } - else if( fail && IsFloatSubnormal(s3[j]) ) + else if (fail && IsFloatSubnormal(s3[j])) { double correct2, correct3; float err2, err3; - if( skipNanInf ) - feclearexcept( FE_OVERFLOW ); + if (skipNanInf) feclearexcept(FE_OVERFLOW); - correct2 = f->func.f_fma( s[j], s2[j], 0.0f, CORRECTLY_ROUNDED ); - correct3 = f->func.f_fma( s[j], s2[j], -0.0f, CORRECTLY_ROUNDED ); + correct2 = f->func.f_fma(s[j], s2[j], 0.0f, + CORRECTLY_ROUNDED); + correct3 = f->func.f_fma(s[j], s2[j], -0.0f, + CORRECTLY_ROUNDED); - if( skipNanInf ) + if (skipNanInf) { - if( fetestexcept( FE_OVERFLOW ) ) - continue; + if (fetestexcept(FE_OVERFLOW)) continue; - // Note: no double rounding here. Reference functions calculate in single precision. - if( IsFloatInfinity(correct2) || IsFloatNaN(correct2) || - IsFloatInfinity(correct3) || IsFloatNaN(correct3) ) + // Note: no double rounding here. Reference + // functions calculate in single precision. 
+ if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3)) continue; } - err2 = Ulp_Error( test, correct2 ); - err3 = Ulp_Error( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= float_ulps)) && (!(fabsf(err3) <= float_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; + err2 = Ulp_Error(test, correct2); + err3 = Ulp_Error(test, correct3); + fail = fail + && ((!(fabsf(err2) <= float_ulps)) + && (!(fabsf(err3) <= float_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if( 0.0f == test && - ( 0.0f == f->func.f_fma( s[j], s2[j], 0.0f, FLUSHED ) || - 0.0f == f->func.f_fma( s[j], s2[j],-0.0f, FLUSHED ) ) - ) + if (0.0f == test + && (0.0f + == f->func.f_fma(s[j], s2[j], 0.0f, + FLUSHED) + || 0.0f + == f->func.f_fma(s[j], s2[j], -0.0f, + FLUSHED))) { fail = 0; err = 0.0f; @@ -713,7 +927,7 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode) } } - if( fabsf(err ) > maxError ) + if (fabsf(err) > maxError) { maxError = fabsf(err); maxErrorVal = s[j]; @@ -721,9 +935,14 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode) maxErrorVal3 = s3[j]; } - if( fail ) + if (fail) { - vlog_error( "\nERROR: %s%s: %f ulp error at {%a, %a, %a} ({0x%8.8x, 0x%8.8x, 0x%8.8x}): *%a vs. %a\n", f->name, sizeNames[k], err, s[j], s2[j], s3[j], ((cl_uint*)s)[j], ((cl_uint*)s2)[j], ((cl_uint*)s3)[j], ((float*) gOut_Ref)[j], test ); + vlog_error( + "\nERROR: %s%s: %f ulp error at {%a, %a, %a} " + "({0x%8.8x, 0x%8.8x, 0x%8.8x}): *%a vs. 
%a\n", + f->name, sizeNames[k], err, s[j], s2[j], s3[j], + ((cl_uint *)s)[j], ((cl_uint *)s2)[j], + ((cl_uint *)s3)[j], ((float *)gOut_Ref)[j], test); error = -1; goto exit; } @@ -731,105 +950,135 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode) } } - if( 0 == (i & 0x0fffffff) ) + if (0 == (i & 0x0fffffff)) { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10u bufferSize:%10zd \n", i, step, bufferSize); - } else - { - vlog("." ); - } - fflush(stdout); + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u bufferSize:%10zd \n", i, step, + bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); } } - if( ! gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input array + // Init input array uint32_t *p = (uint32_t *)gIn; uint32_t *p2 = (uint32_t *)gIn2; uint32_t *p3 = (uint32_t *)gIn3; - for( j = 0; j < bufferSize / sizeof( float ); j++ ) + for (j = 0; j < bufferSize / sizeof(float); j++) { p[j] = genrand_int32(d); p2[j] = genrand_int32(d); p3[j] = genrand_int32(d); } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + bufferSize, gIn2, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, 
gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, + bufferSize, gIn3, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - size_t vectorSize = sizeof( cl_float ) * sizeValues[j]; - size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; // bufferSize / vectorSize rounded up - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; } + size_t vectorSize = sizeof(cl_float) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2), + &gInBuffer2))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3), + &gInBuffer3))) + { + LogBuildError(programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( k = 0; k < PERF_LOOP_COUNT; k++ ) + for (k = 0; k < PERF_LOOP_COUNT; k++) { uint64_t startTime 
= GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(float)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", + f->name, sizeNames[j]); } } - if( ! 
gSkipCorrectnessTesting ) - vlog( "\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, maxErrorVal3 ); - vlog( "\n" ); + if (!gSkipCorrectnessTesting) + vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, + maxErrorVal3); + vlog("\n"); exit: // Release - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); clReleaseProgram(programs[k]); @@ -840,18 +1089,75 @@ exit: // A table of more difficult cases to get right static const double specialValuesDouble[] = { - -NAN, -INFINITY, -DBL_MAX, MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11), MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10), - -3.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), -2.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), -2.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), -1.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), -1.0, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53), - MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), -DBL_MIN, MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), 
MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), -0.0, + -NAN, + -INFINITY, + -DBL_MAX, + MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), + MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11), + MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), + MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10), + -3.0, + MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), + -2.5, + MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), + -2.0, + MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), + -1.5, + MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52), + MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), + -1.0, + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53), + MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), + -DBL_MIN, + MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), + MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), + MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), + MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), + MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), + -0.0, - +NAN, +INFINITY, +DBL_MAX, MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11), MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), 
MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10), - +3.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), +2.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), +2.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), +1.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), +1.0, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53), - MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), +DBL_MIN, MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), +0.0, + +NAN, + +INFINITY, + +DBL_MAX, + MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), + MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11), + MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), + MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10), + +3.0, + MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), + +2.5, + MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), + +2.0, + MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), + +1.5, + MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52), + MAKE_HEX_DOUBLE(-0x1.0000000000001p0, 
-0x10000000000001LL, -52), + +1.0, + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53), + MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), + +DBL_MIN, + MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), + MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), + +0.0, }; -static const size_t specialValuesDoubleCount = sizeof( specialValuesDouble ) / sizeof( specialValuesDouble[0] ); +static const size_t specialValuesDoubleCount = + sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]); int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d, @@ -860,8 +1166,8 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d, uint64_t i; uint32_t j, k; int error; - cl_program programs[ VECTOR_SIZE_COUNT ]; - cl_kernel kernels[ VECTOR_SIZE_COUNT ]; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError = 0.0f; int ftz = f->ftz || gForceFTZ; double maxErrorVal = 0.0f; @@ -869,7 +1175,7 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d, double maxErrorVal3 = 0.0f; logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); - size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE; + size_t bufferSize = (gWimpyMode) ? 
gWimpyBufferSize : BUFFER_SIZE; uint64_t step = getTestStep(sizeof(double), bufferSize); Force64BitFPUPrecision(); @@ -877,360 +1183,463 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d, // Init the kernels BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_DoubleFn, + if ((error = ThreadPool_Do(BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info ) )) + &build_info))) { return error; } /* for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) - if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) ) - return error; + if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + i, + programs + i) ) ) return error; */ - for( i = 0; i < (1ULL<<32); i += step ) + for (i = 0; i < (1ULL << 32); i += step) { - //Init input array + // Init input array double *p = (double *)gIn; double *p2 = (double *)gIn2; double *p3 = (double *)gIn3; j = 0; - if( i == 0 ) + if (i == 0) { // test edge cases - uint32_t x, y, z; x = y = z = 0; - for( ; j < bufferSize / sizeof( double ); j++ ) + uint32_t x, y, z; + x = y = z = 0; + for (; j < bufferSize / sizeof(double); j++) { p[j] = specialValuesDouble[x]; p2[j] = specialValuesDouble[y]; p3[j] = specialValuesDouble[z]; - if( ++x >= specialValuesDoubleCount ) + if (++x >= specialValuesDoubleCount) { x = 0; - if( ++y >= specialValuesDoubleCount ) + if (++y >= specialValuesDoubleCount) { y = 0; - if( ++z >= specialValuesDoubleCount ) - break; + if (++z >= specialValuesDoubleCount) break; } } } - if( j == bufferSize / sizeof( double ) ) - vlog_error( "Test Error: not all special cases tested!\n" ); + if (j == bufferSize / sizeof(double)) + vlog_error("Test Error: not all special cases tested!\n"); } - for( ; j < bufferSize / sizeof( double ); j++ ) + for (; j < bufferSize / sizeof(double); j++) { p[j] = DoubleFromUInt32(genrand_int32(d)); p2[j] = 
DoubleFromUInt32(genrand_int32(d)); p3[j] = DoubleFromUInt32(genrand_int32(d)); } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + bufferSize, gIn2, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, + bufferSize, gIn3, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); return error; } // write garbage into output arrays - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { uint32_t pattern = 0xffffdead; memset_pattern4(gOut[j], &pattern, bufferSize); - if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) )) + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); goto exit; } } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - size_t vectorSize = 
sizeof( cl_double ) * sizeValues[j]; - size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; // bufferSize / vectorSize rounded up - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; } - - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + size_t vectorSize = sizeof(cl_double) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2), + &gInBuffer2))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3), + &gInBuffer3))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(gQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); - //Calculate the correctly rounded reference result + // Calculate the correctly rounded reference result double *r = (double *)gOut_Ref; double *s = 
(double *)gIn; double *s2 = (double *)gIn2; double *s3 = (double *)gIn3; - for( j = 0; j < bufferSize / sizeof( double ); j++ ) - r[j] = (double) f->dfunc.f_fff( s[j], s2[j], s3[j] ); + for (j = 0; j < bufferSize / sizeof(double); j++) + r[j] = (double)f->dfunc.f_fff(s[j], s2[j], s3[j]); // Read the data back - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) ) + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "ReadArray failed %d\n", error ); + vlog_error("ReadArray failed %d\n", error); goto exit; } } - if( gSkipCorrectnessTesting ) - break; + if (gSkipCorrectnessTesting) break; - //Verify data + // Verify data uint64_t *t = (uint64_t *)gOut_Ref; - for( j = 0; j < bufferSize / sizeof( double ); j++ ) + for (j = 0; j < bufferSize / sizeof(double); j++) { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { uint64_t *q = (uint64_t *)(gOut[k]); // If we aren't getting the correctly rounded result - if( t[j] != q[j] ) + if (t[j] != q[j]) { - double test = ((double*) q)[j]; - long double correct = f->dfunc.f_fff( s[j], s2[j], s3[j] ); - float err = Bruteforce_Ulp_Error_Double( test, correct ); - int fail = ! (fabsf(err) <= f->double_ulps); + double test = ((double *)q)[j]; + long double correct = f->dfunc.f_fff(s[j], s2[j], s3[j]); + float err = Bruteforce_Ulp_Error_Double(test, correct); + int fail = !(fabsf(err) <= f->double_ulps); - if( fail && ftz ) + if (fail && ftz) { // retry per section 6.5.3.2 - if( IsDoubleSubnormal(correct) ) + if (IsDoubleSubnormal(correct)) { // look at me, - fail = fail && ( test != 0.0f ); - if( ! 
fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } // retry per section 6.5.3.3 - if( fail && IsDoubleSubnormal( s[j] ) ) + if (fail && IsDoubleSubnormal(s[j])) { // look at me, - long double correct2 = f->dfunc.f_fff( 0.0, s2[j], s3[j] ); - long double correct3 = f->dfunc.f_fff( -0.0, s2[j], s3[j] ); - float err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - float err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; + long double correct2 = + f->dfunc.f_fff(0.0, s2[j], s3[j]); + long double correct3 = + f->dfunc.f_fff(-0.0, s2[j], s3[j]); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + fail = fail + && ((!(fabsf(err2) <= f->double_ulps)) + && (!(fabsf(err3) <= f->double_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) ) + if (IsDoubleResultSubnormal(correct2, + f->double_ulps) + || IsDoubleResultSubnormal(correct3, + f->double_ulps)) { // look at me now, - fail = fail && ( test != 0.0f); - if( ! 
fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } - //try with first two args as zero - if( IsDoubleSubnormal( s2[j] ) ) + // try with first two args as zero + if (IsDoubleSubnormal(s2[j])) { // its fun to have fun, - correct2 = f->dfunc.f_fff( 0.0, 0.0, s3[j] ); - correct3 = f->dfunc.f_fff( -0.0, 0.0, s3[j] ); - long double correct4 = f->dfunc.f_fff( 0.0, -0.0, s3[j] ); - long double correct5 = f->dfunc.f_fff( -0.0, -0.0, s3[j] ); - err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - float err4 = Bruteforce_Ulp_Error_Double( test, correct4 ); - float err5 = Bruteforce_Ulp_Error_Double( test, correct5 ); - fail = fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) && - (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( fabsf( err4 ) < fabsf(err ) ) - err = err4; - if( fabsf( err5 ) < fabsf(err ) ) - err = err5; + correct2 = f->dfunc.f_fff(0.0, 0.0, s3[j]); + correct3 = f->dfunc.f_fff(-0.0, 0.0, s3[j]); + long double correct4 = + f->dfunc.f_fff(0.0, -0.0, s3[j]); + long double correct5 = + f->dfunc.f_fff(-0.0, -0.0, s3[j]); + err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + float err4 = + Bruteforce_Ulp_Error_Double(test, correct4); + float err5 = + Bruteforce_Ulp_Error_Double(test, correct5); + fail = fail + && ((!(fabsf(err2) <= f->double_ulps)) + && (!(fabsf(err3) <= f->double_ulps)) + && (!(fabsf(err4) <= f->double_ulps)) + && (!(fabsf(err5) <= f->double_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( 
correct3, f->double_ulps ) || - IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps ) ) + if (IsDoubleResultSubnormal(correct2, + f->double_ulps) + || IsDoubleResultSubnormal(correct3, + f->double_ulps) + || IsDoubleResultSubnormal(correct4, + f->double_ulps) + || IsDoubleResultSubnormal(correct5, + f->double_ulps)) { - fail = fail && ( test != 0.0f); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } - if( IsDoubleSubnormal( s3[j] ) ) + if (IsDoubleSubnormal(s3[j])) { // but you have to know how! - correct2 = f->dfunc.f_fff( 0.0, 0.0, 0.0f ); - correct3 = f->dfunc.f_fff( -0.0, 0.0, 0.0f ); - correct4 = f->dfunc.f_fff( 0.0, -0.0, 0.0f ); - correct5 = f->dfunc.f_fff( -0.0, -0.0, 0.0f ); - long double correct6 = f->dfunc.f_fff( 0.0, 0.0, -0.0f ); - long double correct7 = f->dfunc.f_fff( -0.0, 0.0, -0.0f ); - long double correct8 = f->dfunc.f_fff( 0.0, -0.0, -0.0f ); - long double correct9 = f->dfunc.f_fff( -0.0, -0.0, -0.0f ); - err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - err4 = Bruteforce_Ulp_Error_Double( test, correct4 ); - err5 = Bruteforce_Ulp_Error_Double( test, correct5 ); - float err6 = Bruteforce_Ulp_Error_Double( test, correct6 ); - float err7 = Bruteforce_Ulp_Error_Double( test, correct7 ); - float err8 = Bruteforce_Ulp_Error_Double( test, correct8 ); - float err9 = Bruteforce_Ulp_Error_Double( test, correct9 ); - fail = fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) && - (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps)) && - (!(fabsf(err5) <= f->double_ulps)) && (!(fabsf(err6) <= f->double_ulps)) && - (!(fabsf(err7) <= f->double_ulps)) && (!(fabsf(err8) <= f->double_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( fabsf( err4 ) < fabsf(err ) ) - err = err4; - if( fabsf( err5 ) 
< fabsf(err ) ) - err = err5; - if( fabsf( err6 ) < fabsf(err ) ) - err = err6; - if( fabsf( err7 ) < fabsf(err ) ) - err = err7; - if( fabsf( err8 ) < fabsf(err ) ) - err = err8; - if( fabsf( err9 ) < fabsf(err ) ) - err = err9; + correct2 = f->dfunc.f_fff(0.0, 0.0, 0.0f); + correct3 = f->dfunc.f_fff(-0.0, 0.0, 0.0f); + correct4 = f->dfunc.f_fff(0.0, -0.0, 0.0f); + correct5 = f->dfunc.f_fff(-0.0, -0.0, 0.0f); + long double correct6 = + f->dfunc.f_fff(0.0, 0.0, -0.0f); + long double correct7 = + f->dfunc.f_fff(-0.0, 0.0, -0.0f); + long double correct8 = + f->dfunc.f_fff(0.0, -0.0, -0.0f); + long double correct9 = + f->dfunc.f_fff(-0.0, -0.0, -0.0f); + err2 = Bruteforce_Ulp_Error_Double( + test, correct2); + err3 = Bruteforce_Ulp_Error_Double( + test, correct3); + err4 = Bruteforce_Ulp_Error_Double( + test, correct4); + err5 = Bruteforce_Ulp_Error_Double( + test, correct5); + float err6 = Bruteforce_Ulp_Error_Double( + test, correct6); + float err7 = Bruteforce_Ulp_Error_Double( + test, correct7); + float err8 = Bruteforce_Ulp_Error_Double( + test, correct8); + float err9 = Bruteforce_Ulp_Error_Double( + test, correct9); + fail = fail + && ((!(fabsf(err2) <= f->double_ulps)) + && (!(fabsf(err3) + <= f->double_ulps)) + && (!(fabsf(err4) + <= f->double_ulps)) + && (!(fabsf(err5) + <= f->double_ulps)) + && (!(fabsf(err5) + <= f->double_ulps)) + && (!(fabsf(err6) + <= f->double_ulps)) + && (!(fabsf(err7) + <= f->double_ulps)) + && (!(fabsf(err8) + <= f->double_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + if (fabsf(err6) < fabsf(err)) err = err6; + if (fabsf(err7) < fabsf(err)) err = err7; + if (fabsf(err8) < fabsf(err)) err = err8; + if (fabsf(err9) < fabsf(err)) err = err9; // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) || - 
IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps ) || - IsDoubleResultSubnormal( correct6, f->double_ulps ) || IsDoubleResultSubnormal( correct7, f->double_ulps ) || - IsDoubleResultSubnormal( correct8, f->double_ulps ) || IsDoubleResultSubnormal( correct9, f->double_ulps ) ) + if (IsDoubleResultSubnormal(correct2, + f->double_ulps) + || IsDoubleResultSubnormal( + correct3, f->double_ulps) + || IsDoubleResultSubnormal( + correct4, f->double_ulps) + || IsDoubleResultSubnormal( + correct5, f->double_ulps) + || IsDoubleResultSubnormal( + correct6, f->double_ulps) + || IsDoubleResultSubnormal( + correct7, f->double_ulps) + || IsDoubleResultSubnormal( + correct8, f->double_ulps) + || IsDoubleResultSubnormal( + correct9, f->double_ulps)) { - fail = fail && ( test != 0.0f); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } } } - else if( IsDoubleSubnormal( s3[j] ) ) + else if (IsDoubleSubnormal(s3[j])) { - correct2 = f->dfunc.f_fff( 0.0, s2[j], 0.0 ); - correct3 = f->dfunc.f_fff( -0.0, s2[j], 0.0 ); - long double correct4 = f->dfunc.f_fff( 0.0, s2[j], -0.0 ); - long double correct5 = f->dfunc.f_fff( -0.0, s2[j], -0.0 ); - err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - float err4 = Bruteforce_Ulp_Error_Double( test, correct4 ); - float err5 = Bruteforce_Ulp_Error_Double( test, correct5 ); - fail = fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) && - (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( fabsf( err4 ) < fabsf(err ) ) - err = err4; - if( fabsf( err5 ) < fabsf(err ) ) - err = err5; + correct2 = f->dfunc.f_fff(0.0, s2[j], 0.0); + correct3 = f->dfunc.f_fff(-0.0, s2[j], 0.0); + long double correct4 = + f->dfunc.f_fff(0.0, s2[j], -0.0); + long 
double correct5 = + f->dfunc.f_fff(-0.0, s2[j], -0.0); + err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + float err4 = + Bruteforce_Ulp_Error_Double(test, correct4); + float err5 = + Bruteforce_Ulp_Error_Double(test, correct5); + fail = fail + && ((!(fabsf(err2) <= f->double_ulps)) + && (!(fabsf(err3) <= f->double_ulps)) + && (!(fabsf(err4) <= f->double_ulps)) + && (!(fabsf(err5) <= f->double_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) || - IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps ) ) + if (IsDoubleResultSubnormal(correct2, + f->double_ulps) + || IsDoubleResultSubnormal(correct3, + f->double_ulps) + || IsDoubleResultSubnormal(correct4, + f->double_ulps) + || IsDoubleResultSubnormal(correct5, + f->double_ulps)) { - fail = fail && ( test != 0.0f); - if( ! 
fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } } } - else if( fail && IsDoubleSubnormal( s2[j] ) ) + else if (fail && IsDoubleSubnormal(s2[j])) { - long double correct2 = f->dfunc.f_fff( s[j], 0.0, s3[j] ); - long double correct3 = f->dfunc.f_fff( s[j], -0.0, s3[j] ); - float err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - float err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; + long double correct2 = + f->dfunc.f_fff(s[j], 0.0, s3[j]); + long double correct3 = + f->dfunc.f_fff(s[j], -0.0, s3[j]); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + fail = fail + && ((!(fabsf(err2) <= f->double_ulps)) + && (!(fabsf(err3) <= f->double_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) ) + if (IsDoubleResultSubnormal(correct2, + f->double_ulps) + || IsDoubleResultSubnormal(correct3, + f->double_ulps)) { - fail = fail && ( test != 0.0f); - if( ! 
fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } - //try with second two args as zero - if( IsDoubleSubnormal( s3[j] ) ) + // try with second two args as zero + if (IsDoubleSubnormal(s3[j])) { - correct2 = f->dfunc.f_fff( s[j], 0.0, 0.0 ); - correct3 = f->dfunc.f_fff( s[j], -0.0, 0.0 ); - long double correct4 = f->dfunc.f_fff( s[j], 0.0, -0.0 ); - long double correct5 = f->dfunc.f_fff( s[j], -0.0, -0.0 ); - err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - float err4 = Bruteforce_Ulp_Error_Double( test, correct4 ); - float err5 = Bruteforce_Ulp_Error_Double( test, correct5 ); - fail = fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps)) && - (!(fabsf(err4) <= f->double_ulps)) && (!(fabsf(err5) <= f->double_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; - if( fabsf( err4 ) < fabsf(err ) ) - err = err4; - if( fabsf( err5 ) < fabsf(err ) ) - err = err5; + correct2 = f->dfunc.f_fff(s[j], 0.0, 0.0); + correct3 = f->dfunc.f_fff(s[j], -0.0, 0.0); + long double correct4 = + f->dfunc.f_fff(s[j], 0.0, -0.0); + long double correct5 = + f->dfunc.f_fff(s[j], -0.0, -0.0); + err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + float err4 = + Bruteforce_Ulp_Error_Double(test, correct4); + float err5 = + Bruteforce_Ulp_Error_Double(test, correct5); + fail = fail + && ((!(fabsf(err2) <= f->double_ulps)) + && (!(fabsf(err3) <= f->double_ulps)) + && (!(fabsf(err4) <= f->double_ulps)) + && (!(fabsf(err5) <= f->double_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) || - 
IsDoubleResultSubnormal( correct4, f->double_ulps ) || IsDoubleResultSubnormal( correct5, f->double_ulps ) ) + if (IsDoubleResultSubnormal(correct2, + f->double_ulps) + || IsDoubleResultSubnormal(correct3, + f->double_ulps) + || IsDoubleResultSubnormal(correct4, + f->double_ulps) + || IsDoubleResultSubnormal(correct5, + f->double_ulps)) { - fail = fail && ( test != 0.0f); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } } } - else if( fail && IsDoubleSubnormal(s3[j]) ) + else if (fail && IsDoubleSubnormal(s3[j])) { - long double correct2 = f->dfunc.f_fff( s[j], s2[j], 0.0 ); - long double correct3 = f->dfunc.f_fff( s[j], s2[j], -0.0 ); - float err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - float err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= f->double_ulps)) && (!(fabsf(err3) <= f->double_ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; + long double correct2 = + f->dfunc.f_fff(s[j], s2[j], 0.0); + long double correct3 = + f->dfunc.f_fff(s[j], s2[j], -0.0); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + fail = fail + && ((!(fabsf(err2) <= f->double_ulps)) + && (!(fabsf(err3) <= f->double_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps ) ) + if (IsDoubleResultSubnormal(correct2, + f->double_ulps) + || IsDoubleResultSubnormal(correct3, + f->double_ulps)) { - fail = fail && ( test != 0.0f); - if( ! 
fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } } } - if( fabsf(err ) > maxError ) + if (fabsf(err) > maxError) { maxError = fabsf(err); maxErrorVal = s[j]; @@ -1238,9 +1647,12 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d, maxErrorVal3 = s3[j]; } - if( fail ) + if (fail) { - vlog_error( "\nERROR: %sD%s: %f ulp error at {%.13la, %.13la, %.13la}: *%.13la vs. %.13la\n", f->name, sizeNames[k], err, s[j], s2[j], s3[j], ((double*) gOut_Ref)[j], test ); + vlog_error("\nERROR: %sD%s: %f ulp error at {%.13la, " + "%.13la, %.13la}: *%.13la vs. %.13la\n", + f->name, sizeNames[k], err, s[j], s2[j], + s3[j], ((double *)gOut_Ref)[j], test); error = -1; goto exit; } @@ -1248,107 +1660,136 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d, } } - if( 0 == (i & 0x0fffffff) ) + if (0 == (i & 0x0fffffff)) { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize); - } else - { - vlog("." ); - } - fflush(stdout); + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, + bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); } } - if( ! 
gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input array + // Init input array double *p = (double *)gIn; double *p2 = (double *)gIn2; double *p3 = (double *)gIn3; - for( j = 0; j < bufferSize / sizeof( double ); j++ ) + for (j = 0; j < bufferSize / sizeof(double); j++) { p[j] = DoubleFromUInt32(genrand_int32(d)); p2[j] = DoubleFromUInt32(genrand_int32(d)); p3[j] = DoubleFromUInt32(genrand_int32(d)); } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, bufferSize, gIn2, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + bufferSize, gIn2, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); return error; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, bufferSize, gIn3, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, + bufferSize, gIn3, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - size_t vectorSize = sizeof( cl_double ) * sizeValues[j]; - size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; // bufferSize / 
vectorSize rounded up - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer2 ), &gInBuffer2 ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 3, sizeof( gInBuffer3 ), &gInBuffer3 ) )) { LogBuildError(programs[j]); goto exit; } + size_t vectorSize = sizeof(cl_double) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2), + &gInBuffer2))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3), + &gInBuffer3))) + { + LogBuildError(programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( k = 0; k < PERF_LOOP_COUNT; k++ ) + for (k = 0; k < PERF_LOOP_COUNT; k++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = 
SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(double)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", + f->name, sizeNames[j]); } - for( ; j < gMaxVectorSizeIndex; j++ ) - vlog( "\t -- " ); + for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); } - if( ! gSkipCorrectnessTesting ) - vlog( "\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, maxErrorVal3 ); - vlog( "\n" ); + if (!gSkipCorrectnessTesting) + vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, + maxErrorVal3); + vlog("\n"); exit: // Release - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); clReleaseProgram(programs[k]); @@ -1356,5 +1797,3 @@ exit: return error; } - - diff --git a/test_conformance/math_brute_force/unary.cpp b/test_conformance/math_brute_force/unary.cpp index 8ef33119..61a8546b 100644 --- a/test_conformance/math_brute_force/unary.cpp +++ b/test_conformance/math_brute_force/unary.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -18,8 +18,8 @@ #include #include "FunctionList.h" -#if defined( __APPLE__ ) - #include +#if defined(__APPLE__) +#include #endif int TestFunc_Float_Float(const Func *f, MTdata, bool relaxedMode); @@ -37,61 +37,77 @@ static int BuildKernelDouble(const char *name, int vectorSize, static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p, bool relaxedMode) { - const char *c[] = { - "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in)\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in[i] );\n" - "}\n" - }; - const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* in)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " float3 f0 = vload3( 0, in + 3 * i );\n" - " f0 = ", name, "( f0 );\n" - " vstore3( f0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two buffer size \n" - " float3 f0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (float3)( in[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " f0 = ", name, "( f0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + const char *c[] = { "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float", + sizeNames[vectorSize], + "* out, __global float", + sizeNames[vectorSize], + "* in)\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i] );\n" + "}\n" }; + const char *c3[] = { + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float* out, __global float* in)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " float3 f0 = vload3( 0, in + 3 * i );\n" + " f0 = ", + name, + "( f0 );\n" + " vstore3( f0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " float3 f0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (float3)( in[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " f0 = ", + name, + "( f0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, relaxedMode); @@ -101,63 +117,80 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p, bool relaxedMode) { - const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in)\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in[i] );\n" - "}\n" - }; - - const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global double* in)\n" + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double", + sizeNames[vectorSize], + "* out, __global double", + sizeNames[vectorSize], + "* in)\n" "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < 
get_global_size(0) )\n" - " {\n" - " double3 f0 = vload3( 0, in + 3 * i );\n" - " f0 = ", name, "( f0 );\n" - " vstore3( f0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" - " double3 f0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (double3)( in[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " f0 = ", name, "( f0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i] );\n" + "}\n" }; + + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double* out, __global double* in)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " double3 f0 = vload3( 0, in + 3 * i );\n" + " f0 = ", + name, + "( f0 );\n" + " vstore3( f0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " double3 f0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (double3)( in[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " f0 = ", + name, + "( f0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, relaxedMode); @@ -165,91 +198,102 @@ static int BuildKernelDouble(const char *name, int vectorSize, typedef struct BuildKernelInfo { - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *nameInCode; + cl_uint offset; // the first vector size to build + cl_uint kernel_count; + cl_kernel **kernels; + cl_program *programs; + const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-}BuildKernelInfo; +} BuildKernelInfo; -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernel(info->nameInCode, i, info->kernel_count, info->kernels[i], info->programs + i, info->relaxedMode); } -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernelDouble(info->nameInCode, i, info->kernel_count, info->kernels[i], info->programs + i, info->relaxedMode); } -//Thread specific data for a worker thread +// Thread specific data for a worker thread typedef struct ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem outBuf[ VECTOR_SIZE_COUNT ]; // output buffers for the thread - float maxError; // max error value. Init to 0. - double maxErrorValue; // position of the max error value. Init to 0. - cl_command_queue tQueue; // per thread command queue to improve performance -}ThreadInfo; + cl_mem inBuf; // input buffer for the thread + cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + float maxError; // max error value. Init to 0. + double maxErrorValue; // position of the max error value. Init to 0. 
+ cl_command_queue tQueue; // per thread command queue to improve performance +} ThreadInfo; typedef struct TestInfo { - size_t subBufferSize; // Size of the sub-buffer in elements - const Func *f; // A pointer to the function info - cl_program programs[ VECTOR_SIZE_COUNT ]; // programs for various vector sizes - cl_kernel *k[VECTOR_SIZE_COUNT ]; // arrays of thread-specific kernels for each worker thread: k[vector_size][thread_id] - ThreadInfo *tinfo; // An array of thread specific information for each worker thread - cl_uint threadCount; // Number of worker threads - cl_uint jobCount; // Number of jobs - cl_uint step; // step between each chunk and the next. - cl_uint scale; // stride between individual test values - float ulps; // max_allowed ulps - int ftz; // non-zero if running in flush to zero mode + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes + cl_kernel + *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each + // worker thread: k[vector_size][thread_id] + ThreadInfo * + tinfo; // An array of thread specific information for each worker thread + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + float ulps; // max_allowed ulps + int ftz; // non-zero if running in flush to zero mode - int isRangeLimited; // 1 if the function is only to be evaluated over a range - float half_sin_cos_tan_limit; + int isRangeLimited; // 1 if the function is only to be evaluated over a + // range + float half_sin_cos_tan_limit; bool relaxedMode; // True if test is to be run in relaxed mode, false // otherwise. 
-}TestInfo; +} TestInfo; -static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p ); +static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p); int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode) { - TestInfo test_info; - cl_int error; - size_t i, j; - float maxError = 0.0f; - double maxErrorVal = 0.0; + TestInfo test_info; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; int skipTestingRelaxed = (relaxedMode && strcmp(f->name, "tan") == 0); logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); // Init test_info - memset( &test_info, 0, sizeof( test_info ) ); + memset(&test_info, 0, sizeof(test_info)); test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); test_info.scale = getTestScale(sizeof(cl_float)); if (gWimpyMode) { - test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.subBufferSize = gWimpyBufferSize + / (sizeof(cl_float) + * RoundUpToNextPowerOfTwo(test_info.threadCount)); } - test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale; + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; if (test_info.step / test_info.subBufferSize != test_info.scale) { - //there was overflow + // there was overflow test_info.jobCount = 1; } else @@ -259,69 +303,89 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode) test_info.f = f; test_info.ulps = gIsEmbedded ? 
f->float_embedded_ulps : f->float_ulps; - test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); test_info.relaxedMode = relaxedMode; - // cl_kernels aren't thread safe, so we make one for each vector size for every thread - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { - size_t array_size = test_info.threadCount * sizeof( cl_kernel ); - test_info.k[i] = (cl_kernel*)malloc( array_size ); - if( NULL == test_info.k[i] ) + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel *)malloc(array_size); + if (NULL == test_info.k[i]) { - vlog_error( "Error: Unable to allocate storage for kernels!\n" ); + vlog_error("Error: Unable to allocate storage for kernels!\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.k[i], 0, array_size ); + memset(test_info.k[i], 0, array_size); } - test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) ); - if( NULL == test_info.tinfo ) + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) { - vlog_error( "Error: Unable to allocate storage for thread specific data.\n" ); + vlog_error( + "Error: Unable to allocate storage for thread specific data.\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) ); - for( i = 0; i < test_info.threadCount; i++ ) + memset(test_info.tinfo, 0, + test_info.threadCount * sizeof(*test_info.tinfo)); + for (i = 0; i < test_info.threadCount; i++) { - cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) }; - test_info.tinfo[i].inBuf = 
clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if( error || NULL == test_info.tinfo[i].inBuf) + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_float), + test_info.subBufferSize * sizeof(cl_float) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); goto exit; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if( error || NULL == test_info.tinfo[i].outBuf[j] ) + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); goto exit; } } - test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error); - if( NULL == test_info.tinfo[i].tQueue || error ) + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) { - vlog_error( "clCreateCommandQueue failed. (%d)\n", error ); + vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); goto exit; } - } // Check for special cases for unary float test_info.isRangeLimited = 0; test_info.half_sin_cos_tan_limit = 0; - if( 0 == strcmp( f->name, "half_sin") || 0 == strcmp( f->name, "half_cos") ) + if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos")) { test_info.isRangeLimited = 1; - test_info.half_sin_cos_tan_limit = 1.0f + test_info.ulps * (FLT_EPSILON/2.0f); // out of range results from finite inputs must be in [-1,1] + test_info.half_sin_cos_tan_limit = 1.0f + + test_info.ulps + * (FLT_EPSILON / 2.0f); // out of range results from finite + // inputs must be in [-1,1] } - else if( 0 == strcmp( f->name, "half_tan")) + else if (0 == strcmp(f->name, "half_tan")) { test_info.isRangeLimited = 1; - test_info.half_sin_cos_tan_limit = INFINITY; // out of range resut from finite inputs must be numeric + test_info.half_sin_cos_tan_limit = + INFINITY; // out of range resut from finite inputs must be numeric } // Init the kernels @@ -330,141 +394,156 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode) gMinVectorSizeIndex, test_info.threadCount, test_info.k, test_info.programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_FloatFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) goto exit; } - if( !gSkipCorrectnessTesting || skipTestingRelaxed) + if (!gSkipCorrectnessTesting || skipTestingRelaxed) { - error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info ); + error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info); // Accumulate the arithmetic errors - for( i = 0; i < test_info.threadCount; i++ ) + for (i = 0; i < test_info.threadCount; i++) { - if( test_info.tinfo[i].maxError > maxError ) + if (test_info.tinfo[i].maxError > maxError) { maxError = test_info.tinfo[i].maxError; maxErrorVal = test_info.tinfo[i].maxErrorValue; } } - if( error ) 
- goto exit; + if (error) goto exit; - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); - if( skipTestingRelaxed ) + if (skipTestingRelaxed) { - vlog(" (rlx skip correctness testing)\n"); - goto exit; + vlog(" (rlx skip correctness testing)\n"); + goto exit; } } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input array + // Init input array uint32_t *p = (uint32_t *)gIn; - if( strstr( f->name, "exp" ) || strstr( f->name, "sin" ) || strstr( f->name, "cos" ) || strstr( f->name, "tan" ) ) - for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ ) - ((float*)p)[j] = (float) genrand_real1(d); - else if( strstr( f->name, "log" ) ) - for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ ) + if (strstr(f->name, "exp") || strstr(f->name, "sin") + || strstr(f->name, "cos") || strstr(f->name, "tan")) + for (j = 0; j < BUFFER_SIZE / sizeof(float); j++) + ((float *)p)[j] = (float)genrand_real1(d); + else if (strstr(f->name, "log")) + for (j = 0; j < BUFFER_SIZE / sizeof(float); j++) p[j] = genrand_int32(d) & 0x7fffffff; else - for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ ) + for (j = 0; j < BUFFER_SIZE / sizeof(float); j++) p[j] = genrand_int32(d); - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + BUFFER_SIZE, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_float); size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; - if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError( 
test_info.programs[j]); goto exit; } - if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; } + if ((error = clSetKernelArg(test_info.k[j][0], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( i = 0; i < PERF_LOOP_COUNT; i++ ) + for (i = 0; i < PERF_LOOP_COUNT; i++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], + 1, NULL, &localCount, NULL, + 0, NULL, NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double current_time = SubtractTime( endTime, startTime ); + double current_time = SubtractTime(endTime, startTime); sum += current_time; - if( current_time < bestTime ) - bestTime = current_time; + if (current_time < bestTime) bestTime = current_time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (BUFFER_SIZE / sizeof(float)); + 
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", + f->name, sizeNames[j]); } } - if( ! gSkipCorrectnessTesting ) - vlog( "\t%8.2f @ %a", maxError, maxErrorVal ); - vlog( "\n" ); + if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal); + vlog("\n"); exit: - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { clReleaseProgram(test_info.programs[i]); - if( test_info.k[i] ) + if (test_info.k[i]) { - for( j = 0; j < test_info.threadCount; j++ ) + for (j = 0; j < test_info.threadCount; j++) clReleaseKernel(test_info.k[i][j]); - free( test_info.k[i] ); + free(test_info.k[i]); } } - if( test_info.tinfo ) + if (test_info.tinfo) { - for( i = 0; i < test_info.threadCount; i++ ) + for (i = 0; i < test_info.threadCount; i++) { clReleaseMemObject(test_info.tinfo[i].inBuf); - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) clReleaseMemObject(test_info.tinfo[i].outBuf[j]); clReleaseCommandQueue(test_info.tinfo[i].tQueue); } - free( test_info.tinfo ); + free(test_info.tinfo); } return error; } -static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data ) +static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data) { - const TestInfo *job = (const TestInfo *) data; - size_t buffer_elements = job->subBufferSize; - size_t buffer_size = buffer_elements * sizeof( cl_float ); + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_float); cl_uint scale = job->scale; - cl_uint base = job_id * (cl_uint) job->step; + cl_uint base = job_id * (cl_uint)job->step; ThreadInfo *tinfo = job->tinfo + thread_id; - fptr func = job->f->func; - const char * fname = job->f->name; + fptr func = job->f->func; + const char *fname = job->f->name; bool relaxedMode = job->relaxedMode; float ulps = 
getAllowedUlpError(job->f, relaxedMode); if (relaxedMode) @@ -480,153 +559,177 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data ) int ftz = job->ftz; // start the map of the output arrays - cl_event e[ VECTOR_SIZE_COUNT ]; - cl_uint *out[ VECTOR_SIZE_COUNT ]; - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + cl_event e[VECTOR_SIZE_COUNT]; + cl_uint *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error); - if( error || NULL == out[j]) + out[j] = (uint32_t *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); return error; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); // Write the new values to the input array - cl_uint *p = (cl_uint*) gIn + thread_id * buffer_elements; - for( j = 0; j < buffer_elements; j++ ) + cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) { - p[j] = base + j * scale; - if (relaxedMode) - { - float p_j = *(float *) &p[j]; - if ( strcmp(fname,"sin")==0 || strcmp(fname,"cos")==0 ) //the domain of the function is [-pi,pi] + p[j] = base + j * scale; + if (relaxedMode) { - if (fabs(p_j) > M_PI) ((float *)p)[j] = NAN; - } + float p_j = *(float *)&p[j]; + if (strcmp(fname, "sin") == 0 + || strcmp(fname, "cos") + == 0) // the domain of the function is [-pi,pi] + { + if (fabs(p_j) > M_PI) ((float *)p)[j] = NAN; + } - if ( strcmp( fname, "reciprocal" ) == 0 ) - { - const float l_limit = HEX_FLT(+, 1, 0, -, 126); - const float u_limit = 
HEX_FLT(+, 1, 0, +, 126); + if (strcmp(fname, "reciprocal") == 0) + { + const float l_limit = HEX_FLT(+, 1, 0, -, 126); + const float u_limit = HEX_FLT(+, 1, 0, +, 126); - if (fabs(p_j) < l_limit - || fabs(p_j) - > u_limit) // the domain of the function is [2^-126,2^126] - ((float *)p)[j] = NAN; + if (fabs(p_j) < l_limit + || fabs(p_j) > u_limit) // the domain of the function is + // [2^-126,2^126] + ((float *)p)[j] = NAN; + } } - } } - if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); return error; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - //Wait for the map to finish - if( (error = clWaitForEvents(1, e + j) )) + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) { - vlog_error( "Error: clWaitForEvents failed! err: %d\n", error ); + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); return error; } - if( (error = clReleaseEvent( e[j] ) )) + if ((error = clReleaseEvent(e[j]))) { - vlog_error( "Error: clReleaseEvent failed! err: %d\n", error ); + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); return error; } - // Fill the result buffer with garbage, so that old results don't carry over + // Fill the result buffer with garbage, so that old results don't carry + // over uint32_t pattern = 0xffffdead; memset_pattern4(out[j], &pattern, buffer_size); - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) )) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueMapBuffer failed! 
err: %d\n", error ); + vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); return error; } // run the kernel - size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; - cl_kernel kernel = job->k[j][thread_id]; //each worker thread has its own copy of the cl_kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel cl_program program = job->programs[j]; - if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; } - if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; } - - if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL))) + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); return error; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 2 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); - if( gSkipCorrectnessTesting ) - return CL_SUCCESS; + if (gSkipCorrectnessTesting) return CL_SUCCESS; - //Calculate the correctly rounded reference result + // Calculate the correctly rounded reference result float *r = (float *)gOut_Ref + thread_id * buffer_elements; float *s = (float *)p; - for( j = 0; j < buffer_elements; j++ ) - r[j] = (float) func.f_f( s[j] ); + for (j = 0; j < buffer_elements; j++) r[j] = 
(float)func.f_f(s[j]); - // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue. - for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ ) + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) { - out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (uint32_t *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); return error; } } // Wait for the last buffer - out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (uint32_t *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, error); return error; } - //Verify data + // Verify data uint32_t *t = (uint32_t *)r; - for( j = 0; j < buffer_elements; j++ ) + for (j = 0; j < buffer_elements; j++) { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { uint32_t *q = out[k]; // If we aren't getting the correctly rounded result - if( t[j] != q[j] ) + if (t[j] != q[j]) { - float test = ((float*) q)[j]; - double correct = func.f_f( s[j] ); - float err = Ulp_Error( test, correct ); - float abs_error = Abs_Error( test, correct ); + float test = ((float *)q)[j]; + double correct = func.f_f(s[j]); + float err = Ulp_Error(test, correct); + float abs_error = Abs_Error(test, correct); int fail = 0; int use_abs_error = 0; - // it is possible for the output to not match the reference result but for Ulp_Error - // to be zero, for example -1.#QNAN vs. 1.#QNAN. In such cases there is no failure + // it is possible for the output to not match the reference + // result but for Ulp_Error to be zero, for example -1.#QNAN + // vs. 1.#QNAN. In such cases there is no failure if (err == 0.0f) { fail = 0; } else if (relaxedMode) { - if ( strcmp(fname,"sin")==0 || strcmp(fname,"cos")==0 ) + if (strcmp(fname, "sin") == 0 || strcmp(fname, "cos") == 0) { - fail = ! (fabsf(abs_error) <= ulps); + fail = !(fabsf(abs_error) <= ulps); use_abs_error = 1; } if (strcmp(fname, "sinpi") == 0 @@ -639,12 +742,12 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data ) } } - if ( strcmp(fname, "reciprocal") == 0 ) + if (strcmp(fname, "reciprocal") == 0) { - fail = ! (fabsf(err) <= ulps); + fail = !(fabsf(err) <= ulps); } - if ( strcmp(fname, "exp") == 0 || strcmp(fname, "exp2") == 0 ) + if (strcmp(fname, "exp") == 0 || strcmp(fname, "exp2") == 0) { float exp_error = ulps; @@ -653,153 +756,171 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data ) exp_error += floor(fabs(2 * s[j])); } - fail = ! 
(fabsf(err) <= exp_error); + fail = !(fabsf(err) <= exp_error); ulps = exp_error; } - if (strcmp(fname, "tan") == 0) { + if (strcmp(fname, "tan") == 0) + { - if( !gFastRelaxedDerived ) + if (!gFastRelaxedDerived) { - fail = ! (fabsf(err) <= ulps); + fail = !(fabsf(err) <= ulps); } - // Else fast math derived implementation does not require ULP verification + // Else fast math derived implementation does not + // require ULP verification } if (strcmp(fname, "exp10") == 0) { - if( !gFastRelaxedDerived ) + if (!gFastRelaxedDerived) { - fail = ! (fabsf(err) <= ulps); + fail = !(fabsf(err) <= ulps); } - // Else fast math derived implementation does not require ULP verification + // Else fast math derived implementation does not + // require ULP verification } if (strcmp(fname, "log") == 0 || strcmp(fname, "log2") == 0 || strcmp(fname, "log10") == 0) { - if( s[j] >= 0.5 && s[j] <= 2 ) + if (s[j] >= 0.5 && s[j] <= 2) { - fail = ! (fabsf(abs_error) <= ulps ); + fail = !(fabsf(abs_error) <= ulps); } else { - ulps = gIsEmbedded ? job->f->float_embedded_ulps : job->f->float_ulps; - fail = ! (fabsf(err) <= ulps); + ulps = gIsEmbedded ? job->f->float_embedded_ulps + : job->f->float_ulps; + fail = !(fabsf(err) <= ulps); } } // fast-relaxed implies finite-only - if( IsFloatInfinity(correct) || IsFloatNaN(correct) || - IsFloatInfinity(s[j]) || IsFloatNaN(s[j]) ) { + if (IsFloatInfinity(correct) || IsFloatNaN(correct) + || IsFloatInfinity(s[j]) || IsFloatNaN(s[j])) + { fail = 0; err = 0; } } else { - fail = ! 
(fabsf(err) <= ulps); + fail = !(fabsf(err) <= ulps); } // half_sin/cos/tan are only valid between +-2**16, Inf, NaN - if( isRangeLimited && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) && fabsf(s[j]) < INFINITY ) + if (isRangeLimited + && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) + && fabsf(s[j]) < INFINITY) { - if( fabsf( test ) <= half_sin_cos_tan_limit ) + if (fabsf(test) <= half_sin_cos_tan_limit) { err = 0; fail = 0; } } - if( fail ) + if (fail) { - if( ftz ) + if (ftz) { - typedef int (*CheckForSubnormal) (double,float); // If we are in fast relaxed math, we have a different calculation for the subnormal threshold. + typedef int (*CheckForSubnormal)( + double, float); // If we are in fast relaxed math, + // we have a different calculation + // for the subnormal threshold. CheckForSubnormal isFloatResultSubnormalPtr; if (relaxedMode) { - isFloatResultSubnormalPtr = &IsFloatResultSubnormalAbsError; + isFloatResultSubnormalPtr = + &IsFloatResultSubnormalAbsError; } else { - isFloatResultSubnormalPtr = &IsFloatResultSubnormal; + isFloatResultSubnormalPtr = &IsFloatResultSubnormal; } // retry per section 6.5.3.2 - if( (*isFloatResultSubnormalPtr)(correct, ulps) ) + if ((*isFloatResultSubnormalPtr)(correct, ulps)) { - fail = fail && ( test != 0.0f ); - if( ! 
fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } // retry per section 6.5.3.3 - if( IsFloatSubnormal( s[j] ) ) + if (IsFloatSubnormal(s[j])) { - double correct2 = func.f_f( 0.0 ); - double correct3 = func.f_f( -0.0 ); + double correct2 = func.f_f(0.0); + double correct3 = func.f_f(-0.0); float err2; float err3; - if( use_abs_error ) + if (use_abs_error) { - err2 = Abs_Error( test, correct2 ); - err3 = Abs_Error( test, correct3 ); + err2 = Abs_Error(test, correct2); + err3 = Abs_Error(test, correct3); } else { - err2 = Ulp_Error( test, correct2 ); - err3 = Ulp_Error( test, correct3 ); + err2 = Ulp_Error(test, correct2); + err3 = Ulp_Error(test, correct3); } - fail = fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if( (*isFloatResultSubnormalPtr)(correct2, ulps ) || (*isFloatResultSubnormalPtr)(correct3, ulps ) ) + if ((*isFloatResultSubnormalPtr)(correct2, ulps) + || (*isFloatResultSubnormalPtr)(correct3, ulps)) { - fail = fail && ( test != 0.0f); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } } } } - if( fabsf(err ) > tinfo->maxError ) + if (fabsf(err) > tinfo->maxError) { tinfo->maxError = fabsf(err); tinfo->maxErrorValue = s[j]; } - if( fail ) + if (fail) { - vlog_error( "\nERROR: %s%s: %f ulp error at %a (0x%8.8x): *%a vs. %a\n", job->f->name, sizeNames[k], err, ((float*) s)[j], ((uint32_t*) s)[j], ((float*) t)[j], test); + vlog_error("\nERROR: %s%s: %f ulp error at %a (0x%8.8x): " + "*%a vs. 
%a\n", + job->f->name, sizeNames[k], err, ((float *)s)[j], + ((uint32_t *)s)[j], ((float *)t)[j], test); return -1; } } } } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) ) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error ); + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); return error; } } - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 3 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); - if( 0 == ( base & 0x0fffffff) ) + if (0 == (base & 0x0fffffff)) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->ulps, job->threadCount); - } else + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else { - vlog("." 
); + vlog("."); } fflush(stdout); } @@ -808,17 +929,16 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data ) } - -static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data ) +static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data) { - const TestInfo *job = (const TestInfo *) data; - size_t buffer_elements = job->subBufferSize; - size_t buffer_size = buffer_elements * sizeof( cl_double ); + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_double); cl_uint scale = job->scale; - cl_uint base = job_id * (cl_uint) job->step; + cl_uint base = job_id * (cl_uint)job->step; ThreadInfo *tinfo = job->tinfo + thread_id; - float ulps = job->ulps; - dptr func = job->f->dfunc; + float ulps = job->ulps; + dptr func = job->f->dfunc; cl_uint j, k; cl_int error; int ftz = job->ftz; @@ -826,190 +946,221 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data ) Force64BitFPUPrecision(); // start the map of the output arrays - cl_event e[ VECTOR_SIZE_COUNT ]; - cl_ulong *out[ VECTOR_SIZE_COUNT ]; - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + cl_event e[VECTOR_SIZE_COUNT]; + cl_ulong *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error); - if( error || NULL == out[j]) + out[j] = (cl_ulong *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, + error); return error; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); // Write the new values to the input array - cl_double *p = (cl_double*) gIn + thread_id * buffer_elements; - for( j = 0; j < buffer_elements; j++ ) - p[j] = DoubleFromUInt32( base + j * scale); + cl_double *p = (cl_double *)gIn + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) + p[j] = DoubleFromUInt32(base + j * scale); - if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); return error; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - //Wait for the map to finish - if( (error = clWaitForEvents(1, e + j) )) + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) { - vlog_error( "Error: clWaitForEvents failed! err: %d\n", error ); + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); return error; } - if( (error = clReleaseEvent( e[j] ) )) + if ((error = clReleaseEvent(e[j]))) { - vlog_error( "Error: clReleaseEvent failed! err: %d\n", error ); + vlog_error("Error: clReleaseEvent failed! 
err: %d\n", error); return error; } - // Fill the result buffer with garbage, so that old results don't carry over + // Fill the result buffer with garbage, so that old results don't carry + // over uint32_t pattern = 0xffffdead; memset_pattern4(out[j], &pattern, buffer_size); - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) )) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueMapBuffer failed! err: %d\n", error ); + vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); return error; } // run the kernel - size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; - cl_kernel kernel = job->k[j][thread_id]; //each worker thread has its own copy of the cl_kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel cl_program program = job->programs[j]; - if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; } - if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; } - - if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL))) + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); return error; } } // Get that moving - if( (error = clFlush(tinfo->tQueue) )) - 
vlog( "clFlush 2 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); - if( gSkipCorrectnessTesting ) - return CL_SUCCESS; + if (gSkipCorrectnessTesting) return CL_SUCCESS; - //Calculate the correctly rounded reference result + // Calculate the correctly rounded reference result cl_double *r = (cl_double *)gOut_Ref + thread_id * buffer_elements; cl_double *s = (cl_double *)p; - for( j = 0; j < buffer_elements; j++ ) - r[j] = (cl_double) func.f_f( s[j] ); + for (j = 0; j < buffer_elements; j++) r[j] = (cl_double)func.f_f(s[j]); - // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue. - for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ ) + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) { - out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (cl_ulong *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); return error; } } // Wait for the last buffer - out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); - if( error || NULL == out[j] ) + out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) { - vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error ); + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, error); return error; } - //Verify data + // Verify data cl_ulong *t = (cl_ulong *)r; - for( j = 0; j < buffer_elements; j++ ) + for (j = 0; j < buffer_elements; j++) { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { cl_ulong *q = out[k]; // If we aren't getting the correctly rounded result - if( t[j] != q[j] ) + if (t[j] != q[j]) { - cl_double test = ((cl_double*) q)[j]; - long double correct = func.f_f( s[j] ); - float err = Bruteforce_Ulp_Error_Double( test, correct ); - int fail = ! (fabsf(err) <= ulps); + cl_double test = ((cl_double *)q)[j]; + long double correct = func.f_f(s[j]); + float err = Bruteforce_Ulp_Error_Double(test, correct); + int fail = !(fabsf(err) <= ulps); - if( fail ) + if (fail) { - if( ftz ) + if (ftz) { // retry per section 6.5.3.2 - if( IsDoubleResultSubnormal(correct, ulps) ) + if (IsDoubleResultSubnormal(correct, ulps)) { - fail = fail && ( test != 0.0f ); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } // retry per section 6.5.3.3 - if( IsDoubleSubnormal( s[j] ) ) + if (IsDoubleSubnormal(s[j])) { - long double correct2 = func.f_f( 0.0L ); - long double correct3 = func.f_f( -0.0L ); - float err2 = Bruteforce_Ulp_Error_Double( test, correct2 ); - float err3 = Bruteforce_Ulp_Error_Double( test, correct3 ); - fail = fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps))); - if( fabsf( err2 ) < fabsf(err ) ) - err = err2; - if( fabsf( err3 ) < fabsf(err ) ) - err = err3; + long double correct2 = func.f_f(0.0L); + long double correct3 = func.f_f(-0.0L); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if( 
IsDoubleResultSubnormal(correct2, ulps ) || IsDoubleResultSubnormal(correct3, ulps ) ) + if (IsDoubleResultSubnormal(correct2, ulps) + || IsDoubleResultSubnormal(correct3, ulps)) { - fail = fail && ( test != 0.0f); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } } } } - if( fabsf(err ) > tinfo->maxError ) + if (fabsf(err) > tinfo->maxError) { tinfo->maxError = fabsf(err); tinfo->maxErrorValue = s[j]; } - if( fail ) + if (fail) { - vlog_error( "\nERROR: %s%s: %f ulp error at %.13la (0x%16.16llx): *%.13la vs. %.13la\n", job->f->name, sizeNames[k], err, ((cl_double*) gIn)[j], ((cl_ulong*) gIn)[j], ((cl_double*) gOut_Ref)[j], test ); + vlog_error("\nERROR: %s%s: %f ulp error at %.13la " + "(0x%16.16llx): *%.13la vs. %.13la\n", + job->f->name, sizeNames[k], err, + ((cl_double *)gIn)[j], ((cl_ulong *)gIn)[j], + ((cl_double *)gOut_Ref)[j], test); return -1; } } } } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) ) + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) { - vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error ); + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! 
err: %d\n", + j, error); return error; } } - if( (error = clFlush(tinfo->tQueue) )) - vlog( "clFlush 3 failed\n" ); + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); - if( 0 == ( base & 0x0fffffff) ) + if (0 == (base & 0x0fffffff)) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10u scale:%10zd buf_elements:%10u ulps:%5.3f ThreadCount:%2u\n", base, job->step, buffer_elements, job->scale, job->ulps, job->threadCount); - } else + vlog("base:%14u step:%10u scale:%10zd buf_elements:%10u ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, buffer_elements, job->scale, job->ulps, + job->threadCount); + } + else { - vlog("." ); + vlog("."); } fflush(stdout); } @@ -1019,33 +1170,36 @@ static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data ) int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode) { - TestInfo test_info; - cl_int error; - size_t i, j; - float maxError = 0.0f; - double maxErrorVal = 0.0; -#if defined( __APPLE__ ) - struct timeval time_val; - gettimeofday( &time_val, NULL ); + TestInfo test_info; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; +#if defined(__APPLE__) + struct timeval time_val; + gettimeofday(&time_val, NULL); double start_time = time_val.tv_sec + 1e-6 * time_val.tv_usec; double end_time; #endif logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); // Init test_info - memset( &test_info, 0, sizeof( test_info ) ); + memset(&test_info, 0, sizeof(test_info)); test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); test_info.scale = getTestScale(sizeof(cl_double)); if (gWimpyMode) { - test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.subBufferSize = gWimpyBufferSize + / 
(sizeof(cl_double) + * RoundUpToNextPowerOfTwo(test_info.threadCount)); } - test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale; + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; if (test_info.step / test_info.subBufferSize != test_info.scale) { - //there was overflow + // there was overflow test_info.jobCount = 1; } else @@ -1058,52 +1212,69 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode) test_info.ftz = f->ftz || gForceFTZ; test_info.relaxedMode = relaxedMode; - // cl_kernels aren't thread safe, so we make one for each vector size for every thread - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { - size_t array_size = test_info.threadCount * sizeof( cl_kernel ); - test_info.k[i] = (cl_kernel*)malloc( array_size ); - if( NULL == test_info.k[i] ) + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel *)malloc(array_size); + if (NULL == test_info.k[i]) { - vlog_error( "Error: Unable to allocate storage for kernels!\n" ); + vlog_error("Error: Unable to allocate storage for kernels!\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.k[i], 0, array_size ); + memset(test_info.k[i], 0, array_size); } - test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) ); - if( NULL == test_info.tinfo ) + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) { - vlog_error( "Error: Unable to allocate storage for thread specific data.\n" ); + vlog_error( + "Error: Unable to allocate storage for thread specific data.\n"); error = CL_OUT_OF_HOST_MEMORY; goto exit; } - memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) ); - for( i = 0; i < test_info.threadCount; i++ ) + 
memset(test_info.tinfo, 0, + test_info.threadCount * sizeof(*test_info.tinfo)); + for (i = 0; i < test_info.threadCount; i++) { - cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) }; - test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if( error || NULL == test_info.tinfo[i].inBuf) + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_double), + test_info.subBufferSize * sizeof(cl_double) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); goto exit; } - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - /* Qualcomm fix: 9461 read-write flags must be compatible with parent buffer */ - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + /* Qualcomm fix: 9461 read-write flags must be compatible with + * parent buffer */ + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); /* Qualcomm fix: end */ - if( error || NULL == test_info.tinfo[i].outBuf[j] ) + if (error || NULL == test_info.tinfo[i].outBuf[j]) { - vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size ); + vlog_error("Error: Unable to create sub-buffer of gInBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); goto exit; } } - test_info.tinfo[i].tQueue = 
clCreateCommandQueue(gContext, gDevice, 0, &error); - if( NULL == test_info.tinfo[i].tQueue || error ) + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) { - vlog_error( "clCreateCommandQueue failed. (%d)\n", error ); + vlog_error("clCreateCommandQueue failed. (%d)\n", error); goto exit; } } @@ -1114,136 +1285,147 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode) gMinVectorSizeIndex, test_info.threadCount, test_info.k, test_info.programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) - goto exit; + if ((error = ThreadPool_Do(BuildKernel_DoubleFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; } - if( !gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info ); + error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info); // Accumulate the arithmetic errors - for( i = 0; i < test_info.threadCount; i++ ) + for (i = 0; i < test_info.threadCount; i++) { - if( test_info.tinfo[i].maxError > maxError ) + if (test_info.tinfo[i].maxError > maxError) { maxError = test_info.tinfo[i].maxError; maxErrorVal = test_info.tinfo[i].maxErrorValue; } } - if( error ) - goto exit; + if (error) goto exit; - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } -#if defined( __APPLE__ ) - gettimeofday( &time_val, NULL); +#if defined(__APPLE__) + gettimeofday(&time_val, NULL); end_time = time_val.tv_sec + 1e-6 * time_val.tv_usec; #endif - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input array + // Init input array double *p = (double *)gIn; - if( strstr( f->name, "exp" ) ) - for( j = 0; j < BUFFER_SIZE / sizeof( double ); j++ ) + if (strstr(f->name, "exp")) + for (j = 0; j < BUFFER_SIZE / sizeof(double); j++) 
p[j] = (double)genrand_real1(d); - else if( strstr( f->name, "log" ) ) - for( j = 0; j < BUFFER_SIZE / sizeof( double ); j++ ) - p[j] = fabs(DoubleFromUInt32( genrand_int32(d))); + else if (strstr(f->name, "log")) + for (j = 0; j < BUFFER_SIZE / sizeof(double); j++) + p[j] = fabs(DoubleFromUInt32(genrand_int32(d))); else - for( j = 0; j < BUFFER_SIZE / sizeof( double ); j++ ) - p[j] = DoubleFromUInt32( genrand_int32(d) ); - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) )) + for (j = 0; j < BUFFER_SIZE / sizeof(double); j++) + p[j] = DoubleFromUInt32(genrand_int32(d)); + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + BUFFER_SIZE, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_double); size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; - if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; } - if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; } + if ((error = clSetKernelArg(test_info.k[j][0], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( i = 0; i < PERF_LOOP_COUNT; i++ ) + for (i = 0; i < PERF_LOOP_COUNT; i++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, 
&localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], + 1, NULL, &localCount, NULL, + 0, NULL, NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double current_time = SubtractTime( endTime, startTime ); + double current_time = SubtractTime(endTime, startTime); sum += current_time; - if( current_time < bestTime ) - bestTime = current_time; + if (current_time < bestTime) bestTime = current_time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (BUFFER_SIZE / sizeof(double)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", + f->name, sizeNames[j]); } - for( ; j < gMaxVectorSizeIndex; j++ ) - vlog( "\t -- " ); + for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); } - if( ! 
gSkipCorrectnessTesting ) - vlog( "\t%8.2f @ %a", maxError, maxErrorVal ); + if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal); -#if defined( __APPLE__ ) - vlog( "\t(%2.2f seconds)", end_time - start_time ); +#if defined(__APPLE__) + vlog("\t(%2.2f seconds)", end_time - start_time); #endif - vlog( "\n" ); + vlog("\n"); exit: - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) { clReleaseProgram(test_info.programs[i]); - if( test_info.k[i] ) + if (test_info.k[i]) { - for( j = 0; j < test_info.threadCount; j++ ) + for (j = 0; j < test_info.threadCount; j++) clReleaseKernel(test_info.k[i][j]); - free( test_info.k[i] ); + free(test_info.k[i]); } } - if( test_info.tinfo ) + if (test_info.tinfo) { - for( i = 0; i < test_info.threadCount; i++ ) + for (i = 0; i < test_info.threadCount; i++) { clReleaseMemObject(test_info.tinfo[i].inBuf); - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) clReleaseMemObject(test_info.tinfo[i].outBuf[j]); clReleaseCommandQueue(test_info.tinfo[i].tQueue); } - free( test_info.tinfo ); + free(test_info.tinfo); } return error; } - - diff --git a/test_conformance/math_brute_force/unary_two_results.cpp b/test_conformance/math_brute_force/unary_two_results.cpp index b170e095..d468d26d 100644 --- a/test_conformance/math_brute_force/unary_two_results.cpp +++ b/test_conformance/math_brute_force/unary_two_results.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -32,64 +32,83 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p, bool relaxedMode) { - const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* out2, __global float", sizeNames[vectorSize], "* in)\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in[i], out2 + i );\n" - "}\n" - }; + const char *c[] = { "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float", + sizeNames[vectorSize], + "* out, __global float", + sizeNames[vectorSize], + "* out2, __global float", + sizeNames[vectorSize], + "* in)\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i], out2 + i );\n" + "}\n" }; - const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* out2, __global float* in)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " float3 f0 = vload3( 0, in + 3 * i );\n" - " float3 iout = NAN;\n" - " f0 = ", name, "( f0, &iout );\n" - " vstore3( f0, 0, out + 3*i );\n" - " vstore3( iout, 0, out2 + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two buffer size \n" - " float3 iout = NAN;\n" - " float3 f0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (float3)( in[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " f0 = ", name, "( f0, &iout );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " out2[3*i+1] = iout.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " out2[3*i] = iout.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + const char *c3[] = { + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float* out, __global float* out2, __global float* in)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " float3 f0 = vload3( 0, in + 3 * i );\n" + " float3 iout = NAN;\n" + " f0 = ", + name, + "( f0, &iout );\n" + " vstore3( f0, 0, out + 3*i );\n" + " vstore3( iout, 0, out2 + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " float3 iout = NAN;\n" + " float3 f0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (float3)( in[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " f0 = ", + name, + "( f0, &iout );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " out2[3*i+1] = iout.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " out2[3*i] = iout.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } @@ -98,91 +117,114 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, cl_program *p, bool relaxedMode) { const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* out2, __global double", sizeNames[vectorSize], "* in)\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in[i], out2 + i );\n" - "}\n" - }; + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double", + sizeNames[vectorSize], + "* out, __global double", + sizeNames[vectorSize], + "* out2, __global double", + sizeNames[vectorSize], + "* in)\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i], out2 + i );\n" + "}\n" }; - const char *c3[] = { "#pragma OPENCL EXTENSION 
cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global double* out2, __global double* in)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " double3 f0 = vload3( 0, in + 3 * i );\n" - " double3 iout = NAN;\n" - " f0 = ", name, "( f0, &iout );\n" - " vstore3( f0, 0, out + 3*i );\n" - " vstore3( iout, 0, out2 + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" - " double3 iout = NAN;\n" - " double3 f0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (double3)( in[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " f0 = ", name, "( f0, &iout );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " out2[3*i+1] = iout.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " out2[3*i] = iout.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double* out, __global double* out2, __global double* in)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " double3 f0 = vload3( 0, in + 3 * i );\n" + " double3 iout = NAN;\n" + " f0 = ", + name, + "( f0, &iout );\n" + " vstore3( f0, 0, out + 3*i );\n" + " vstore3( iout, 0, out2 + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " double3 iout = NAN;\n" + " double3 f0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (double3)( in[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " f0 = ", + name, + "( f0, &iout );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " out2[3*i+1] = iout.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " out2[3*i] = iout.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } typedef struct BuildKernelInfo { - cl_uint offset; // the first vector size to build - cl_kernel *kernels; - cl_program *programs; - const char *nameInCode; + cl_uint offset; // the first vector size to build + cl_kernel *kernels; + cl_program *programs; + const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-}BuildKernelInfo; +} BuildKernelInfo; -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernel(info->nameInCode, i, info->kernels + i, info->programs + i, info->relaxedMode); } -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernelDouble(info->nameInCode, i, info->kernels + i, info->programs + i, info->relaxedMode); @@ -194,20 +236,20 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode) uint32_t j, k; uint32_t l; int error; - char const * testing_mode; - cl_program programs[ VECTOR_SIZE_COUNT ]; - cl_kernel kernels[ VECTOR_SIZE_COUNT ]; + char const *testing_mode; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError0 = 0.0f; float maxError1 = 0.0f; int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); float maxErrorVal0 = 0.0f; float maxErrorVal1 = 0.0f; - size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE; + size_t bufferSize = (gWimpyMode) ? 
gWimpyBufferSize : BUFFER_SIZE; uint64_t step = getTestStep(sizeof(float), bufferSize); - int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( float )) + 1); - cl_uchar overflow[BUFFER_SIZE / sizeof( float )]; - int isFract = 0 == strcmp( "fract", f->nameInCode ); - int skipNanInf = isFract && ! gInfNanSupport; + int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1); + cl_uchar overflow[BUFFER_SIZE / sizeof(float)]; + int isFract = 0 == strcmp("fract", f->nameInCode); + int skipNanInf = isFract && !gInfNanSupport; float float_ulps = getAllowedUlpError(f, relaxedMode); logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); @@ -215,222 +257,256 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode) // Init the kernels BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_FloatFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) return error; -/* - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) - if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) ) - return error; -*/ + /* + for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, + programs + i) ) ) return error; + */ - for( i = 0; i < (1ULL<<32); i += step ) + for (i = 0; i < (1ULL << 32); i += step) { - //Init input array + // Init input array uint32_t *p = (uint32_t *)gIn; - if( gWimpyMode ) + if (gWimpyMode) { - for( j = 0; j < bufferSize / sizeof( float ); j++ ) - { - p[j] = (uint32_t) i + j * scale; - if (relaxedMode && strcmp(f->name, "sincos") == 0) + for (j = 0; j < bufferSize / sizeof(float); j++) { - float pj = *(float *)&p[j]; - if (fabs(pj) > M_PI) ((float *)p)[j] = NAN; + p[j] = (uint32_t)i + j * scale; + if (relaxedMode && strcmp(f->name, 
"sincos") == 0) + { + float pj = *(float *)&p[j]; + if (fabs(pj) > M_PI) ((float *)p)[j] = NAN; + } } - } } else { - for( j = 0; j < bufferSize / sizeof( float ); j++ ) - { - p[j] = (uint32_t) i + j; - if (relaxedMode && strcmp(f->name, "sincos") == 0) + for (j = 0; j < bufferSize / sizeof(float); j++) { - float pj = *(float *)&p[j]; - if (fabs(pj) > M_PI) ((float *)p)[j] = NAN; + p[j] = (uint32_t)i + j; + if (relaxedMode && strcmp(f->name, "sincos") == 0) + { + float pj = *(float *)&p[j]; + if (fabs(pj) > M_PI) ((float *)p)[j] = NAN; + } } - } } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // write garbage into output arrays - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { uint32_t pattern = 0xffffdead; memset_pattern4(gOut[j], &pattern, bufferSize); - if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) )) + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); goto exit; } memset_pattern4(gOut2[j], &pattern, bufferSize); - if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL))) + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", + error, j); 
goto exit; } } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_float); size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg(kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } - - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), + &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(gQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); FPU_mode_type oldMode; RoundingMode oldRoundMode = kRoundToNearestEven; - if( isFract ) + if (isFract) { - //Calculate the correctly rounded reference result - memset( &oldMode, 0, sizeof( oldMode ) ); - if( ftz ) - ForceFTZ( &oldMode ); + // Calculate the correctly rounded reference result + memset(&oldMode, 0, sizeof(oldMode)); + if (ftz) ForceFTZ(&oldMode); // Set the rounding mode to match the device if (gIsInRTZMode) 
oldRoundMode = set_round(kRoundTowardZero, kfloat); } - //Calculate the correctly rounded reference result + // Calculate the correctly rounded reference result float *r = (float *)gOut_Ref; float *r2 = (float *)gOut_Ref2; float *s = (float *)gIn; - if( skipNanInf ) + if (skipNanInf) { - for( j = 0; j < bufferSize / sizeof( float ); j++ ) + for (j = 0; j < bufferSize / sizeof(float); j++) { double dd; feclearexcept(FE_OVERFLOW); if (relaxedMode) - r[j] = (float) f->rfunc.f_fpf( s[j], &dd ); + r[j] = (float)f->rfunc.f_fpf(s[j], &dd); else - r[j] = (float) f->func.f_fpf( s[j], &dd ); + r[j] = (float)f->func.f_fpf(s[j], &dd); - r2[j] = (float) dd; - overflow[j] = FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)); + r2[j] = (float)dd; + overflow[j] = + FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)); } } else { - for( j = 0; j < bufferSize / sizeof( float ); j++ ) + for (j = 0; j < bufferSize / sizeof(float); j++) { double dd; if (relaxedMode) r[j] = (float)f->rfunc.f_fpf(s[j], &dd); else - r[j] = (float) f->func.f_fpf( s[j], &dd ); + r[j] = (float)f->func.f_fpf(s[j], &dd); - r2[j] = (float) dd; + r2[j] = (float)dd; } } - if( isFract && ftz ) - RestoreFPState( &oldMode ); + if (isFract && ftz) RestoreFPState(&oldMode); // Read the data back - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) ) + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "ReadArray failed %d\n", error ); + vlog_error("ReadArray failed %d\n", error); goto exit; } - if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) ) + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) { - vlog_error( "ReadArray2 failed %d\n", 
error ); + vlog_error("ReadArray2 failed %d\n", error); goto exit; } } - if( gSkipCorrectnessTesting ) + if (gSkipCorrectnessTesting) { - if (isFract && gIsInRTZMode) - (void)set_round(oldRoundMode, kfloat); + if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat); break; } - //Verify data + // Verify data uint32_t *t = (uint32_t *)gOut_Ref; uint32_t *t2 = (uint32_t *)gOut_Ref2; - for( j = 0; j < bufferSize / sizeof( float ); j++ ) + for (j = 0; j < bufferSize / sizeof(float); j++) { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { uint32_t *q = (uint32_t *)gOut[k]; uint32_t *q2 = (uint32_t *)gOut2[k]; // If we aren't getting the correctly rounded result - if( t[j] != q[j] || t2[j] != q2[j] ) + if (t[j] != q[j] || t2[j] != q2[j]) { double correct, correct2; float err, err2; - float test = ((float*) q)[j]; - float test2 = ((float*) q2)[j]; + float test = ((float *)q)[j]; + float test2 = ((float *)q2)[j]; if (relaxedMode) correct = f->rfunc.f_fpf(s[j], &correct2); else - correct = f->func.f_fpf( s[j], &correct2 ); + correct = f->func.f_fpf(s[j], &correct2); - // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow + // Per section 10 paragraph 6, accept any result if an input + // or output is a infinity or NaN or overflow if (relaxedMode || skipNanInf) { - if (skipNanInf && overflow[j]) - continue; + if (skipNanInf && overflow[j]) continue; - // Note: no double rounding here. Reference functions calculate in single precision. - if( IsFloatInfinity(correct) || IsFloatNaN(correct) || - IsFloatInfinity(correct2)|| IsFloatNaN(correct2) || - IsFloatInfinity(s[j]) || IsFloatNaN(s[j]) ) + // Note: no double rounding here. Reference functions + // calculate in single precision. 
+ if (IsFloatInfinity(correct) || IsFloatNaN(correct) + || IsFloatInfinity(correct2) || IsFloatNaN(correct2) + || IsFloatInfinity(s[j]) || IsFloatNaN(s[j])) continue; } - typedef int (*CheckForSubnormal) (double,float); // If we are in fast relaxed math, we have a different calculation for the subnormal threshold. + typedef int (*CheckForSubnormal)( + double, float); // If we are in fast relaxed math, we + // have a different calculation for the + // subnormal threshold. CheckForSubnormal isFloatResultSubnormalPtr; if (relaxedMode) { - err = Abs_Error( test, correct); - err2 = Abs_Error( test2, correct2); - isFloatResultSubnormalPtr = &IsFloatResultSubnormalAbsError; + err = Abs_Error(test, correct); + err2 = Abs_Error(test2, correct2); + isFloatResultSubnormalPtr = + &IsFloatResultSubnormalAbsError; } else { - err = Ulp_Error( test, correct ); - err2 = Ulp_Error( test2, correct2 ); + err = Ulp_Error(test, correct); + err2 = Ulp_Error(test2, correct2); isFloatResultSubnormalPtr = &IsFloatResultSubnormal; } - int fail = ! (fabsf(err) <= float_ulps && fabsf(err2) <= float_ulps); + int fail = !(fabsf(err) <= float_ulps + && fabsf(err2) <= float_ulps); - if( ftz ) + if (ftz) { // retry per section 6.5.3.2 - if( (*isFloatResultSubnormalPtr)(correct, float_ulps) ) + if ((*isFloatResultSubnormalPtr)(correct, float_ulps)) { - if( (*isFloatResultSubnormalPtr) (correct2, float_ulps )) + if ((*isFloatResultSubnormalPtr)(correct2, + float_ulps)) { - fail = fail && ! ( test == 0.0f && test2 == 0.0f ); - if( ! fail ) + fail = fail && !(test == 0.0f && test2 == 0.0f); + if (!fail) { err = 0.0f; err2 = 0.0f; @@ -438,209 +514,251 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode) } else { - fail = fail && ! ( test == 0.0f && fabsf(err2) <= float_ulps); - if( ! 
fail ) - err = 0.0f; + fail = fail + && !(test == 0.0f + && fabsf(err2) <= float_ulps); + if (!fail) err = 0.0f; } } - else if( (*isFloatResultSubnormalPtr)(correct2, float_ulps ) ) + else if ((*isFloatResultSubnormalPtr)(correct2, + float_ulps)) { - fail = fail && ! ( test2 == 0.0f && fabsf(err) <= float_ulps); - if( ! fail ) - err2 = 0.0f; + fail = fail + && !(test2 == 0.0f && fabsf(err) <= float_ulps); + if (!fail) err2 = 0.0f; } // retry per section 6.5.3.3 - if( IsFloatSubnormal( s[j] ) ) + if (IsFloatSubnormal(s[j])) { double correctp, correctn; double correct2p, correct2n; float errp, err2p, errn, err2n; - if( skipNanInf ) - feclearexcept(FE_OVERFLOW); + if (skipNanInf) feclearexcept(FE_OVERFLOW); if (relaxedMode) { - correctp = f->rfunc.f_fpf( 0.0, &correct2p ); - correctn = f->rfunc.f_fpf( -0.0, &correct2n ); + correctp = f->rfunc.f_fpf(0.0, &correct2p); + correctn = f->rfunc.f_fpf(-0.0, &correct2n); } else { - correctp = f->func.f_fpf( 0.0, &correct2p ); - correctn = f->func.f_fpf( -0.0, &correct2n ); + correctp = f->func.f_fpf(0.0, &correct2p); + correctn = f->func.f_fpf(-0.0, &correct2n); } - // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow - if( skipNanInf ) + // Per section 10 paragraph 6, accept any result if + // an input or output is a infinity or NaN or + // overflow + if (skipNanInf) { - if( fetestexcept(FE_OVERFLOW) ) - continue; + if (fetestexcept(FE_OVERFLOW)) continue; - // Note: no double rounding here. Reference functions calculate in single precision. - if( IsFloatInfinity(correctp) || IsFloatNaN(correctp) || - IsFloatInfinity(correctn) || IsFloatNaN(correctn) || - IsFloatInfinity(correct2p) || IsFloatNaN(correct2p) || - IsFloatInfinity(correct2n) || IsFloatNaN(correct2n) ) + // Note: no double rounding here. Reference + // functions calculate in single precision. 
+ if (IsFloatInfinity(correctp) + || IsFloatNaN(correctp) + || IsFloatInfinity(correctn) + || IsFloatNaN(correctn) + || IsFloatInfinity(correct2p) + || IsFloatNaN(correct2p) + || IsFloatInfinity(correct2n) + || IsFloatNaN(correct2n)) continue; } if (relaxedMode) { - errp = Abs_Error( test, correctp ); - err2p = Abs_Error( test, correct2p ); - errn = Abs_Error( test, correctn ); - err2n = Abs_Error( test, correct2n ); + errp = Abs_Error(test, correctp); + err2p = Abs_Error(test, correct2p); + errn = Abs_Error(test, correctn); + err2n = Abs_Error(test, correct2n); } else { - errp = Ulp_Error( test, correctp ); - err2p = Ulp_Error( test, correct2p ); - errn = Ulp_Error( test, correctn ); - err2n = Ulp_Error( test, correct2n ); + errp = Ulp_Error(test, correctp); + err2p = Ulp_Error(test, correct2p); + errn = Ulp_Error(test, correctn); + err2n = Ulp_Error(test, correct2n); } - fail = fail && ((!(fabsf(errp) <= float_ulps)) && (!(fabsf(err2p) <= float_ulps)) && - ((!(fabsf(errn) <= float_ulps)) && (!(fabsf(err2n) <= float_ulps))) ); - if( fabsf( errp ) < fabsf(err ) ) - err = errp; - if( fabsf( errn ) < fabsf(err ) ) - err = errn; - if( fabsf( err2p ) < fabsf(err2 ) ) - err2 = err2p; - if( fabsf( err2n ) < fabsf(err2 ) ) - err2 = err2n; + fail = fail + && ((!(fabsf(errp) <= float_ulps)) + && (!(fabsf(err2p) <= float_ulps)) + && ((!(fabsf(errn) <= float_ulps)) + && (!(fabsf(err2n) <= float_ulps)))); + if (fabsf(errp) < fabsf(err)) err = errp; + if (fabsf(errn) < fabsf(err)) err = errn; + if (fabsf(err2p) < fabsf(err2)) err2 = err2p; + if (fabsf(err2n) < fabsf(err2)) err2 = err2n; // retry per section 6.5.3.4 - if( (*isFloatResultSubnormalPtr)( correctp, float_ulps ) || (*isFloatResultSubnormalPtr)( correctn, float_ulps ) ) + if ((*isFloatResultSubnormalPtr)(correctp, + float_ulps) + || (*isFloatResultSubnormalPtr)(correctn, + float_ulps)) { - if( (*isFloatResultSubnormalPtr)( correct2p, float_ulps ) || (*isFloatResultSubnormalPtr)( correct2n, float_ulps ) ) - { - fail = 
fail && !( test == 0.0f && test2 == 0.0f); - if( ! fail ) - err = err2 = 0.0f; - } - else - { - fail = fail && ! (test == 0.0f && fabsf(err2) <= float_ulps); - if( ! fail ) - err = 0.0f; - } + if ((*isFloatResultSubnormalPtr)(correct2p, + float_ulps) + || (*isFloatResultSubnormalPtr)(correct2n, + float_ulps)) + { + fail = fail + && !(test == 0.0f && test2 == 0.0f); + if (!fail) err = err2 = 0.0f; + } + else + { + fail = fail + && !(test == 0.0f + && fabsf(err2) <= float_ulps); + if (!fail) err = 0.0f; + } } - else if( (*isFloatResultSubnormalPtr)( correct2p, float_ulps ) || (*isFloatResultSubnormalPtr)( correct2n, float_ulps ) ) + else if ((*isFloatResultSubnormalPtr)(correct2p, + float_ulps) + || (*isFloatResultSubnormalPtr)( + correct2n, float_ulps)) { - fail = fail && ! (test2 == 0.0f && (fabsf(err) <= float_ulps)); - if( ! fail ) - err2 = 0.0f; + fail = fail + && !(test2 == 0.0f + && (fabsf(err) <= float_ulps)); + if (!fail) err2 = 0.0f; } } } - if( fabsf(err ) > maxError0 ) + if (fabsf(err) > maxError0) { maxError0 = fabsf(err); maxErrorVal0 = s[j]; } - if( fabsf(err2 ) > maxError1 ) + if (fabsf(err2) > maxError1) { maxError1 = fabsf(err2); maxErrorVal1 = s[j]; } - if( fail ) + if (fail) { - vlog_error( "\nERROR: %s%s: {%f, %f} ulp error at %a: *{%a, %a} vs. {%a, %a}\n", f->name, sizeNames[k], err, err2, ((float*) gIn)[j], ((float*) gOut_Ref)[j], ((float*) gOut_Ref2)[j], test, test2 ); - error = -1; - goto exit; + vlog_error("\nERROR: %s%s: {%f, %f} ulp error at %a: " + "*{%a, %a} vs. 
{%a, %a}\n", + f->name, sizeNames[k], err, err2, + ((float *)gIn)[j], ((float *)gOut_Ref)[j], + ((float *)gOut_Ref2)[j], test, test2); + error = -1; + goto exit; } } } } - if (isFract && gIsInRTZMode) - (void)set_round(oldRoundMode, kfloat); + if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat); - if( 0 == (i & 0x0fffffff) ) + if (0 == (i & 0x0fffffff)) { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize); - } else - { - vlog("."); - } - fflush(stdout); + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, + bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); } } - if( ! gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input array + // Init input array uint32_t *p = (uint32_t *)gIn; - for( j = 0; j < bufferSize / sizeof( float ); j++ ) + for (j = 0; j < bufferSize / sizeof(float); j++) p[j] = genrand_int32(d); - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_float); size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j]) )) { 
LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), + &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( k = 0; k < PERF_LOOP_COUNT; k++ ) + for (k = 0; k < PERF_LOOP_COUNT; k++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(float)); 
+ vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", + f->name, sizeNames[j]); } } - if( ! gSkipCorrectnessTesting ) - vlog( "\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0, maxErrorVal1 ); - vlog( "\n" ); + if (!gSkipCorrectnessTesting) + vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0, + maxErrorVal1); + vlog("\n"); exit: // Release - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); clReleaseProgram(programs[k]); @@ -654,16 +772,16 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode) uint64_t i; uint32_t j, k; int error; - cl_program programs[ VECTOR_SIZE_COUNT ]; - cl_kernel kernels[ VECTOR_SIZE_COUNT ]; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError0 = 0.0f; float maxError1 = 0.0f; int ftz = f->ftz || gForceFTZ; double maxErrorVal0 = 0.0f; double maxErrorVal1 = 0.0f; - size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE; + size_t bufferSize = (gWimpyMode) ? 
gWimpyBufferSize : BUFFER_SIZE; uint64_t step = getTestStep(sizeof(cl_double), bufferSize); - int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( cl_double )) + 1); + int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(cl_double)) + 1); logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); @@ -672,135 +790,163 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode) // Init the kernels BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_DoubleFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_DoubleFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) { return error; } -/* - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) - if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) ) - return error; -*/ + /* + for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + + i, programs + i) ) ) return error; + */ - for( i = 0; i < (1ULL<<32); i += step ) + for (i = 0; i < (1ULL << 32); i += step) { - //Init input array + // Init input array double *p = (double *)gIn; - if( gWimpyMode ) + if (gWimpyMode) { - for( j = 0; j < bufferSize / sizeof( cl_double ); j++ ) - p[j] = DoubleFromUInt32((uint32_t) i + j * scale); + for (j = 0; j < bufferSize / sizeof(cl_double); j++) + p[j] = DoubleFromUInt32((uint32_t)i + j * scale); } else { - for( j = 0; j < bufferSize / sizeof( cl_double ); j++ ) - p[j] = DoubleFromUInt32((uint32_t) i + j); + for (j = 0; j < bufferSize / sizeof(cl_double); j++) + p[j] = DoubleFromUInt32((uint32_t)i + j); } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in 
clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // write garbage into output arrays - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { uint32_t pattern = 0xffffdead; memset_pattern4(gOut[j], &pattern, bufferSize); - if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) )) + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); goto exit; } memset_pattern4(gOut2[j], &pattern, bufferSize); - if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL))) + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", + error, j); goto exit; } } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_double); size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg(kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } - - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = 
clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), + &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(gQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); - //Calculate the correctly rounded reference result + // Calculate the correctly rounded reference result double *r = (double *)gOut_Ref; double *r2 = (double *)gOut_Ref2; double *s = (double *)gIn; - for( j = 0; j < bufferSize / sizeof( cl_double ); j++ ) + for (j = 0; j < bufferSize / sizeof(cl_double); j++) { long double dd; - r[j] = (double) f->dfunc.f_fpf( s[j], &dd ); - r2[j] = (double) dd; + r[j] = (double)f->dfunc.f_fpf(s[j], &dd); + r2[j] = (double)dd; } // Read the data back - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) ) + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "ReadArray failed %d\n", error ); + vlog_error("ReadArray failed %d\n", error); goto exit; } - if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) ) + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) { - vlog_error( "ReadArray2 failed %d\n", error ); + 
vlog_error("ReadArray2 failed %d\n", error); goto exit; } } - if( gSkipCorrectnessTesting ) - break; + if (gSkipCorrectnessTesting) break; - //Verify data + // Verify data uint64_t *t = (uint64_t *)gOut_Ref; uint64_t *t2 = (uint64_t *)gOut_Ref2; - for( j = 0; j < bufferSize / sizeof( double ); j++ ) + for (j = 0; j < bufferSize / sizeof(double); j++) { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { uint64_t *q = (uint64_t *)(gOut[k]); uint64_t *q2 = (uint64_t *)(gOut2[k]); // If we aren't getting the correctly rounded result - if( t[j] != q[j] || t2[j] != q2[j] ) + if (t[j] != q[j] || t2[j] != q2[j]) { - double test = ((double*) q)[j]; - double test2 = ((double*) q2)[j]; + double test = ((double *)q)[j]; + double test2 = ((double *)q2)[j]; long double correct2; - long double correct = f->dfunc.f_fpf( s[j], &correct2 ); - float err = Bruteforce_Ulp_Error_Double( test, correct ); - float err2 = Bruteforce_Ulp_Error_Double( test2, correct2 ); - int fail = ! (fabsf(err) <= f->double_ulps && fabsf(err2) <= f->double_ulps); - if( ftz ) + long double correct = f->dfunc.f_fpf(s[j], &correct2); + float err = Bruteforce_Ulp_Error_Double(test, correct); + float err2 = Bruteforce_Ulp_Error_Double(test2, correct2); + int fail = !(fabsf(err) <= f->double_ulps + && fabsf(err2) <= f->double_ulps); + if (ftz) { // retry per section 6.5.3.2 - if( IsDoubleResultSubnormal(correct, f->double_ulps ) ) + if (IsDoubleResultSubnormal(correct, f->double_ulps)) { - if( IsDoubleResultSubnormal( correct2, f->double_ulps ) ) + if (IsDoubleResultSubnormal(correct2, + f->double_ulps)) { - fail = fail && ! ( test == 0.0f && test2 == 0.0f ); - if( ! fail ) + fail = fail && !(test == 0.0f && test2 == 0.0f); + if (!fail) { err = 0.0f; err2 = 0.0f; @@ -808,168 +954,214 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode) } else { - fail = fail && ! 
( test == 0.0f && fabsf(err2) <= f->double_ulps); - if( ! fail ) - err = 0.0f; + fail = fail + && !(test == 0.0f + && fabsf(err2) <= f->double_ulps); + if (!fail) err = 0.0f; } } - else if( IsDoubleResultSubnormal( correct2, f->double_ulps ) ) + else if (IsDoubleResultSubnormal(correct2, + f->double_ulps)) { - fail = fail && ! ( test2 == 0.0f && fabsf(err) <= f->double_ulps); - if( ! fail ) - err2 = 0.0f; + fail = fail + && !(test2 == 0.0f + && fabsf(err) <= f->double_ulps); + if (!fail) err2 = 0.0f; } // retry per section 6.5.3.3 - if( IsDoubleSubnormal( s[j] ) ) + if (IsDoubleSubnormal(s[j])) { long double correct2p, correct2n; - long double correctp = f->dfunc.f_fpf( 0.0, &correct2p ); - long double correctn = f->dfunc.f_fpf( -0.0, &correct2n ); - float errp = Bruteforce_Ulp_Error_Double( test, correctp ); - float err2p = Bruteforce_Ulp_Error_Double( test, correct2p ); - float errn = Bruteforce_Ulp_Error_Double( test, correctn ); - float err2n = Bruteforce_Ulp_Error_Double( test, correct2n ); - fail = fail && ((!(fabsf(errp) <= f->double_ulps)) && (!(fabsf(err2p) <= f->double_ulps)) && - ((!(fabsf(errn) <= f->double_ulps)) && (!(fabsf(err2n) <= f->double_ulps))) ); - if( fabsf( errp ) < fabsf(err ) ) - err = errp; - if( fabsf( errn ) < fabsf(err ) ) - err = errn; - if( fabsf( err2p ) < fabsf(err2 ) ) - err2 = err2p; - if( fabsf( err2n ) < fabsf(err2 ) ) - err2 = err2n; + long double correctp = + f->dfunc.f_fpf(0.0, &correct2p); + long double correctn = + f->dfunc.f_fpf(-0.0, &correct2n); + float errp = + Bruteforce_Ulp_Error_Double(test, correctp); + float err2p = + Bruteforce_Ulp_Error_Double(test, correct2p); + float errn = + Bruteforce_Ulp_Error_Double(test, correctn); + float err2n = + Bruteforce_Ulp_Error_Double(test, correct2n); + fail = fail + && ((!(fabsf(errp) <= f->double_ulps)) + && (!(fabsf(err2p) <= f->double_ulps)) + && ((!(fabsf(errn) <= f->double_ulps)) + && (!(fabsf(err2n) + <= f->double_ulps)))); + if (fabsf(errp) < fabsf(err)) err = errp; + if 
(fabsf(errn) < fabsf(err)) err = errn; + if (fabsf(err2p) < fabsf(err2)) err2 = err2p; + if (fabsf(err2n) < fabsf(err2)) err2 = err2n; // retry per section 6.5.3.4 - if( IsDoubleResultSubnormal( correctp, f->double_ulps ) || IsDoubleResultSubnormal( correctn, f->double_ulps ) ) + if (IsDoubleResultSubnormal(correctp, + f->double_ulps) + || IsDoubleResultSubnormal(correctn, + f->double_ulps)) { - if( IsDoubleResultSubnormal( correct2p, f->double_ulps ) || IsDoubleResultSubnormal( correct2n, f->double_ulps ) ) + if (IsDoubleResultSubnormal(correct2p, + f->double_ulps) + || IsDoubleResultSubnormal(correct2n, + f->double_ulps)) { - fail = fail && !( test == 0.0f && test2 == 0.0f); - if( ! fail ) - err = err2 = 0.0f; + fail = fail + && !(test == 0.0f && test2 == 0.0f); + if (!fail) err = err2 = 0.0f; } else { - fail = fail && ! (test == 0.0f && fabsf(err2) <= f->double_ulps); - if( ! fail ) - err = 0.0f; + fail = fail + && !(test == 0.0f + && fabsf(err2) <= f->double_ulps); + if (!fail) err = 0.0f; } } - else if( IsDoubleResultSubnormal( correct2p, f->double_ulps ) || IsDoubleResultSubnormal( correct2n, f->double_ulps ) ) + else if (IsDoubleResultSubnormal(correct2p, + f->double_ulps) + || IsDoubleResultSubnormal(correct2n, + f->double_ulps)) { - fail = fail && ! (test2 == 0.0f && (fabsf(err) <= f->double_ulps)); - if( ! fail ) - err2 = 0.0f; + fail = fail + && !(test2 == 0.0f + && (fabsf(err) <= f->double_ulps)); + if (!fail) err2 = 0.0f; } } } - if( fabsf(err ) > maxError0 ) + if (fabsf(err) > maxError0) { maxError0 = fabsf(err); maxErrorVal0 = s[j]; } - if( fabsf(err2 ) > maxError1 ) + if (fabsf(err2) > maxError1) { maxError1 = fabsf(err2); maxErrorVal1 = s[j]; } - if( fail ) + if (fail) { - vlog_error( "\nERROR: %sD%s: {%f, %f} ulp error at %.13la: *{%.13la, %.13la} vs. 
{%.13la, %.13la}\n", f->name, sizeNames[k], err, err2, ((double*) gIn)[j], ((double*) gOut_Ref)[j], ((double*) gOut_Ref2)[j], test, test2 ); - error = -1; - goto exit; + vlog_error( + "\nERROR: %sD%s: {%f, %f} ulp error at %.13la: " + "*{%.13la, %.13la} vs. {%.13la, %.13la}\n", + f->name, sizeNames[k], err, err2, + ((double *)gIn)[j], ((double *)gOut_Ref)[j], + ((double *)gOut_Ref2)[j], test, test2); + error = -1; + goto exit; } } } } - if( 0 == (i & 0x0fffffff) ) + if (0 == (i & 0x0fffffff)) { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize); - } else - { - vlog("." ); - } - fflush(stdout); + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, + bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); } } - if( ! gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input array - double *p = (double*) gIn; - for( j = 0; j < bufferSize / sizeof( double ); j++ ) - p[j] = DoubleFromUInt32(genrand_int32(d) ); - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + // Init input array + double *p = (double *)gIn; + for (j = 0; j < bufferSize / sizeof(double); j++) + p[j] = DoubleFromUInt32(genrand_int32(d)); + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_double); size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if( ( error = 
clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j]) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), + &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( k = 0; k < PERF_LOOP_COUNT; k++ ) + for (k = 0; k < PERF_LOOP_COUNT; k++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", 
f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(double)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", + f->name, sizeNames[j]); } - for( ; j < gMaxVectorSizeIndex; j++ ) - vlog( "\t -- " ); + for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); } - if( ! gSkipCorrectnessTesting ) - vlog( "\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0, maxErrorVal1 ); - vlog( "\n" ); + if (!gSkipCorrectnessTesting) + vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0, + maxErrorVal1); + vlog("\n"); exit: // Release - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); clReleaseProgram(programs[k]); @@ -977,6 +1169,3 @@ exit: return error; } - - - diff --git a/test_conformance/math_brute_force/unary_two_results_i.cpp b/test_conformance/math_brute_force/unary_two_results_i.cpp index 15326882..c71de0ed 100644 --- a/test_conformance/math_brute_force/unary_two_results_i.cpp +++ b/test_conformance/math_brute_force/unary_two_results_i.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -34,63 +34,82 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p, bool relaxedMode) { - const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global int", sizeNames[vectorSize], "* out2, __global float", sizeNames[vectorSize], "* in)\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in[i], out2 + i );\n" - "}\n" - }; - const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global int* out2, __global float* in)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " float3 f0 = vload3( 0, in + 3 * i );\n" - " int3 iout = INT_MIN;\n" - " f0 = ", name, "( f0, &iout );\n" - " vstore3( f0, 0, out + 3*i );\n" - " vstore3( iout, 0, out2 + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two buffer size \n" - " int3 iout = INT_MIN;\n" - " float3 f0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (float3)( in[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " f0 = ", name, "( f0, &iout );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " out2[3*i+1] = iout.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " out2[3*i] = iout.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + const char *c[] = { "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float", + sizeNames[vectorSize], + "* out, __global int", + sizeNames[vectorSize], + "* out2, __global float", + sizeNames[vectorSize], + "* in)\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i], out2 + i );\n" + "}\n" }; + const char *c3[] = { + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float* out, __global int* out2, __global float* in)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " float3 f0 = vload3( 0, in + 3 * i );\n" + " int3 iout = INT_MIN;\n" + " f0 = ", + name, + "( f0, &iout );\n" + " vstore3( f0, 0, out + 3*i );\n" + " vstore3( iout, 0, out2 + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " int3 iout = INT_MIN;\n" + " float3 f0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (float3)( in[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " f0 = ", + name, + "( f0, &iout );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " out2[3*i+1] = iout.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " out2[3*i] = iout.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } @@ -99,97 +118,120 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, cl_program *p, bool relaxedMode) { const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global int", sizeNames[vectorSize], "* out2, __global double", sizeNames[vectorSize], "* in)\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in[i], out2 + i );\n" - "}\n" - }; - const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global int* out2, __global double* in)\n" + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double", + sizeNames[vectorSize], + "* out, __global int", + sizeNames[vectorSize], + "* out2, __global double", 
+ sizeNames[vectorSize], + "* in)\n" "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " double3 f0 = vload3( 0, in + 3 * i );\n" - " int3 iout = INT_MIN;\n" - " f0 = ", name, "( f0, &iout );\n" - " vstore3( f0, 0, out + 3*i );\n" - " vstore3( iout, 0, out2 + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" - " int3 iout = INT_MIN;\n" - " double3 f0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (double3)( in[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " f0 = ", name, "( f0, &iout );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " out2[3*i+1] = iout.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " out2[3*i] = iout.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i], out2 + i );\n" + "}\n" }; + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double* out, __global int* out2, __global double* in)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " double3 f0 = vload3( 0, in + 3 * i );\n" + " int3 iout = INT_MIN;\n" + " f0 = ", + name, + "( f0, &iout );\n" + " vstore3( f0, 0, out + 3*i );\n" + " vstore3( iout, 0, out2 + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " int3 iout = INT_MIN;\n" + " double3 f0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (double3)( in[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " f0 = ", + name, + "( f0, &iout );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " out2[3*i+1] = iout.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " out2[3*i] = iout.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } typedef struct BuildKernelInfo { - cl_uint offset; // the first vector size to build - cl_kernel *kernels; - cl_program *programs; - const char *nameInCode; + cl_uint offset; // the first vector size to build + cl_kernel *kernels; + cl_program *programs; + const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-}BuildKernelInfo; +} BuildKernelInfo; -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernel(info->nameInCode, i, info->kernels + i, info->programs + i, info->relaxedMode); } -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernelDouble(info->nameInCode, i, info->kernels + i, info->programs + i, info->relaxedMode); } -cl_ulong abs_cl_long( cl_long i ); -cl_ulong abs_cl_long( cl_long i ) +cl_ulong abs_cl_long(cl_long i); +cl_ulong abs_cl_long(cl_long i) { cl_long mask = i >> 63; return (i ^ mask) - mask; @@ -200,22 +242,22 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode) uint64_t i; uint32_t j, k; int error; - cl_program programs[ VECTOR_SIZE_COUNT ]; - cl_kernel kernels[ VECTOR_SIZE_COUNT ]; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError = 0.0f; int64_t maxError2 = 0; int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); float maxErrorVal = 0.0f; float maxErrorVal2 = 0.0f; - size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE; + size_t bufferSize = (gWimpyMode) ? 
gWimpyBufferSize : BUFFER_SIZE; float float_ulps; uint64_t step = getTestStep(sizeof(float), bufferSize); - int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( float )) + 1); - cl_ulong maxiError; + int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1); + cl_ulong maxiError; logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); - if( gIsEmbedded ) + if (gIsEmbedded) float_ulps = f->float_embedded_ulps; else float_ulps = f->float_ulps; @@ -225,147 +267,179 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode) // Init the kernels BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_FloatFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) return error; -/* - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) - if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) ) - return error; -*/ + /* + for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, + programs + i) ) ) return error; + */ - for( i = 0; i < (1ULL<<32); i += step ) + for (i = 0; i < (1ULL << 32); i += step) { - //Init input array + // Init input array uint32_t *p = (uint32_t *)gIn; - if( gWimpyMode ) + if (gWimpyMode) { - for( j = 0; j < bufferSize / sizeof( float ); j++ ) - p[j] = (uint32_t) i + j * scale; + for (j = 0; j < bufferSize / sizeof(float); j++) + p[j] = (uint32_t)i + j * scale; } else { - for( j = 0; j < bufferSize / sizeof( float ); j++ ) - p[j] = (uint32_t) i + j; + for (j = 0; j < bufferSize / sizeof(float); j++) + p[j] = (uint32_t)i + j; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, 
NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // write garbage into output arrays - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { uint32_t pattern = 0xffffdead; memset_pattern4(gOut[j], &pattern, bufferSize); - if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) )) + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); goto exit; } memset_pattern4(gOut2[j], &pattern, bufferSize); - if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL) )) + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", + error, j); goto exit; } } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_float); size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } - - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, 
&localCount, NULL, 0, NULL, NULL)) ) + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), + &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(gQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); - //Calculate the correctly rounded reference result + // Calculate the correctly rounded reference result float *r = (float *)gOut_Ref; int *r2 = (int *)gOut_Ref2; float *s = (float *)gIn; - for( j = 0; j < bufferSize / sizeof( float ); j++ ) - r[j] = (float) f->func.f_fpI( s[j], r2+j ); + for (j = 0; j < bufferSize / sizeof(float); j++) + r[j] = (float)f->func.f_fpI(s[j], r2 + j); // Read the data back - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) ) + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "ReadArray failed %d\n", error ); + vlog_error("ReadArray failed %d\n", error); goto exit; } - if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) ) + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) { - vlog_error( "ReadArray2 failed %d\n", error ); + vlog_error("ReadArray2 failed 
%d\n", error); goto exit; } } - if( gSkipCorrectnessTesting ) - break; + if (gSkipCorrectnessTesting) break; - //Verify data + // Verify data uint32_t *t = (uint32_t *)gOut_Ref; int32_t *t2 = (int32_t *)gOut_Ref2; - for( j = 0; j < bufferSize / sizeof( float ); j++ ) + for (j = 0; j < bufferSize / sizeof(float); j++) { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { uint32_t *q = (uint32_t *)(gOut[k]); int32_t *q2 = (int32_t *)(gOut2[k]); // If we aren't getting the correctly rounded result - if( t[j] != q[j] || t2[j] != q2[j] ) + if (t[j] != q[j] || t2[j] != q2[j]) { - float test = ((float*) q)[j]; + float test = ((float *)q)[j]; int correct2 = INT_MIN; - double correct = f->func.f_fpI( s[j], &correct2 ); - float err = Ulp_Error( test, correct ); - cl_long iErr = (int64_t) q2[j] - (int64_t) correct2; - int fail = ! (fabsf(err) <= float_ulps && abs_cl_long( iErr ) <= maxiError ); - if( ftz ) + double correct = f->func.f_fpI(s[j], &correct2); + float err = Ulp_Error(test, correct); + cl_long iErr = (int64_t)q2[j] - (int64_t)correct2; + int fail = !(fabsf(err) <= float_ulps + && abs_cl_long(iErr) <= maxiError); + if (ftz) { // retry per section 6.5.3.2 - if( IsFloatResultSubnormal(correct, float_ulps ) ) + if (IsFloatResultSubnormal(correct, float_ulps)) { - fail = fail && ! ( test == 0.0f && iErr == 0 ); - if( ! 
fail ) - err = 0.0f; + fail = fail && !(test == 0.0f && iErr == 0); + if (!fail) err = 0.0f; } // retry per section 6.5.3.3 - if( IsFloatSubnormal( s[j] ) ) + if (IsFloatSubnormal(s[j])) { int correct5, correct6; - double correct3 = f->func.f_fpI( 0.0, &correct5 ); - double correct4 = f->func.f_fpI( -0.0, &correct6 ); - float err2 = Ulp_Error( test, correct3 ); - float err3 = Ulp_Error( test, correct4 ); - cl_long iErr2 = (long long) q2[j] - (long long) correct5; - cl_long iErr3 = (long long) q2[j] - (long long) correct6; + double correct3 = f->func.f_fpI(0.0, &correct5); + double correct4 = f->func.f_fpI(-0.0, &correct6); + float err2 = Ulp_Error(test, correct3); + float err3 = Ulp_Error(test, correct4); + cl_long iErr2 = + (long long)q2[j] - (long long)correct5; + cl_long iErr3 = + (long long)q2[j] - (long long)correct6; // Did +0 work? - if( fabsf(err2) <= float_ulps && abs_cl_long( iErr2 ) <= maxiError ) + if (fabsf(err2) <= float_ulps + && abs_cl_long(iErr2) <= maxiError) { err = err2; iErr = iErr2; fail = 0; } // Did -0 work? - else if(fabsf(err3) <= float_ulps && abs_cl_long( iErr3 ) <= maxiError) + else if (fabsf(err3) <= float_ulps + && abs_cl_long(iErr3) <= maxiError) { err = err3; iErr = iErr3; @@ -373,10 +447,17 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode) } // retry per section 6.5.3.4 - if( fail && (IsFloatResultSubnormal(correct2, float_ulps ) || IsFloatResultSubnormal(correct3, float_ulps )) ) + if (fail + && (IsFloatResultSubnormal(correct2, float_ulps) + || IsFloatResultSubnormal(correct3, + float_ulps))) { - fail = fail && ! ( test == 0.0f && (abs_cl_long( iErr2 ) <= maxiError || abs_cl_long( iErr3 ) <= maxiError) ); - if( ! 
fail ) + fail = fail + && !(test == 0.0f + && (abs_cl_long(iErr2) <= maxiError + || abs_cl_long(iErr3) + <= maxiError)); + if (!fail) { err = 0.0f; iErr = 0; @@ -384,20 +465,24 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode) } } } - if( fabsf(err ) > maxError ) + if (fabsf(err) > maxError) { maxError = fabsf(err); maxErrorVal = s[j]; } - if( llabs(iErr) > maxError2 ) + if (llabs(iErr) > maxError2) { - maxError2 = llabs(iErr ); + maxError2 = llabs(iErr); maxErrorVal2 = s[j]; } - if( fail ) + if (fail) { - vlog_error( "\nERROR: %s%s: {%f, %d} ulp error at %a: *{%a, %d} vs. {%a, %d}\n", f->name, sizeNames[k], err, (int) iErr, ((float*) gIn)[j], ((float*) gOut_Ref)[j], ((int*) gOut_Ref2)[j], test, q2[j] ); + vlog_error("\nERROR: %s%s: {%f, %d} ulp error at %a: " + "*{%a, %d} vs. {%a, %d}\n", + f->name, sizeNames[k], err, (int)iErr, + ((float *)gIn)[j], ((float *)gOut_Ref)[j], + ((int *)gOut_Ref2)[j], test, q2[j]); error = -1; goto exit; } @@ -405,88 +490,109 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode) } } - if( 0 == (i & 0x0fffffff) ) + if (0 == (i & 0x0fffffff)) { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize); - } else - { - vlog("." ); - } - fflush(stdout); + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, + bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); } } - if( ! 
gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input array + // Init input array uint32_t *p = (uint32_t *)gIn; - for( j = 0; j < bufferSize / sizeof( float ); j++ ) + for (j = 0; j < bufferSize / sizeof(float); j++) p[j] = genrand_int32(d); - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_float); size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), + &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( k = 0; k < PERF_LOOP_COUNT; k++ ) + for (k = 0; k < PERF_LOOP_COUNT; k++) 
{ uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(float)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", + f->name, sizeNames[j]); } } - if( ! 
gSkipCorrectnessTesting ) - vlog( "\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal ); - vlog( "\n" ); + if (!gSkipCorrectnessTesting) + vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal); + vlog("\n"); exit: // Release - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); clReleaseProgram(programs[k]); @@ -500,18 +606,18 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode) uint64_t i; uint32_t j, k; int error; - cl_program programs[ VECTOR_SIZE_COUNT ]; - cl_kernel kernels[ VECTOR_SIZE_COUNT ]; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError = 0.0f; int64_t maxError2 = 0; int ftz = f->ftz || gForceFTZ; double maxErrorVal = 0.0f; double maxErrorVal2 = 0.0f; - cl_ulong maxiError = f->double_ulps == INFINITY ? CL_ULONG_MAX : 0; - size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE; + cl_ulong maxiError = f->double_ulps == INFINITY ? CL_ULONG_MAX : 0; + size_t bufferSize = (gWimpyMode) ? 
gWimpyBufferSize : BUFFER_SIZE; uint64_t step = getTestStep(sizeof(double), bufferSize); - int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( double )) + 1); + int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(double)) + 1); logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); @@ -520,151 +626,185 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode) // Init the kernels BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_DoubleFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_DoubleFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) { return error; } -/* - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) - if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) ) - return error; -*/ + /* + for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + + i, programs + i) ) ) return error; + */ - for( i = 0; i < (1ULL<<32); i += step ) + for (i = 0; i < (1ULL << 32); i += step) { - //Init input array + // Init input array double *p = (double *)gIn; - if( gWimpyMode ) + if (gWimpyMode) { - for( j = 0; j < bufferSize / sizeof( double ); j++ ) - p[j] = DoubleFromUInt32((uint32_t) i + j * scale); + for (j = 0; j < bufferSize / sizeof(double); j++) + p[j] = DoubleFromUInt32((uint32_t)i + j * scale); } else { - for( j = 0; j < bufferSize / sizeof( double ); j++ ) - p[j] = DoubleFromUInt32((uint32_t) i + j); + for (j = 0; j < bufferSize / sizeof(double); j++) + p[j] = DoubleFromUInt32((uint32_t)i + j); } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer 
***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // write garbage into output arrays - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { uint32_t pattern = 0xffffdead; memset_pattern4(gOut[j], &pattern, bufferSize); - if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) )) + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); goto exit; } memset_pattern4(gOut2[j], &pattern, bufferSize); - if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL) )) + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", + error, j); goto exit; } } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_double); size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } - - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = 
clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), + &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(gQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); - //Calculate the correctly rounded reference result + // Calculate the correctly rounded reference result double *r = (double *)gOut_Ref; int *r2 = (int *)gOut_Ref2; double *s = (double *)gIn; - for( j = 0; j < bufferSize / sizeof( double ); j++ ) - r[j] = (double) f->dfunc.f_fpI( s[j], r2+j ); + for (j = 0; j < bufferSize / sizeof(double); j++) + r[j] = (double)f->dfunc.f_fpI(s[j], r2 + j); // Read the data back - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) ) + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "ReadArray failed %d\n", error ); + vlog_error("ReadArray failed %d\n", error); goto exit; } - if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) ) + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) { - vlog_error( "ReadArray2 failed %d\n", error ); + vlog_error("ReadArray2 failed %d\n", error); goto exit; } } - if( 
gSkipCorrectnessTesting ) - break; + if (gSkipCorrectnessTesting) break; - //Verify data + // Verify data uint64_t *t = (uint64_t *)gOut_Ref; int32_t *t2 = (int32_t *)gOut_Ref2; - for( j = 0; j < bufferSize / sizeof( double ); j++ ) + for (j = 0; j < bufferSize / sizeof(double); j++) { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { uint64_t *q = (uint64_t *)(gOut[k]); int32_t *q2 = (int32_t *)(gOut2[k]); // If we aren't getting the correctly rounded result - if( t[j] != q[j] || t2[j] != q2[j] ) + if (t[j] != q[j] || t2[j] != q2[j]) { - double test = ((double*) q)[j]; + double test = ((double *)q)[j]; int correct2 = INT_MIN; - long double correct = f->dfunc.f_fpI( s[j], &correct2 ); - float err = Bruteforce_Ulp_Error_Double( test, correct ); - cl_long iErr = (long long) q2[j] - (long long) correct2; - int fail = ! (fabsf(err) <= f->double_ulps && abs_cl_long( iErr ) <= maxiError ); - if( ftz ) + long double correct = f->dfunc.f_fpI(s[j], &correct2); + float err = Bruteforce_Ulp_Error_Double(test, correct); + cl_long iErr = (long long)q2[j] - (long long)correct2; + int fail = !(fabsf(err) <= f->double_ulps + && abs_cl_long(iErr) <= maxiError); + if (ftz) { // retry per section 6.5.3.2 - if( IsDoubleResultSubnormal(correct, f->double_ulps ) ) + if (IsDoubleResultSubnormal(correct, f->double_ulps)) { - fail = fail && ! ( test == 0.0f && iErr == 0 ); - if( ! 
fail ) - err = 0.0f; + fail = fail && !(test == 0.0f && iErr == 0); + if (!fail) err = 0.0f; } // retry per section 6.5.3.3 - if( IsDoubleSubnormal( s[j] ) ) + if (IsDoubleSubnormal(s[j])) { int correct5, correct6; - long double correct3 = f->dfunc.f_fpI( 0.0, &correct5 ); - long double correct4 = f->dfunc.f_fpI( -0.0, &correct6 ); - float err2 = Bruteforce_Ulp_Error_Double( test, correct3 ); - float err3 = Bruteforce_Ulp_Error_Double( test, correct4 ); - cl_long iErr2 = (long long) q2[j] - (long long) correct5; - cl_long iErr3 = (long long) q2[j] - (long long) correct6; + long double correct3 = + f->dfunc.f_fpI(0.0, &correct5); + long double correct4 = + f->dfunc.f_fpI(-0.0, &correct6); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct3); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct4); + cl_long iErr2 = + (long long)q2[j] - (long long)correct5; + cl_long iErr3 = + (long long)q2[j] - (long long)correct6; // Did +0 work? - if( fabsf(err2) <= f->double_ulps && abs_cl_long( iErr2 ) <= maxiError ) + if (fabsf(err2) <= f->double_ulps + && abs_cl_long(iErr2) <= maxiError) { err = err2; iErr = iErr2; fail = 0; } // Did -0 work? - else if(fabsf(err3) <= f->double_ulps && abs_cl_long( iErr3 ) <= maxiError) + else if (fabsf(err3) <= f->double_ulps + && abs_cl_long(iErr3) <= maxiError) { err = err3; iErr = iErr3; @@ -672,10 +812,18 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode) } // retry per section 6.5.3.4 - if( fail && (IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )) ) + if (fail + && (IsDoubleResultSubnormal(correct2, + f->double_ulps) + || IsDoubleResultSubnormal(correct3, + f->double_ulps))) { - fail = fail && ! ( test == 0.0f && (abs_cl_long( iErr2 ) <= maxiError || abs_cl_long( iErr3 ) <= maxiError) ); - if( ! 
fail ) + fail = fail + && !(test == 0.0f + && (abs_cl_long(iErr2) <= maxiError + || abs_cl_long(iErr3) + <= maxiError)); + if (!fail) { err = 0.0f; iErr = 0; @@ -683,20 +831,24 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode) } } } - if( fabsf(err ) > maxError ) + if (fabsf(err) > maxError) { maxError = fabsf(err); maxErrorVal = s[j]; } - if( llabs(iErr) > maxError2 ) + if (llabs(iErr) > maxError2) { - maxError2 = llabs(iErr ); + maxError2 = llabs(iErr); maxErrorVal2 = s[j]; } - if( fail ) + if (fail) { - vlog_error( "\nERROR: %sD%s: {%f, %d} ulp error at %.13la: *{%.13la, %d} vs. {%.13la, %d}\n", f->name, sizeNames[k], err, (int) iErr, ((double*) gIn)[j], ((double*) gOut_Ref)[j], ((int*) gOut_Ref2)[j], test, q2[j] ); + vlog_error("\nERROR: %sD%s: {%f, %d} ulp error at " + "%.13la: *{%.13la, %d} vs. {%.13la, %d}\n", + f->name, sizeNames[k], err, (int)iErr, + ((double *)gIn)[j], ((double *)gOut_Ref)[j], + ((int *)gOut_Ref2)[j], test, q2[j]); error = -1; goto exit; } @@ -704,91 +856,111 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode) } } - if( 0 == (i & 0x0fffffff) ) + if (0 == (i & 0x0fffffff)) { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize); - } else - { - vlog("." ); - } - fflush(stdout); + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, + bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); } } - if( ! 
gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input array + // Init input array double *p = (double *)gIn; - for( j = 0; j < bufferSize / sizeof( double ); j++ ) + for (j = 0; j < bufferSize / sizeof(double); j++) p[j] = DoubleFromUInt32(genrand_int32(d)); - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_double); size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), + &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( k = 0; k < PERF_LOOP_COUNT; k++ ) + for (k = 0; k < 
PERF_LOOP_COUNT; k++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) { - vlog_error( "FAILED -- could not execute kernel\n" ); + vlog_error("FAILED -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sd%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(double)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sd%s", + f->name, sizeNames[j]); } - for( ; j < gMaxVectorSizeIndex; j++ ) - vlog( "\t -- " ); + for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); } - if( ! 
gSkipCorrectnessTesting ) - vlog( "\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal ); - vlog( "\n" ); + if (!gSkipCorrectnessTesting) + vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal); + vlog("\n"); exit: // Release - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); clReleaseProgram(programs[k]); @@ -796,6 +968,3 @@ exit: return error; } - - - diff --git a/test_conformance/math_brute_force/unary_u.cpp b/test_conformance/math_brute_force/unary_u.cpp index 97fd25f9..397ff877 100644 --- a/test_conformance/math_brute_force/unary_u.cpp +++ b/test_conformance/math_brute_force/unary_u.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -33,61 +33,77 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p, bool relaxedMode) { - const char *c[] = { - "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global uint", sizeNames[vectorSize], "* in)\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in[i] );\n" - "}\n" - }; - const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global uint* in)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " uint3 u0 = vload3( 0, in + 3 * i );\n" - " float3 f0 = ", name, "( u0 );\n" - " vstore3( f0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two buffer size \n" - " uint3 u0;\n" - " float3 f0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " u0 = (uint3)( in[3*i], 0xdead, 0xdead ); \n" - " break;\n" - " case 0:\n" - " u0 = (uint3)( in[3*i], in[3*i+1], 0xdead ); \n" - " break;\n" - " }\n" - " f0 = ", name, "( u0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + const char *c[] = { "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float", + sizeNames[vectorSize], + "* out, __global uint", + sizeNames[vectorSize], + "* in)\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i] );\n" + "}\n" }; + const char *c3[] = { + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float* out, __global uint* in)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " uint3 u0 = vload3( 0, in + 3 * i );\n" + " float3 f0 = ", + name, + "( u0 );\n" + " vstore3( f0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " uint3 u0;\n" + " float3 f0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " u0 = (uint3)( in[3*i], 0xdead, 0xdead ); \n" + " break;\n" + " case 0:\n" + " u0 = (uint3)( in[3*i], in[3*i+1], 0xdead ); \n" + " break;\n" + " }\n" + " f0 = ", + name, + "( u0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } @@ -95,90 +111,110 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, cl_program *p, bool relaxedMode) { - const char *c[] = { - "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global ulong", sizeNames[vectorSize], "* in)\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", name, "( in[i] );\n" - "}\n" - }; - - const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global ulong* in)\n" + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double", + sizeNames[vectorSize], + "* out, __global ulong", + sizeNames[vectorSize], + "* in)\n" "{\n" - " 
size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " ulong3 u0 = vload3( 0, in + 3 * i );\n" - " double3 f0 = ", name, "( u0 );\n" - " vstore3( f0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n" - " ulong3 u0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " u0 = (ulong3)( in[3*i], 0xdeaddeaddeaddeadUL, 0xdeaddeaddeaddeadUL ); \n" - " break;\n" - " case 0:\n" - " u0 = (ulong3)( in[3*i], in[3*i+1], 0xdeaddeaddeaddeadUL ); \n" - " break;\n" - " }\n" - " double3 f0 = ", name, "( u0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i] );\n" + "}\n" }; + + const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double* out, __global ulong* in)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " ulong3 u0 = vload3( 0, in + 3 * i );\n" + " double3 f0 = ", + name, + "( u0 );\n" + " vstore3( f0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how " + "many elements are left over after BUFFER_SIZE % " + "(3*sizeof(float)). 
Assume power of two buffer size \n" + " ulong3 u0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " u0 = (ulong3)( in[3*i], " + "0xdeaddeaddeaddeadUL, 0xdeaddeaddeaddeadUL ); \n" + " break;\n" + " case 0:\n" + " u0 = (ulong3)( in[3*i], in[3*i+1], " + "0xdeaddeaddeaddeadUL ); \n" + " break;\n" + " }\n" + " double3 f0 = ", + name, + "( u0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" }; const char **kern = c; - size_t kernSize = sizeof(c)/sizeof(c[0]); + size_t kernSize = sizeof(c) / sizeof(c[0]); - if( sizeValues[vectorSize] == 3 ) + if (sizeValues[vectorSize] == 3) { kern = c3; - kernSize = sizeof(c3)/sizeof(c3[0]); + kernSize = sizeof(c3) / sizeof(c3[0]); } char testName[32]; - snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] ); + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } typedef struct BuildKernelInfo { - cl_uint offset; // the first vector size to build - cl_kernel *kernels; - cl_program *programs; - const char *nameInCode; + cl_uint offset; // the first vector size to build + cl_kernel *kernels; + cl_program *programs; + const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-}BuildKernelInfo; +} BuildKernelInfo; -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernel(info->nameInCode, i, info->kernels + i, info->programs + i, info->relaxedMode); } -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ); -static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p ) +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p); +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) { - BuildKernelInfo *info = (BuildKernelInfo*) p; + BuildKernelInfo *info = (BuildKernelInfo *)p; cl_uint i = info->offset + job_id; return BuildKernelDouble(info->nameInCode, i, info->kernels + i, info->programs + i, info->relaxedMode); @@ -189,22 +225,22 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode) uint64_t i; uint32_t j, k; int error; - cl_program programs[ VECTOR_SIZE_COUNT ]; - cl_kernel kernels[ VECTOR_SIZE_COUNT ]; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError = 0.0f; int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); float maxErrorVal = 0.0f; - size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE; + size_t bufferSize = (gWimpyMode) ? 
gWimpyBufferSize : BUFFER_SIZE; uint64_t step = getTestStep(sizeof(float), bufferSize); - int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( double )) + 1); + int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(double)) + 1); int isRangeLimited = 0; float float_ulps; float half_sin_cos_tan_limit = 0; logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); - if( gIsEmbedded) + if (gIsEmbedded) float_ulps = f->float_embedded_ulps; else float_ulps = f->float_ulps; @@ -212,240 +248,282 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode) // Init the kernels BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_FloatFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) return error; -/* - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) - if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, programs + i) ) ) - return error; -*/ + /* + for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + if( (error = BuildKernel( f->nameInCode, (int) i, kernels + i, + programs + i) ) ) return error; + */ - if( 0 == strcmp( f->name, "half_sin") || 0 == strcmp( f->name, "half_cos") ) + if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos")) { isRangeLimited = 1; - half_sin_cos_tan_limit = 1.0f + float_ulps * (FLT_EPSILON/2.0f); // out of range results from finite inputs must be in [-1,1] + half_sin_cos_tan_limit = 1.0f + + float_ulps + * (FLT_EPSILON / 2.0f); // out of range results from finite + // inputs must be in [-1,1] } - else if( 0 == strcmp( f->name, "half_tan")) + else if (0 == strcmp(f->name, "half_tan")) { isRangeLimited = 1; - half_sin_cos_tan_limit = INFINITY; // out of range resut from finite inputs must be numeric + half_sin_cos_tan_limit = + INFINITY; // out of range resut from finite 
inputs must be numeric } - for( i = 0; i < (1ULL<<32); i += step ) + for (i = 0; i < (1ULL << 32); i += step) { - //Init input array + // Init input array uint32_t *p = (uint32_t *)gIn; - if( gWimpyMode ) + if (gWimpyMode) { - for( j = 0; j < bufferSize / sizeof( float ); j++ ) - p[j] = (uint32_t) i + j * scale; + for (j = 0; j < bufferSize / sizeof(float); j++) + p[j] = (uint32_t)i + j * scale; } else { - for( j = 0; j < bufferSize / sizeof( float ); j++ ) - p[j] = (uint32_t) i + j; + for (j = 0; j < bufferSize / sizeof(float); j++) + p[j] = (uint32_t)i + j; } - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL))) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // write garbage into output arrays - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { uint32_t pattern = 0xffffdead; memset_pattern4(gOut[j], &pattern, bufferSize); - if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL))) + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); goto exit; } } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_float); size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ))){ LogBuildError(programs[j]); goto exit; } - if( ( error = 
clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } - - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL))) + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) { - vlog_error( "FAILURE -- could not execute kernel\n" ); + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILURE -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(gQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); - //Calculate the correctly rounded reference result - float *r = (float*) gOut_Ref; - cl_uint *s = (cl_uint*) gIn; - for( j = 0; j < bufferSize / sizeof( float ); j++ ) - r[j] = (float) f->func.f_u( s[j] ); + // Calculate the correctly rounded reference result + float *r = (float *)gOut_Ref; + cl_uint *s = (cl_uint *)gIn; + for (j = 0; j < bufferSize / sizeof(float); j++) + r[j] = (float)f->func.f_u(s[j]); // Read the data back - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL))) + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "ReadArray failed %d\n", error ); + vlog_error("ReadArray failed %d\n", error); goto exit; } } - if( gSkipCorrectnessTesting ) - break; + if (gSkipCorrectnessTesting) break; - //Verify data - uint32_t *t = (uint32_t*) gOut_Ref; - for( j = 0; j < bufferSize / sizeof( float ); j++ ) + // Verify data + uint32_t *t = (uint32_t *)gOut_Ref; 
+ for (j = 0; j < bufferSize / sizeof(float); j++) { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { - uint32_t *q = (uint32_t*)(gOut[k]); + uint32_t *q = (uint32_t *)(gOut[k]); // If we aren't getting the correctly rounded result - if( t[j] != q[j] ) + if (t[j] != q[j]) { - float test = ((float*) q)[j]; - double correct = f->func.f_u( s[j] ); - float err = Ulp_Error( test, correct ); - int fail = ! (fabsf(err) <= float_ulps); + float test = ((float *)q)[j]; + double correct = f->func.f_u(s[j]); + float err = Ulp_Error(test, correct); + int fail = !(fabsf(err) <= float_ulps); // half_sin/cos/tan are only valid between +-2**16, Inf, NaN - if( isRangeLimited && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) && fabsf(s[j]) < INFINITY ) + if (isRangeLimited + && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) + && fabsf(s[j]) < INFINITY) { - if( fabsf( test ) <= half_sin_cos_tan_limit ) + if (fabsf(test) <= half_sin_cos_tan_limit) { err = 0; fail = 0; } } - if( fail ) + if (fail) { - if( ftz ) + if (ftz) { // retry per section 6.5.3.2 - if( IsFloatResultSubnormal(correct, float_ulps) ) + if (IsFloatResultSubnormal(correct, float_ulps)) { - fail = fail && ( test != 0.0f ); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; } } } - if( fabsf(err ) > maxError ) + if (fabsf(err) > maxError) { maxError = fabsf(err); maxErrorVal = s[j]; } - if( fail ) + if (fail) { - vlog_error( "\n%s%s: %f ulp error at 0x%8.8x: *%a vs. %a\n", f->name, sizeNames[k], err, ((uint32_t*) gIn)[j], ((float*) gOut_Ref)[j], test ); - error = -1; + vlog_error( + "\n%s%s: %f ulp error at 0x%8.8x: *%a vs. 
%a\n", + f->name, sizeNames[k], err, ((uint32_t *)gIn)[j], + ((float *)gOut_Ref)[j], test); + error = -1; goto exit; } } } } - if( 0 == (i & 0x0fffffff) ) + if (0 == (i & 0x0fffffff)) { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize); - } else - { - vlog("." ); - } - fflush(stdout); + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, + bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); } } - if( ! gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input array - uint32_t *p = (uint32_t*)gIn; - if( strstr( f->name, "exp" ) || strstr( f->name, "sin" ) || strstr( f->name, "cos" ) || strstr( f->name, "tan" ) ) - for( j = 0; j < bufferSize / sizeof( float ); j++ ) - ((float*)p)[j] = (float) genrand_real1(d); - else if( strstr( f->name, "log" ) ) - for( j = 0; j < bufferSize / sizeof( float ); j++ ) + // Init input array + uint32_t *p = (uint32_t *)gIn; + if (strstr(f->name, "exp") || strstr(f->name, "sin") + || strstr(f->name, "cos") || strstr(f->name, "tan")) + for (j = 0; j < bufferSize / sizeof(float); j++) + ((float *)p)[j] = (float)genrand_real1(d); + else if (strstr(f->name, "log")) + for (j = 0; j < bufferSize / sizeof(float); j++) p[j] = genrand_int32(d) & 0x7fffffff; else - for( j = 0; j < bufferSize / sizeof( float ); j++ ) + for (j = 0; j < bufferSize / sizeof(float); j++) p[j] = genrand_int32(d); - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // Run the 
kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_float); size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( k = 0; k < PERF_LOOP_COUNT; k++ ) + for (k = 0; k < PERF_LOOP_COUNT; k++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) { - vlog_error( "FAILURE -- could not execute kernel\n" ); + vlog_error("FAILURE -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, 
sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(float)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", + f->name, sizeNames[j]); } } - if( ! gSkipCorrectnessTesting ) - vlog( "\t%8.2f @ %a", maxError, maxErrorVal ); - vlog( "\n" ); + if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal); + vlog("\n"); exit: // Release - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); clReleaseProgram(programs[k]); @@ -454,9 +532,9 @@ exit: return error; } -static cl_ulong random64( MTdata d ) +static cl_ulong random64(MTdata d) { - return (cl_ulong) genrand_int32(d) | ((cl_ulong) genrand_int32(d) << 32); + return (cl_ulong)genrand_int32(d) | ((cl_ulong)genrand_int32(d) << 32); } int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode) @@ -464,12 +542,12 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode) uint64_t i; uint32_t j, k; int error; - cl_program programs[ VECTOR_SIZE_COUNT ]; - cl_kernel kernels[ VECTOR_SIZE_COUNT ]; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError = 0.0f; int ftz = f->ftz || gForceFTZ; double maxErrorVal = 0.0f; - size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE; + size_t bufferSize = (gWimpyMode) ? 
gWimpyBufferSize : BUFFER_SIZE; uint64_t step = getTestStep(sizeof(cl_double), bufferSize); logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); @@ -479,211 +557,243 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode) // Init the kernels BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode, relaxedMode }; - if( (error = ThreadPool_Do( BuildKernel_DoubleFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info ) )) + if ((error = ThreadPool_Do(BuildKernel_DoubleFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) { return error; } -/* - for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) - if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + i, programs + i) ) ) - return error; -*/ + /* + for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ ) + if( (error = BuildKernelDouble( f->nameInCode, (int) i, kernels + + i, programs + i) ) ) return error; + */ - for( i = 0; i < (1ULL<<32); i += step ) + for (i = 0; i < (1ULL << 32); i += step) { - //Init input array + // Init input array cl_ulong *p = (cl_ulong *)gIn; - for( j = 0; j < bufferSize / sizeof( cl_ulong ); j++ ) - p[j] = random64(d); + for (j = 0; j < bufferSize / sizeof(cl_ulong); j++) p[j] = random64(d); - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL))) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // write garbage into output arrays - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { uint32_t pattern = 0xffffdead; memset_pattern4(gOut[j], &pattern, bufferSize); - if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL))) + if 
((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); goto exit; } } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_double); size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ))){ LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } - - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL))) + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) { - vlog_error( "FAILURE -- could not execute kernel\n" ); + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILURE -- could not execute kernel\n"); goto exit; } } // Get that moving - if( (error = clFlush(gQueue) )) - vlog( "clFlush failed\n" ); + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); - //Calculate the correctly rounded reference result - double *r = (double*) gOut_Ref; - cl_ulong *s = (cl_ulong*) gIn; - for( j = 0; j < bufferSize / sizeof( cl_double ); j++ ) - r[j] = (double) f->dfunc.f_u( s[j] ); + // Calculate the correctly rounded reference result + double *r = (double *)gOut_Ref; + cl_ulong *s = (cl_ulong *)gIn; + for (j = 0; j < bufferSize / sizeof(cl_double); j++) + r[j] = (double)f->dfunc.f_u(s[j]); // 
Read the data back - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL))) + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) { - vlog_error( "ReadArray failed %d\n", error ); + vlog_error("ReadArray failed %d\n", error); goto exit; } } - if( gSkipCorrectnessTesting ) - break; + if (gSkipCorrectnessTesting) break; - //Verify data - uint64_t *t = (uint64_t*) gOut_Ref; - for( j = 0; j < bufferSize / sizeof( cl_double ); j++ ) + // Verify data + uint64_t *t = (uint64_t *)gOut_Ref; + for (j = 0; j < bufferSize / sizeof(cl_double); j++) { - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { - uint64_t *q = (uint64_t*)(gOut[k]); + uint64_t *q = (uint64_t *)(gOut[k]); // If we aren't getting the correctly rounded result - if( t[j] != q[j] ) + if (t[j] != q[j]) { - double test = ((double*) q)[j]; - long double correct = f->dfunc.f_u( s[j] ); + double test = ((double *)q)[j]; + long double correct = f->dfunc.f_u(s[j]); float err = Bruteforce_Ulp_Error_Double(test, correct); - int fail = ! (fabsf(err) <= f->double_ulps); + int fail = !(fabsf(err) <= f->double_ulps); // half_sin/cos/tan are only valid between +-2**16, Inf, NaN - if( fail ) + if (fail) { - if( ftz ) + if (ftz) { // retry per section 6.5.3.2 - if( IsDoubleResultSubnormal(correct, f->double_ulps) ) + if (IsDoubleResultSubnormal(correct, + f->double_ulps)) { - fail = fail && ( test != 0.0 ); - if( ! fail ) - err = 0.0f; + fail = fail && (test != 0.0); + if (!fail) err = 0.0f; } } } - if( fabsf(err ) > maxError ) + if (fabsf(err) > maxError) { maxError = fabsf(err); maxErrorVal = s[j]; } - if( fail ) + if (fail) { - vlog_error( "\n%s%sD: %f ulp error at 0x%16.16llx: *%.13la vs. 
%.13la\n", f->name, sizeNames[k], err, ((uint64_t*) gIn)[j], ((double*) gOut_Ref)[j], test ); - error = -1; + vlog_error("\n%s%sD: %f ulp error at 0x%16.16llx: " + "*%.13la vs. %.13la\n", + f->name, sizeNames[k], err, + ((uint64_t *)gIn)[j], + ((double *)gOut_Ref)[j], test); + error = -1; goto exit; } } } } - if( 0 == (i & 0x0fffffff) ) + if (0 == (i & 0x0fffffff)) { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, bufferSize); - } else - { - vlog("." ); - } - fflush(stdout); + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, + bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); } } - if( ! gSkipCorrectnessTesting ) + if (!gSkipCorrectnessTesting) { - if( gWimpyMode ) - vlog( "Wimp pass" ); + if (gWimpyMode) + vlog("Wimp pass"); else - vlog( "passed" ); + vlog("passed"); } - if( gMeasureTimes ) + if (gMeasureTimes) { - //Init input array - double *p = (double*) gIn; + // Init input array + double *p = (double *)gIn; - for( j = 0; j < bufferSize / sizeof( double ); j++ ) - p[j] = random64(d); - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) )) + for (j = 0; j < bufferSize / sizeof(double); j++) p[j] = random64(d); + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) { - vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error ); + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; } // Run the kernels - for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ ) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_double); size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; } - if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer 
), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; } + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } double sum = 0.0; double bestTime = INFINITY; - for( k = 0; k < PERF_LOOP_COUNT; k++ ) + for (k = 0; k < PERF_LOOP_COUNT; k++) { uint64_t startTime = GetTime(); - if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) ) + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) { - vlog_error( "FAILURE -- could not execute kernel\n" ); + vlog_error("FAILURE -- could not execute kernel\n"); goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) ); - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(double)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", + f->name, sizeNames[j]); } - for( ; j < gMaxVectorSizeIndex; j++ ) - vlog( "\t -- " ); + for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); } - if( ! 
gSkipCorrectnessTesting ) - vlog( "\t%8.2f @ %a", maxError, maxErrorVal ); - vlog( "\n" ); + if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal); + vlog("\n"); exit: // Release - for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ ) + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); clReleaseProgram(programs[k]); @@ -691,4 +801,3 @@ exit: return error; } -