// // Copyright (c) 2017 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // #ifndef UTILITY_H #define UTILITY_H #ifdef __APPLE__ #include #else #include #endif #include #if !defined(_MSC_VER) #include #endif #include #include #include "../../test_common/harness/rounding_mode.h" #include "../../test_common/harness/fpcontrol.h" #if defined( _WIN32) && defined (_MSC_VER) #include "../../test_common/harness/testHarness.h" #endif #include "../../test_common/harness/compat.h" #include "../../test_common/harness/ThreadPool.h" #define BUFFER_SIZE (1024*1024*2) #if defined( __GNUC__ ) #define UNUSED __attribute__ ((unused)) #else #define UNUSED #endif #define VECTOR_SIZE_COUNT 6 extern const char *sizeNames[VECTOR_SIZE_COUNT]; extern const int sizeValues[VECTOR_SIZE_COUNT]; extern cl_device_type gDeviceType; extern cl_device_id gDevice; extern cl_context gContext; extern cl_command_queue gQueue; extern void *gIn; extern void *gIn2; extern void *gIn3; extern void *gOut_Ref; extern void *gOut_Ref2; extern void *gOut[VECTOR_SIZE_COUNT]; extern void *gOut2[VECTOR_SIZE_COUNT]; extern cl_mem gInBuffer; extern cl_mem gInBuffer2; extern cl_mem gInBuffer3; extern cl_mem gOutBuffer[VECTOR_SIZE_COUNT]; extern cl_mem gOutBuffer2[VECTOR_SIZE_COUNT]; extern uint32_t gComputeDevices; extern uint32_t gSimdSize; extern int gSkipCorrectnessTesting; extern int gMeasureTimes; extern int gReportAverageTimes; extern int gForceFTZ; extern int gWimpyMode; extern int gHasDouble; extern int gIsInRTZMode; extern int gInfNanSupport; extern int gIsEmbedded; extern uint32_t gMaxVectorSizeIndex; extern uint32_t gMinVectorSizeIndex; extern uint32_t gDeviceFrequency; extern cl_device_fp_config gFloatCapabilities; extern cl_device_fp_config gDoubleCapabilities; #if !defined( _MSC_VER) #include #endif #define LOWER_IS_BETTER 0 #define HIGHER_IS_BETTER 1 #if USE_ATF #include #define test_start() ATFTestStart() #define test_finish() ATFTestFinish() #define vlog( ... ) ATFLogInfo(__VA_ARGS__) #define vlog_error( ... ) ATFLogError(__VA_ARGS__) #define vlog_perf( _number, _higherIsBetter, _units, _nameFmt, ... ) ATFLogPerformanceNumber(_number, _higherIsBetter, _units, _nameFmt, __VA_ARGS__ ) #else #define test_start() #define test_finish() #define vlog( ... ) printf( __VA_ARGS__ ) #define vlog_error( ... ) printf( __VA_ARGS__ ) #define vlog_perf( _number, _higherIsBetter, _units, _nameFmt, ... ) printf( "\t%8.2f", _number ) void _logPerf(double number, int higherIsBetter, const char *units, const char *nameFormat, ...); #endif #if defined (_MSC_VER ) //Deal with missing scalbn on windows #define scalbnf( _a, _i ) ldexpf( _a, _i ) #define scalbn( _a, _i ) ldexp( _a, _i ) #define scalbnl( _a, _i ) ldexpl( _a, _i ) #endif #ifdef __cplusplus extern "C" { #endif float Ulp_Error( float test, double reference ); //float Ulp_Error_Half( float test, double reference ); float Ulp_Error_Double( double test, long double reference ); #ifdef __cplusplus } //extern "C" #endif uint64_t GetTime( void ); double SubtractTime( uint64_t endTime, uint64_t startTime ); int MakeKernel( const char **c, cl_uint count, const char *name, cl_kernel *k, cl_program *p ); int MakeKernels( const char **c, cl_uint count, const char *name, cl_uint kernel_count, cl_kernel *k, cl_program *p ); // used to convert a bucket of bits into a search pattern through double static inline double DoubleFromUInt32( uint32_t bits ); static inline double DoubleFromUInt32( uint32_t bits ) { union{ uint64_t u; double d;} u; // split 0x89abcdef to 0x89abc00000000def u.u = bits & 0xfffU; u.u |= (uint64_t) (bits & ~0xfffU) << 32; // sign extend the leading bit of def segment as sign bit so that the middle region consists of either all 1s or 0s u.u -= (bits & 0x800U) << 1; // return result return u.d; } void _LogBuildError( cl_program p, int line, const char *file ); #define LogBuildError( program ) _LogBuildError( program, __LINE__, __FILE__ ) #ifndef MAX #define MAX(_a, _b) ((_a) > (_b) ? (_a) : (_b)) #endif #ifndef MIN #define MIN(_a, _b) ((_a) < (_b) ? (_a) : (_b)) #endif #define PERF_LOOP_COUNT 100 // Note: though this takes a double, this is for use with single precision tests static inline int IsFloatSubnormal( double x ) { #if 2 == FLT_RADIX // Do this in integer to avoid problems with FTZ behavior union{ float d; uint32_t u;}u; u.d = fabsf((float)x); return (u.u-1) < 0x007fffffU; #else // rely on floating point hardware for non-radix2 non-IEEE-754 hardware -- will fail if you flush subnormals to zero return fabs(x) < (double) FLT_MIN && x != 0.0; #endif } static inline int IsDoubleSubnormal( long double x ) { #if 2 == FLT_RADIX // Do this in integer to avoid problems with FTZ behavior union{ double d; uint64_t u;}u; u.d = fabs((double) x); return (u.u-1) < 0x000fffffffffffffULL; #else // rely on floating point hardware for non-radix2 non-IEEE-754 hardware -- will fail if you flush subnormals to zero return fabs(x) < (double) DBL_MIN && x != 0.0; #endif } //The spec is fairly clear that we may enforce a hard cutoff to prevent premature flushing to zero. // However, to avoid conflict for 1.0, we are letting results at TYPE_MIN + ulp_limit to be flushed to zero. static inline int IsFloatResultSubnormal( double x, float ulps ) { x = fabs(x) - MAKE_HEX_DOUBLE( 0x1.0p-149, 0x1, -149) * (double) ulps; return x < MAKE_HEX_DOUBLE( 0x1.0p-126, 0x1, -126 ); } static inline int IsDoubleResultSubnormal( long double x, float ulps ) { x = fabsl(x) - MAKE_HEX_LONG( 0x1.0p-1074, 0x1, -1074) * (long double) ulps; return x < MAKE_HEX_LONG( 0x1.0p-1022, 0x1, -1022 ); } static inline int IsFloatInfinity(double x) { union { cl_float d; cl_uint u; } u; u.d = (cl_float) x; return ((u.u & 0x7fffffffU) == 0x7F800000U); } static inline int IsFloatMaxFloat(double x) { union { cl_float d; cl_uint u; } u; u.d = (cl_float) x; return ((u.u & 0x7fffffffU) == 0x7F7FFFFFU); } static inline int IsFloatNaN(double x) { union { cl_float d; cl_uint u; } u; u.d = (cl_float) x; return ((u.u & 0x7fffffffU) > 0x7F800000U); } extern cl_uint RoundUpToNextPowerOfTwo( cl_uint x ); // Windows (since long double got deprecated) sets the x87 to 53-bit precision // (that's x87 default state). This causes problems with the tests that // convert long and ulong to float and double or otherwise deal with values // that need more precision than 53-bit. So, set the x87 to 64-bit precision. static inline void Force64BitFPUPrecision(void) { #if __MINGW32__ // The usual method is to use _controlfp as follows: // #include // _controlfp(_PC_64, _MCW_PC); // // _controlfp is available on MinGW32 but not on MinGW64. Instead of having // divergent code just use inline assembly which works for both. unsigned short int orig_cw = 0; unsigned short int new_cw = 0; __asm__ __volatile__ ("fstcw %0":"=m" (orig_cw)); new_cw = orig_cw | 0x0300; // set precision to 64-bit __asm__ __volatile__ ("fldcw %0"::"m" (new_cw)); #else /* Implement for other platforms if needed */ #endif } #ifdef __cplusplus extern "C" #else extern #endif void memset_pattern4(void *dest, const void *src_pattern, size_t bytes ); typedef union { int32_t i; float f; }int32f_t; typedef union { int64_t l; double d; }int64d_t; void MulD(double *rhi, double *rlo, double u, double v); void AddD(double *rhi, double *rlo, double a, double b); void MulDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl); void AddDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl); void DivideDD(double *chi, double *clo, double a, double b); int compareFloats(float x, float y); int compareDoubles(double x, double y); #endif /* UTILITY_H */