mirror of
https://github.com/KhronosGroup/OpenCL-CTS.git
synced 2026-03-23 15:39:03 +00:00
Initial open source release of OpenCL 2.0 CTS.
This commit is contained in:
76
test_conformance/math_brute_force/CMakeLists.txt
Normal file
76
test_conformance/math_brute_force/CMakeLists.txt
Normal file
@@ -0,0 +1,76 @@
|
||||
set(MODULE_NAME BRUTEFORCE)

# Sources that live in this directory. They are kept in a single variable so
# that the per-file property calls below cannot drift out of sync with the
# list of files that is actually built.
set(${MODULE_NAME}_LOCAL_SOURCES
    FunctionList.c
    Sleep.c
    binary.c
    binaryOperator.c
    Utility.c
    binary_i.c
    binary_two_results_i.c
    i_unary.c
    macro_binary.c
    macro_unary.c
    mad.c
    main.c
    reference_math.c
    ternary.c
    unary.c
    unary_two_results.c
    unary_two_results_i.c
    unary_u.c
)

# Full source list: local files followed by the shared harness files.
set(${MODULE_NAME}_SOURCES
    ${${MODULE_NAME}_LOCAL_SOURCES}
    ../../test_common/harness/rounding_mode.c
    ../../test_common/harness/ThreadPool.c
    ../../test_common/harness/mt19937.c
    ../../test_common/harness/msvc9.c
)

# Local sources are built with -msse2 everywhere except Android.
# The documented PROPERTIES form is used instead of the legacy bare
# COMPILE_FLAGS form, matching the LANGUAGE call below.
if(NOT ANDROID)
    set_source_files_properties(
        ${${MODULE_NAME}_LOCAL_SOURCES}
        PROPERTIES COMPILE_FLAGS -msse2)
endif(NOT ANDROID)

# Compile the .c files listed here as C++.
# NOTE(review): mt19937.c is not in this list, so it keeps its default
# language -- confirm that this is intentional.
set_source_files_properties(
    ${${MODULE_NAME}_LOCAL_SOURCES}
    ../../test_common/harness/rounding_mode.c
    ../../test_common/harness/ThreadPool.c
    ../../test_common/harness/msvc9.c
    PROPERTIES LANGUAGE CXX)

# NOTE(review): this call names no source files, so -O0 is applied to nothing.
# It is kept as a no-op to preserve the existing build; if some file (for
# example reference_math.c) was meant to be built unoptimized under GCC, list
# it here -- and note that setting COMPILE_FLAGS again on a file replaces the
# -msse2 value set above.
if(CMAKE_COMPILER_IS_GNUCC)
    set_source_files_properties(
        PROPERTIES COMPILE_FLAGS -O0)
endif(CMAKE_COMPILER_IS_GNUCC)

include(../CMakeCommon.txt)
|
||||
180
test_conformance/math_brute_force/FunctionList.c
Normal file
180
test_conformance/math_brute_force/FunctionList.c
Normal file
@@ -0,0 +1,180 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#include "FunctionList.h"
|
||||
#include "reference_math.h"
|
||||
|
||||
// Values passed as the _rmode argument of the row macros below; they land in Func::ftz.
#define FTZ_ON          1
#define FTZ_OFF         0
#define EXACT           0.0f
// Values stored in Func::relaxed by the row macros below.
#define RELAXED_ON      1
#define RELAXED_OFF     0

#define STRINGIFY( _s )     #_s

//
// Row builders for functionList[]. Each expands to a positional initializer
// for struct Func, so the values appear in member order:
//   name, nameInCode, func, dfunc, rfunc,
//   float_ulps, double_ulps, float_embedded_ulps, relaxed_error,
//   ftz, relaxed, vtbl
//

// Ordinary function: rfunc is the same reference_<name> used for func, the
// relaxed error is INFINITY and the row is marked RELAXED_OFF.
#define ENTRY( _name, _ulp, _embedded_ulp, _rmode, _type )          \
    {                                                               \
        STRINGIFY(_name),                                           \
        STRINGIFY(_name),                                           \
        {(void*)reference_##_name},                                 \
        {(void*)reference_##_name##l},                              \
        {(void*)reference_##_name},                                 \
        _ulp,                                                       \
        _ulp,                                                       \
        _embedded_ulp,                                              \
        INFINITY,                                                   \
        _rmode,                                                     \
        RELAXED_OFF,                                                \
        _type                                                       \
    }

// Function with a relaxed variant: rfunc is reference_relaxed_<name>, the
// relaxed error is given explicitly and the row is marked RELAXED_ON.
#define ENTRY_EXT( _name, _ulp, _embedded_ulp, _relaxed_ulp, _rmode, _type )   \
    {                                                               \
        STRINGIFY(_name),                                           \
        STRINGIFY(_name),                                           \
        {(void*)reference_##_name},                                 \
        {(void*)reference_##_name##l},                              \
        {(void*)reference_##relaxed_##_name},                       \
        _ulp,                                                       \
        _ulp,                                                       \
        _embedded_ulp,                                              \
        _relaxed_ulp,                                               \
        _rmode,                                                     \
        RELAXED_ON,                                                 \
        _type                                                       \
    }

// half_<name> function: both names get the "half_" prefix; dfunc and rfunc are NULL.
#define HALF_ENTRY( _name, _ulp, _embedded_ulp, _rmode, _type )     \
    {                                                               \
        "half_" STRINGIFY(_name),                                   \
        "half_" STRINGIFY(_name),                                   \
        {(void*)reference_##_name},                                 \
        {NULL},                                                     \
        {NULL},                                                     \
        _ulp,                                                       \
        _ulp,                                                       \
        _embedded_ulp,                                              \
        INFINITY,                                                   \
        _rmode,                                                     \
        RELAXED_OFF,                                                \
        _type                                                       \
    }

// Operator: nameInCode is the operator string rather than the name; rfunc is NULL.
#define OPERATOR_ENTRY( _name, _operator, _ulp, _embedded_ulp, _rmode, _type )  \
    {                                                               \
        STRINGIFY(_name),                                           \
        _operator,                                                  \
        {(void*)reference_##_name},                                 \
        {(void*)reference_##_name##l},                              \
        {NULL},                                                     \
        _ulp,                                                       \
        _ulp,                                                       \
        _embedded_ulp,                                              \
        INFINITY,                                                   \
        _rmode,                                                     \
        RELAXED_OFF,                                                \
        _type                                                       \
    }
|
||||
|
||||
#if defined( __cplusplus )
|
||||
extern "C" {
|
||||
#endif
|
||||
extern const vtbl _unary; // float foo( float )
|
||||
extern const vtbl _unary_u; // float foo( uint ), double foo( ulong )
|
||||
extern const vtbl _i_unary; // int foo( float )
|
||||
extern const vtbl _macro_unary; // int foo( float ), returns {0,1} for scalar, { 0, -1 } for vector
|
||||
extern const vtbl _binary; // float foo( float, float )
|
||||
extern const vtbl _binary_nextafter; // float foo( float, float ), special handling for nextafter
|
||||
extern const vtbl _binary_operator; // float .op. float
|
||||
extern const vtbl _macro_binary; // int foo( float, float ), returns {0,1} for scalar, { 0, -1 } for vector
|
||||
extern const vtbl _binary_i; // float foo( float, int )
|
||||
extern const vtbl _ternary; // float foo( float, float, float )
|
||||
extern const vtbl _unary_two_results; // float foo( float, float * )
|
||||
extern const vtbl _unary_two_results_i; // float foo( float, int * )
|
||||
extern const vtbl _binary_two_results_i; // float foo( float, float, int * )
|
||||
extern const vtbl _mad_tbl; // float mad( float, float, float )
|
||||
#if defined( __cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#define unaryF &_unary
|
||||
#define i_unaryF &_i_unary
|
||||
#define unaryF_u &_unary_u
|
||||
#define macro_unaryF &_macro_unary
|
||||
#define binaryF &_binary
|
||||
#define binaryF_nextafter &_binary_nextafter
|
||||
#define binaryOperatorF &_binary_operator
|
||||
#define binaryF_i &_binary_i
|
||||
#define macro_binaryF &_macro_binary
|
||||
#define ternaryF &_ternary
|
||||
#define unaryF_two_results &_unary_two_results
|
||||
#define unaryF_two_results_i &_unary_two_results_i
|
||||
#define binaryF_two_results_i &_binary_two_results_i
|
||||
#define mad_function &_mad_tbl
|
||||
|
||||
|
||||
const Func functionList[] = {
|
||||
ENTRY( acos, 4.0f, 4.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( acosh, 4.0f, 4.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( acospi, 5.0f, 5.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( asin, 4.0f, 4.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( asinh, 4.0f, 4.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( asinpi, 5.0f, 5.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( atan, 5.0f, 5.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( atanh, 5.0f, 5.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( atanpi, 5.0f, 5.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( atan2, 6.0f, 6.0f, FTZ_OFF, binaryF),
|
||||
ENTRY( atan2pi, 6.0f, 6.0f, FTZ_OFF, binaryF),
|
||||
ENTRY( cbrt, 2.0f, 4.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( ceil, 0.0f, 0.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( copysign, 0.0f, 0.0f, FTZ_OFF, binaryF),
|
||||
ENTRY_EXT( cos, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF), //relaxed ulp 2^-11
|
||||
ENTRY( cosh, 4.0f, 4.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( cospi, 4.0f, 4.0f, FTZ_OFF, unaryF),
|
||||
// ENTRY( erfc, 16.0f, 16.0f, FTZ_OFF, unaryF), //disabled for 1.0 due to lack of reference implementation
|
||||
// ENTRY( erf, 16.0f, 16.0f, FTZ_OFF, unaryF), //disabled for 1.0 due to lack of reference implementation
|
||||
ENTRY_EXT( exp, 3.0f, 4.0f, 3.0f, FTZ_OFF, unaryF), //relaxed error is actually overwritten in unary.c as it is 3+floor(fabs(2*x))
|
||||
ENTRY_EXT( exp2, 3.0f, 4.0f, 3.0f, FTZ_OFF, unaryF), //relaxed error is actually overwritten in unary.c as it is 3+floor(fabs(2*x))
|
||||
ENTRY_EXT( exp10, 3.0f, 4.0f, 8192.0f, FTZ_OFF, unaryF), //relaxed error is actually overwritten in unary.c as it is 3+floor(fabs(2*x)) in derived mode,
|
||||
// in non-derived mode it uses the ulp error for half_exp10.
|
||||
ENTRY( expm1, 3.0f, 4.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( fabs, 0.0f, 0.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( fdim, 0.0f, 0.0f, FTZ_OFF, binaryF),
|
||||
ENTRY( floor, 0.0f, 0.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( fma, 0.0f, 0.0f, FTZ_OFF, ternaryF),
|
||||
ENTRY( fmax, 0.0f, 0.0f, FTZ_OFF, binaryF),
|
||||
ENTRY( fmin, 0.0f, 0.0f, FTZ_OFF, binaryF),
|
||||
ENTRY( fmod, 0.0f, 0.0f, FTZ_OFF, binaryF ),
|
||||
ENTRY( fract, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results),
|
||||
ENTRY( frexp, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results_i),
|
||||
ENTRY( hypot, 4.0f, 4.0f, FTZ_OFF, binaryF),
|
||||
ENTRY( ilogb, 0.0f, 0.0f, FTZ_OFF, i_unaryF),
|
||||
ENTRY( isequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
|
||||
ENTRY( isfinite, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
|
||||
ENTRY( isgreater, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
|
||||
ENTRY( isgreaterequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
|
||||
ENTRY( isinf, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
|
||||
ENTRY( isless, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
|
||||
ENTRY( islessequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
|
||||
ENTRY( islessgreater, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
|
||||
ENTRY( isnan, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
|
||||
ENTRY( isnormal, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
|
||||
ENTRY( isnotequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
|
||||
ENTRY( isordered, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
|
||||
ENTRY( isunordered, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
|
||||
ENTRY( ldexp, 0.0f, 0.0f, FTZ_OFF, binaryF_i),
|
||||
ENTRY( lgamma, INFINITY, INFINITY, FTZ_OFF, unaryF),
|
||||
ENTRY( lgamma_r, INFINITY, INFINITY, FTZ_OFF, unaryF_two_results_i),
|
||||
ENTRY_EXT( log, 3.0f, 4.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF), //relaxed ulp 2^-21
|
||||
ENTRY_EXT( log2, 3.0f, 4.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF), //relaxed ulp 2^-21
|
||||
ENTRY( log10, 3.0f, 4.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( log1p, 2.0f, 4.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( logb, 0.0f, 0.0f, FTZ_OFF, unaryF),
|
||||
ENTRY_EXT( mad, INFINITY, INFINITY, INFINITY, FTZ_OFF, mad_function), //in fast-relaxed-math mode it has to be either exactly rounded fma or exactly rounded a*b+c
|
||||
ENTRY( maxmag, 0.0f, 0.0f, FTZ_OFF, binaryF ),
|
||||
ENTRY( minmag, 0.0f, 0.0f, FTZ_OFF, binaryF ),
|
||||
ENTRY( modf, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results ),
|
||||
ENTRY( nan, 0.0f, 0.0f, FTZ_OFF, unaryF_u),
|
||||
ENTRY( nextafter, 0.0f, 0.0f, FTZ_OFF, binaryF_nextafter),
|
||||
ENTRY_EXT( pow, 16.0f, 16.0f, 8192.0f, FTZ_OFF, binaryF), //in derived mode the ulp error is calculated as exp2(y*log2(x)) and in non-derived it is the same as half_pow
|
||||
ENTRY( pown, 16.0f, 16.0f, FTZ_OFF, binaryF_i),
|
||||
ENTRY( powr, 16.0f, 16.0f, FTZ_OFF, binaryF),
|
||||
// ENTRY( reciprocal, 1.0f, 1.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( remainder, 0.0f, 0.0f, FTZ_OFF, binaryF),
|
||||
ENTRY( remquo, 0.0f, 0.0f, FTZ_OFF, binaryF_two_results_i),
|
||||
ENTRY( rint, 0.0f, 0.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( rootn, 16.0f, 16.0f, FTZ_OFF, binaryF_i),
|
||||
ENTRY( round, 0.0f, 0.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( rsqrt, 2.0f, 4.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( signbit, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
|
||||
ENTRY_EXT( sin, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF), //relaxed ulp 2^-11
|
||||
ENTRY_EXT( sincos, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF_two_results), //relaxed ulp 2^-11
|
||||
ENTRY( sinh, 4.0f, 4.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( sinpi, 4.0f, 4.0f, FTZ_OFF, unaryF),
|
||||
{ "sqrt", "sqrt", {(void*)reference_sqrt}, {(void*)reference_sqrtl}, {NULL}, 3.0f, 0.0f, 4.0f, INFINITY, FTZ_OFF, RELAXED_OFF, unaryF },
|
||||
{ "sqrt_cr", "sqrt", {(void*)reference_sqrt}, {(void*)reference_sqrtl}, {NULL}, 0.0f, 0.0f, 0.0f, INFINITY, FTZ_OFF, RELAXED_OFF, unaryF },
|
||||
ENTRY_EXT( tan, 5.0f, 5.0f, 8192.0f, FTZ_OFF, unaryF), //in derived mode it the ulp error is calculated as sin/cos and in non-derived mode it is the same as half_tan.
|
||||
ENTRY( tanh, 5.0f, 5.0f, FTZ_OFF, unaryF),
|
||||
ENTRY( tanpi, 6.0f, 6.0f, FTZ_OFF, unaryF),
|
||||
// ENTRY( tgamma, 16.0f, 16.0f, FTZ_OFF, unaryF), // Commented this out until we can be sure this requirement is realistic
|
||||
ENTRY( trunc, 0.0f, 0.0f, FTZ_OFF, unaryF),
|
||||
|
||||
HALF_ENTRY( cos, 8192.0f, 8192.0f, FTZ_ON, unaryF),
|
||||
HALF_ENTRY( divide, 8192.0f, 8192.0f, FTZ_ON, binaryF),
|
||||
HALF_ENTRY( exp, 8192.0f, 8192.0f, FTZ_ON, unaryF),
|
||||
HALF_ENTRY( exp2, 8192.0f, 8192.0f, FTZ_ON, unaryF),
|
||||
HALF_ENTRY( exp10, 8192.0f, 8192.0f, FTZ_ON, unaryF),
|
||||
HALF_ENTRY( log, 8192.0f, 8192.0f, FTZ_ON, unaryF),
|
||||
HALF_ENTRY( log2, 8192.0f, 8192.0f, FTZ_ON, unaryF),
|
||||
HALF_ENTRY( log10, 8192.0f, 8192.0f, FTZ_ON, unaryF),
|
||||
HALF_ENTRY( powr, 8192.0f, 8192.0f, FTZ_ON, binaryF),
|
||||
HALF_ENTRY( recip, 8192.0f, 8192.0f, FTZ_ON, unaryF),
|
||||
HALF_ENTRY( rsqrt, 8192.0f, 8192.0f, FTZ_ON, unaryF),
|
||||
HALF_ENTRY( sin, 8192.0f, 8192.0f, FTZ_ON, unaryF),
|
||||
HALF_ENTRY( sqrt, 8192.0f, 8192.0f, FTZ_ON, unaryF),
|
||||
HALF_ENTRY( tan, 8192.0f, 8192.0f, FTZ_ON, unaryF),
|
||||
|
||||
// basic operations
|
||||
OPERATOR_ENTRY( add, "+", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
|
||||
OPERATOR_ENTRY( subtract, "-", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
|
||||
{ "divide", "/", {(void*)reference_divide}, {(void*)reference_dividel}, {(void*)reference_relaxed_divide}, 2.5f, 0.0f, 3.0f, 2.5f, FTZ_OFF, RELAXED_ON, binaryOperatorF },
|
||||
{ "divide_cr", "/", {(void*)reference_divide}, {(void*)reference_dividel}, {(void*)reference_relaxed_divide}, 0.0f, 0.0f, 0.0f, 0.f, FTZ_OFF, RELAXED_OFF, binaryOperatorF },
|
||||
OPERATOR_ENTRY( multiply, "*", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
|
||||
OPERATOR_ENTRY( assignment, "", 0.0f, 0.0f, FTZ_OFF, unaryF), // A simple copy operation
|
||||
OPERATOR_ENTRY( not, "!", 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
|
||||
};
|
||||
|
||||
const size_t functionListCount = sizeof( functionList ) / sizeof( functionList[0] );
|
||||
99
test_conformance/math_brute_force/FunctionList.h
Normal file
99
test_conformance/math_brute_force/FunctionList.h
Normal file
@@ -0,0 +1,99 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#ifndef FUNCTIONLIST_H
|
||||
#define FUNCTIONLIST_H
|
||||
|
||||
#include "../../test_common/harness/compat.h"
|
||||
|
||||
#ifndef WIN32
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined( __APPLE__ )
|
||||
#include <OpenCL/opencl.h>
|
||||
#else
|
||||
#include <CL/cl.h>
|
||||
#endif
|
||||
|
||||
#include "../../test_common/harness/mt19937.h"
|
||||
|
||||
// Slot holding one reference function, viewed through whichever signature the
// caller needs. struct Func uses this type for its func and rfunc members.
// Member names encode the signature: <return>_<arguments>, where f is a
// floating-point value, i an int, u an unsigned integer, pf a pointer to a
// floating-point value and pI a pointer to int.
typedef union fptr
{
    void    *p;                                 // untyped view; must stay first so a brace initializer sets it
    double  (*f_f)( double );
    double  (*f_u)( cl_uint );
    int     (*i_f)( double );
    int     (*i_f_f)( float );
    float   (*f_ff_f)( float, float );
    double  (*f_ff)( double, double );
    int     (*i_ff)( double, double );
    double  (*f_fi)( double, int );
    double  (*f_fpf)( double, double * );
    double  (*f_fpI)( double, int * );
    double  (*f_ffpI)( double, double, int * );
    double  (*f_fff)( double, double, double );
    float   (*f_fma)( float, float, float, int );
}fptr;
|
||||
|
||||
// Long double counterpart of fptr: one reference function viewed through
// several long double signatures. struct Func uses this type for dfunc.
typedef union dptr
{
    void        *p;                             // untyped view; must stay first so a brace initializer sets it
    long double (*f_f)( long double );
    long double (*f_u)( cl_ulong );
    int         (*i_f)( long double );
    long double (*f_ff)( long double, long double );
    int         (*i_ff)( long double, long double );
    long double (*f_fi)( long double, int );
    long double (*f_fpf)( long double, long double * );
    long double (*f_fpI)( long double, int * );
    long double (*f_ffpI)( long double, long double, int * );
    long double (*f_fff)( long double, long double, long double );
}dptr;
|
||||
|
||||
struct Func;
|
||||
|
||||
typedef struct vtbl
|
||||
{
|
||||
const char *type_name;
|
||||
int (*TestFunc)( const struct Func *, MTdata );
|
||||
int (*DoubleTestFunc)( const struct Func *, MTdata); // may be NULL if function is single precision only
|
||||
}vtbl;
|
||||
|
||||
typedef struct Func
|
||||
{
|
||||
const char *name; // common name, to be used as an argument in the shell
|
||||
const char *nameInCode; // name as it appears in the __kernel, usually the same as name, but different for multiplication
|
||||
fptr func;
|
||||
dptr dfunc;
|
||||
fptr rfunc;
|
||||
float float_ulps;
|
||||
float double_ulps;
|
||||
float float_embedded_ulps;
|
||||
float relaxed_error;
|
||||
int ftz;
|
||||
int relaxed;
|
||||
const ::vtbl *vtbl;
|
||||
}Func;
|
||||
|
||||
|
||||
extern const Func functionList[];
|
||||
|
||||
extern const size_t functionListCount;
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
36
test_conformance/math_brute_force/Jamfile
Normal file
36
test_conformance/math_brute_force/Jamfile
Normal file
@@ -0,0 +1,36 @@
|
||||
project
|
||||
: requirements
|
||||
-<library>/harness//harness <use>/harness//harness
|
||||
<library>/Runtime//OpenCL.lib
|
||||
# <toolset>gcc:<cflags>-xc++
|
||||
<toolset>msvc:<cflags>"/TP"
|
||||
;
|
||||
|
||||
exe bruteforce
|
||||
: binary.c
|
||||
binary_i.c
|
||||
binaryOperator.c
|
||||
binary_two_results_i.c
|
||||
FunctionList.c
|
||||
i_unary.c
|
||||
macro_binary.c
|
||||
macro_unary.c
|
||||
mad.c
|
||||
main.c
|
||||
reference_math.c
|
||||
Sleep.c
|
||||
ternary.c
|
||||
unary.c
|
||||
unary_two_results.c
|
||||
unary_two_results_i.c
|
||||
unary_u.c
|
||||
Utility.c
|
||||
/harness//mt19937.c
|
||||
: <target-os>windows:<source>/harness//msvc9.c
|
||||
;
|
||||
|
||||
install dist
|
||||
: bruteforce
|
||||
: <variant>debug:<location>$(DIST)/debug/tests/test_conformance/math_brute_force
|
||||
<variant>release:<location>$(DIST)/release/tests/test_conformance/math_brute_force
|
||||
;
|
||||
33
test_conformance/math_brute_force/Makefile
Normal file
33
test_conformance/math_brute_force/Makefile
Normal file
@@ -0,0 +1,33 @@
|
||||
ifdef BUILD_WITH_ATF
ATF = -framework ATF
USE_ATF = -DUSE_ATF
endif

CC = cc
CFLAGS = -g -Wall -Wshorten-64-to-32 $(COMPILERFLAGS) ${RC_CFLAGS} ${USE_ATF}
LIBRARIES = -framework OpenCL -framework ApplicationServices -framework IOKit -I/System/Library/Frameworks/OpenCL.framework/Headers ${RC_CFLAGS} ${ATF}

# Shared harness files compiled into both the release and debug binaries.
HARNESS_SOURCES = ../../test_common/harness/mt19937.c ../../test_common/harness/rounding_mode.c ../../test_common/harness/ThreadPool.c

# None of these targets produce a file with the target's name. Declaring them
# phony keeps a stray file called e.g. "test" or "clean" from disabling them.
.PHONY: release debug test test64 clean all

release:
	echo "Build Release"
	$(CC) *.c $(HARNESS_SOURCES) -Os $(CFLAGS) -o bruteforce $(LIBRARIES)

debug:
	echo "Build Debug"
	$(CC) *.c $(HARNESS_SOURCES) -O0 $(CFLAGS) -D_DEBUG=1 -o bruteforce_debug $(LIBRARIES)

# The test targets start both runs in the background and return immediately;
# results are written to the log files named in the message.
test: release
	arch -i386 ./bruteforce -c > cpu.log &
	arch -i386 ./bruteforce -g > gpu.log &
	echo "Testing 32-bit mode in progress. This may take up to 1 day to complete. See cpu.log and gpu.log for results."

test64: release
	arch -x86_64 ./bruteforce -c > cpu64.log &
	arch -x86_64 ./bruteforce -g > gpu64.log &
	echo "Testing 64-bit mode in progress. This may take up to 1 day to complete. See cpu64.log and gpu64.log for results."

clean:
	rm -f ./bruteforce_debug
	rm -f ./bruteforce

all: release
|
||||
150
test_conformance/math_brute_force/README.txt
Normal file
150
test_conformance/math_brute_force/README.txt
Normal file
@@ -0,0 +1,150 @@
|
||||
Copyright: (c) 2009-2013 by Apple Inc. All Rights Reserved.
|
||||
|
||||
math_brute_force test Feb 24, 2009
|
||||
=====================
|
||||
|
||||
Usage:
|
||||
|
||||
Please run the executable with --help for usage information.
|
||||
|
||||
|
||||
|
||||
System Requirements:
|
||||
|
||||
This test requires support for correctly rounded single and double precision arithmetic.
|
||||
The current version also requires a reasonably accurate operating system math library to
|
||||
be present. The OpenCL implementation must be able to compile kernels online. The test assumes
|
||||
that the host system stores its floating point data according to the IEEE-754 binary single and
|
||||
double precision floating point formats.
|
||||
|
||||
|
||||
Test Completion Time:
|
||||
|
||||
This test takes a while. Modern desktop systems can usually finish it in 1-3
|
||||
days. Engineers doing OpenCL math library software development may find wimpy mode (-w)
|
||||
a useful screen to quickly look for problems in a new implementation, before committing
|
||||
to a lengthy test run. Likewise, it is possible to run just a range of tests, or specific
|
||||
tests. See Usage above.
|
||||
|
||||
|
||||
Test Design:
|
||||
|
||||
This test is designed to do a somewhat exhaustive examination of the single
|
||||
and double precision math library functions in OpenCL, for all vector lengths. Math
|
||||
library functions are compared against results from a higher precision reference
|
||||
function to determine correctness. All possible inputs are examined for unary
|
||||
single precision functions. Other functions are tested against a table of difficult
|
||||
values, followed by a few billion random values. If an error is found in a function,
|
||||
the test for that function terminates early, reports an error, and moves on to the
|
||||
next test, if any.
|
||||
|
||||
The test currently doesn't support half precision math functions covered in section
|
||||
9 of the OpenCL 1.0 specification, but does cover the half_func functions covered in
|
||||
section six. It also doesn't test the native_<funcname> functions, for which any result
|
||||
is conformant.
|
||||
|
||||
For the OpenCL 1.0 time frame, the reference library shall be the operating system
|
||||
math library, as modified by the test itself to conform to the OpenCL specification.
|
||||
That will help ensure that all devices on a particular operating system are returning
|
||||
similar results. Going forward to future OpenCL releases, it is planned to gradually
|
||||
introduce a reference math library directly into the test, so as to reduce inter-
|
||||
platform variance between OpenCL implementations.
|
||||
|
||||
Generally speaking, this test will consider a result correct if it is one of the following:
|
||||
|
||||
1) bitwise identical to the output of the reference function,
|
||||
rounded to the appropriate precision
|
||||
|
||||
2) within the allowed ulp error tolerance of the infinitely precise
|
||||
result (as estimated by the reference function)
|
||||
|
||||
3) If the reference result is a NaN, then any NaN is deemed correct.
|
||||
|
||||
4) if the device is running in FTZ mode, then the result is also correct
|
||||
if the infinitely precise result (as estimated by the reference
|
||||
function) is subnormal, and the returned result is a zero
|
||||
|
||||
5) if the device is running in FTZ mode, then we also calculate the
|
||||
estimate of the infinitely precise result with the reference function
|
||||
with subnormal inputs flushed to +- zero. If any of those results
|
||||
are within the error tolerance of the returned result, then it is
|
||||
deemed correct
|
||||
|
||||
6) half_func functions may flush per 4&5 above, even if the device is not
|
||||
in FTZ mode.
|
||||
|
||||
7) Functions are allowed to prematurely overflow to infinity, so long as
|
||||
the estimated infinitely precise result is within the stated ulp
|
||||
error limit of the maximum finite representable value of appropriate
|
||||
sign
|
||||
|
||||
8) Functions are allowed to prematurely underflow (and if in FTZ mode,
|
||||
have behavior covered by 4&5 above), so long as the estimated
|
||||
infinitely precise result is within the stated ulp error limit
|
||||
of the minimum normal representable value of appropriate sign
|
||||
|
||||
9) Some functions have limited range. Results of inputs outside that range
|
||||
are considered correct, so long as a result is returned.
|
||||
|
||||
10) Some functions have infinite error bounds. Results of these functions
|
||||
are considered correct, so long as a result is returned.
|
||||
|
||||
11) The test currently does not discriminate based on the sign of zero.
|
||||
We anticipate a later test will.
|
||||
|
||||
12) The test currently does not check to make sure that edge cases called
|
||||
out in the standard (e.g. pow(1.0, any) = 1.0) are exactly correct.
|
||||
We anticipate a later test will.
|
||||
|
||||
13) The test doesn't check IEEE flags or exceptions. See section 7.3 of the
|
||||
OpenCL standard.
|
||||
|
||||
|
||||
|
||||
Performance Measurement:
|
||||
|
||||
There is also some optional timing code available, currently turned off by default.
|
||||
It may be useful for tracking internal performance regressions, but is not required to
|
||||
be part of the conformance submission.
|
||||
|
||||
|
||||
If the test is believed to be in error:
|
||||
|
||||
The above correctness heuristics shall not be construed to be an alternative to the correctness
|
||||
criteria established by the OpenCL standard. An implementation shall be judged correct
|
||||
or not on appeal based on whether it is within prescribed error bounds of the infinitely
|
||||
precise result. (The ulp is defined in section 7.4 of the spec.) If the input value corresponds
|
||||
to an edge case listed in OpenCL specification sections covering edge case behavior, or
|
||||
similar sections in the C99 TC2 standard (sections F.9 and G.6), then the function shall return
|
||||
exactly that result, and the sign of a zero result shall be correct. In the event that the test
|
||||
is found to be faulty, resulting in a spurious failure result, the committee shall make a reasonable
|
||||
attempt to fix the test. If no practical and timely remedy can be found, then the implementation
|
||||
shall be granted a waiver.
|
||||
|
||||
|
||||
Guidelines for reference function error tolerances:
|
||||
|
||||
Errors are measured in ulps, and stored in a single precision representation. So as
|
||||
to avoid introducing error into the error measurement due to error in the reference function
|
||||
itself, the reference function should attempt to deliver 24 bits more precision than the test
|
||||
function return type. (All functions are currently either required to be correctly rounded or
|
||||
may have >= 1 ulp of error. This places the 1's bit at the LSB of the result, with 23 bits of
|
||||
sub-ulp accuracy. One more bit is required to avoid accrual of extra error due to round-to-
|
||||
nearest behavior. If we start to require sub-ulp precision, then the accuracy requirements
|
||||
for reference functions increase.) Therefore reference functions for single precision should
|
||||
have 24+24=48 bits of accuracy, and reference functions for double precision should ideally
|
||||
have 53+24 = 77 bits of accuracy.
|
||||
|
||||
A double precision system math library function should be sufficient to safely verify a single
|
||||
precision OpenCL math library function. A long double precision math library function may or
|
||||
may not be sufficient to verify a double precision OpenCL math library function, depending on
|
||||
the precision of the long double type. A later version of these tests is expected to replace
|
||||
long double with a head+tail double double representation that can represent sufficient precision,
|
||||
on all platforms that support double.
|
||||
|
||||
|
||||
Revision history:
|
||||
|
||||
Feb 24, 2009 IRO Created README
|
||||
Added some reference functions so the test will run on Windows.
|
||||
|
||||
118
test_conformance/math_brute_force/Sleep.c
Normal file
118
test_conformance/math_brute_force/Sleep.c
Normal file
@@ -0,0 +1,118 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#include "Sleep.h"
|
||||
#include "Utility.h"
|
||||
|
||||
#if defined( __APPLE__ )
|
||||
#include <IOKit/pwr_mgt/IOPMLib.h>
|
||||
#include <IOKit/IOMessage.h>
|
||||
|
||||
struct
|
||||
{
|
||||
io_connect_t connection;
|
||||
IONotificationPortRef port;
|
||||
io_object_t iterator;
|
||||
}sleepInfo;
|
||||
|
||||
void sleepCallback( void * refcon,
|
||||
io_service_t service,
|
||||
natural_t messageType,
|
||||
void * messageArgument );
|
||||
|
||||
void sleepCallback( void * refcon UNUSED,
|
||||
io_service_t service UNUSED,
|
||||
natural_t messageType,
|
||||
void * messageArgument )
|
||||
{
|
||||
|
||||
IOReturn result;
|
||||
/*
|
||||
service -- The IOService whose state has changed.
|
||||
messageType -- A messageType enum, defined by IOKit/IOMessage.h or by the IOService's family.
|
||||
messageArgument -- An argument for the message, dependent on the messageType.
|
||||
*/
|
||||
switch ( messageType )
|
||||
{
|
||||
case kIOMessageSystemWillSleep:
|
||||
// Handle demand sleep (such as sleep caused by running out of
|
||||
// batteries, closing the lid of a laptop, or selecting
|
||||
// sleep from the Apple menu.
|
||||
IOAllowPowerChange(sleepInfo.connection,(long)messageArgument);
|
||||
vlog( "Hard sleep occurred.\n" );
|
||||
break;
|
||||
case kIOMessageCanSystemSleep:
|
||||
// In this case, the computer has been idle for several minutes
|
||||
// and will sleep soon so you must either allow or cancel
|
||||
// this notification. Important: if you don’t respond, there will
|
||||
// be a 30-second timeout before the computer sleeps.
|
||||
// IOCancelPowerChange(root_port,(long)messageArgument);
|
||||
result = IOCancelPowerChange(sleepInfo.connection,(long)messageArgument);
|
||||
if( kIOReturnSuccess != result )
|
||||
vlog( "sleep prevention failed. (%d)\n", result);
|
||||
break;
|
||||
case kIOMessageSystemHasPoweredOn:
|
||||
// Handle wakeup.
|
||||
break;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// Ask the system not to idle-sleep while the test runs.
//
// On Apple platforms this registers sleepCallback() for system power
// notifications and attaches the notification port to the current run loop.
// On other platforms it only logs that the feature is not implemented.
// Progress and failure are reported through vlog(); nothing is returned.
void PreventSleep( void )
{
#if defined( __APPLE__ )
    vlog( "Disabling sleep... " );
    sleepInfo.iterator = (io_object_t) 0;
    sleepInfo.port = NULL;
    sleepInfo.connection = IORegisterForSystemPower
                            (
                                &sleepInfo,             //void * refcon,
                                &sleepInfo.port,        //IONotificationPortRef * thePortRef,
                                sleepCallback,          //IOServiceInterestCallback callback,
                                &sleepInfo.iterator     //io_object_t * notifier
                            );

    if( (io_connect_t) 0 == sleepInfo.connection )
    {
        vlog( "failed.\n" );
        // Registration failed, so sleepInfo.port was reset to NULL above and
        // there is no valid notification port to attach to the run loop.
        return;
    }

    vlog( "done.\n" );

    CFRunLoopAddSource( CFRunLoopGetCurrent(),
                        IONotificationPortGetRunLoopSource( sleepInfo.port ),
                        kCFRunLoopDefaultMode );
#else
    vlog( "*** PreventSleep() is not implemented on this platform.\n" );
#endif
}
|
||||
|
||||
// Undo PreventSleep(): stop receiving system power notifications and release
// the IOKit resources that registration created.
//
// On non-Apple platforms it only logs that the feature is not implemented.
// The outcome is reported through vlog(); nothing is returned.
void ResumeSleep( void )
{
#if defined( __APPLE__ )
    IOReturn result;

    // Detach the notification source from the run loop before the port goes
    // away. The port is NULL when registration never succeeded.
    if( NULL != sleepInfo.port )
        CFRunLoopRemoveSource( CFRunLoopGetCurrent(),
                               IONotificationPortGetRunLoopSource( sleepInfo.port ),
                               kCFRunLoopDefaultMode );

    result = IODeregisterForSystemPower ( &sleepInfo.iterator );
    if( 0 != result )
        vlog( "Got error %d restoring sleep \n", result );
    else
        vlog( "Sleep restored.\n" );

    // IORegisterForSystemPower() opened a connection to the root power domain
    // and allocated the notification port; both are owned by the caller.
    if( (io_connect_t) 0 != sleepInfo.connection )
    {
        IOServiceClose( sleepInfo.connection );
        sleepInfo.connection = (io_connect_t) 0;
    }
    if( NULL != sleepInfo.port )
    {
        IONotificationPortDestroy( sleepInfo.port );
        sleepInfo.port = NULL;
    }
#else
    vlog( "*** ResumeSleep() is not implemented on this platform.\n" );
#endif
}
|
||||
|
||||
|
||||
|
||||
24
test_conformance/math_brute_force/Sleep.h
Normal file
24
test_conformance/math_brute_force/Sleep.h
Normal file
@@ -0,0 +1,24 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#ifndef SLEEP_H
|
||||
#define SLEEP_H
|
||||
|
||||
void PreventSleep( void );
|
||||
void ResumeSleep( void );
|
||||
|
||||
#endif /* SLEEP_H */
|
||||
|
||||
|
||||
169
test_conformance/math_brute_force/Utility.c
Normal file
169
test_conformance/math_brute_force/Utility.c
Normal file
@@ -0,0 +1,169 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#include "Utility.h"
|
||||
|
||||
#if defined(__PPC__)
|
||||
// Global variable used to hold the FPU control register state. The FPSCR register can not
|
||||
// be used because not all Power implementations retain or observe the NI (non-IEEE
|
||||
// mode) bit.
|
||||
__thread fpu_control_t fpu_control = 0;
|
||||
#endif
|
||||
|
||||
// Multiply two doubles and return the result as a (high, low) pair.
//   *rhi receives u*v as computed in double precision.
//   *rlo receives a correction term assembled from the partial products of the
//        split operands (intended to capture the rounding error of *rhi).
void MulD(double *rhi, double *rlo, double u, double v)
{
    const double splitter = 134217729.0;     // 1+2^27

    // Split u into a leading part and the remainder.
    const double uScaled = u * splitter;
    const double uHead = (u - uScaled) + uScaled;
    const double uTail = u - uHead;

    // Same split for v.
    const double vScaled = v * splitter;
    const double vHead = (v - vScaled) + vScaled;
    const double vTail = v - vHead;

    const double product = u * v;

    // The summation order matters; keep it exactly as written.
    *rhi = product;
    *rlo = (((uHead*vHead - product) + (uHead*vTail)) + (uTail*vHead)) + (uTail*vTail);
}
|
||||
|
||||
// Add two doubles and return the result as a (high, low) pair.
//   *rhi receives a + b as computed in double precision.
//   *rlo receives the part of the smaller-magnitude operand that did not make
//        it into *rhi, recovered by subtracting the larger operand back out.
void AddD(double *rhi, double *rlo, double a, double b)
{
    const double sum = a + b;
    double residual;

    if( fabs(a) > fabs(b) )
    {
        residual = sum - a;
        residual = b - residual;
    }
    else
    {
        residual = sum - b;
        residual = a - residual;
    }

    *rhi = sum;
    *rlo = residual;
}
|
||||
|
||||
// Multiply two (high, low) double pairs: (xh, xl) * (yh, yl).
// The product of the high parts is formed with the same operand split used by
// MulD, the cross terms xh*yl and xl*yh are folded into the low part, and the
// pair is renormalised before being stored in *rhi / *rlo.
// The xl*yl term is not included.
void MulDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl)
{
    double splitter = 134217729.0;
    double prodHi, prodLo;

    // Split xh.
    double xScaled = xh * splitter;
    double xHead = (xh - xScaled) + xScaled;
    double xTail = xh - xHead;

    // Split yh.
    double yScaled = yh * splitter;
    double yHead = (yh - yScaled) + yScaled;
    double yTail = yh - yHead;

    prodHi = xh * yh;
    prodLo = (((xHead*yHead - prodHi) + (xHead*yTail)) + (xTail*yHead)) + (xTail*yTail);
    prodLo += xh*yl + xl*yh;

    // Renormalise: the low word is computed from the stored high word.
    *rhi = prodHi + prodLo;
    *rlo = (prodHi - (*rhi)) + prodLo;
}
|
||||
|
||||
// Add two (high, low) double pairs: (xh, xl) + (yh, yl).
// The high parts are summed first. The remainder is then accumulated starting
// from whichever high part has the larger magnitude, and the pair is
// renormalised into *rhi / *rlo.
void AddDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl)
{
    const double head = xh + yh;
    double tail;

    if( fabs(xh) > fabs(yh) )
        tail = xh - head + yh + yl + xl;
    else
        tail = yh - head + xh + xl + yl;

    *rhi = head + tail;
    *rlo = (head - (*rhi)) + tail;
}
|
||||
|
||||
// Divide a by b and return the quotient as a (high, low) pair.
//   *chi receives a / b as computed in double precision.
//   *clo receives (a - *chi * b) / b, where the product is formed with MulD
//        and the subtraction with AddDD.
void DivideDD(double *chi, double *clo, double a, double b)
{
    double remHi, remLo;

    *chi = a / b;

    // remHi:remLo = (*chi) * b
    MulD(&remHi, &remLo, *chi, b);

    // remHi:remLo = a - (*chi) * b
    AddDD(&remHi, &remLo, -remHi, -remLo, a, 0.0);

    *clo = remHi / b;
}
|
||||
|
||||
// These functions compare two floats/doubles. Since some platforms may choose to
|
||||
// flush denormals to zeros before comparison, comparison like a < b may give wrong
|
||||
// result in "certain cases" where we do need correct comparison result when operands
|
||||
// are denormals .... these functions compare floats/doubles using signed integer/long int
|
||||
// rep. In other cases, when flushing to zeros is fine, these should not be used.
|
||||
// Also, these don't check for NaNs and assume NaNs are handled separately as a special edge case
|
||||
// by the caller which calls these functions
|
||||
// return 0 if both are equal, 1 if x > y and -1 if x < y.
|
||||
|
||||
inline
|
||||
int compareFloats(float x, float y)
|
||||
{
|
||||
int32f_t a, b;
|
||||
|
||||
a.f = x;
|
||||
b.f = y;
|
||||
|
||||
if( a.i & 0x80000000 )
|
||||
a.i = 0x80000000 - a.i;
|
||||
if( b.i & 0x80000000 )
|
||||
b.i = 0x80000000 - b.i;
|
||||
|
||||
if( a.i == b.i )
|
||||
return 0;
|
||||
|
||||
return a.i < b.i ? -1 : 1;
|
||||
}
|
||||
|
||||
inline
|
||||
int compareDoubles(double x, double y)
|
||||
{
|
||||
int64d_t a, b;
|
||||
|
||||
a.d = x;
|
||||
b.d = y;
|
||||
|
||||
if( a.l & 0x8000000000000000LL )
|
||||
a.l = 0x8000000000000000LL - a.l;
|
||||
if( b.l & 0x8000000000000000LL )
|
||||
b.l = 0x8000000000000000LL - b.l;
|
||||
|
||||
if( a.l == b.l )
|
||||
return 0;
|
||||
|
||||
return a.l < b.l ? -1 : 1;
|
||||
}
|
||||
|
||||
void logFunctionInfo(const char *fname, unsigned int float_size, unsigned int isFastRelaxed)
|
||||
{
|
||||
char const *fpSizeStr = NULL;
|
||||
char const *fpFastRelaxedStr = "";
|
||||
switch (float_size) {
|
||||
case sizeof(cl_double):
|
||||
fpSizeStr = "fp64";
|
||||
break;
|
||||
case sizeof(cl_float):
|
||||
fpSizeStr = "fp32";
|
||||
break;
|
||||
case sizeof(cl_half):
|
||||
fpSizeStr = "fp16";
|
||||
break;
|
||||
}
|
||||
if (isFastRelaxed) {
|
||||
fpFastRelaxedStr = "rlx";
|
||||
}
|
||||
vlog("%15s %4s %4s",fname, fpSizeStr, fpFastRelaxedStr);
|
||||
}
|
||||
|
||||
280
test_conformance/math_brute_force/Utility.h
Normal file
280
test_conformance/math_brute_force/Utility.h
Normal file
@@ -0,0 +1,280 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#ifndef UTILITY_H
|
||||
#define UTILITY_H
|
||||
|
||||
#include "../../test_common/harness/compat.h"
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <OpenCL/opencl.h>
|
||||
#else
|
||||
#include <CL/opencl.h>
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#include "../../test_common/harness/rounding_mode.h"
|
||||
#include "../../test_common/harness/fpcontrol.h"
|
||||
|
||||
#if defined( _WIN32) && defined (_MSC_VER)
|
||||
#include "../../test_common/harness/testHarness.h"
|
||||
#endif
|
||||
|
||||
#include "../../test_common/harness/ThreadPool.h"
|
||||
#define BUFFER_SIZE (1024*1024*2)
|
||||
|
||||
#if defined( __GNUC__ )
|
||||
#define UNUSED __attribute__ ((unused))
|
||||
#else
|
||||
#define UNUSED
|
||||
#endif
|
||||
|
||||
extern int gWimpyBufferSize;
|
||||
extern int gWimpyReductionFactor;
|
||||
|
||||
#define VECTOR_SIZE_COUNT 6
|
||||
extern const char *sizeNames[VECTOR_SIZE_COUNT];
|
||||
extern const int sizeValues[VECTOR_SIZE_COUNT];
|
||||
|
||||
extern cl_device_type gDeviceType;
|
||||
extern cl_device_id gDevice;
|
||||
extern cl_context gContext;
|
||||
extern cl_command_queue gQueue;
|
||||
extern void *gIn;
|
||||
extern void *gIn2;
|
||||
extern void *gIn3;
|
||||
extern void *gOut_Ref;
|
||||
extern void *gOut_Ref2;
|
||||
extern void *gOut[VECTOR_SIZE_COUNT];
|
||||
extern void *gOut2[VECTOR_SIZE_COUNT];
|
||||
extern cl_mem gInBuffer;
|
||||
extern cl_mem gInBuffer2;
|
||||
extern cl_mem gInBuffer3;
|
||||
extern cl_mem gOutBuffer[VECTOR_SIZE_COUNT];
|
||||
extern cl_mem gOutBuffer2[VECTOR_SIZE_COUNT];
|
||||
extern uint32_t gComputeDevices;
|
||||
extern uint32_t gSimdSize;
|
||||
extern int gSkipCorrectnessTesting;
|
||||
extern int gMeasureTimes;
|
||||
extern int gReportAverageTimes;
|
||||
extern int gForceFTZ;
|
||||
extern volatile int gTestFastRelaxed;
|
||||
extern int gFastRelaxedDerived;
|
||||
extern int gWimpyMode;
|
||||
extern int gHasDouble;
|
||||
extern int gIsInRTZMode;
|
||||
extern int gInfNanSupport;
|
||||
extern int gIsEmbedded;
|
||||
extern int gVerboseBruteForce;
|
||||
extern uint32_t gMaxVectorSizeIndex;
|
||||
extern uint32_t gMinVectorSizeIndex;
|
||||
extern uint32_t gDeviceFrequency;
|
||||
extern cl_device_fp_config gFloatCapabilities;
|
||||
extern cl_device_fp_config gDoubleCapabilities;
|
||||
|
||||
#define LOWER_IS_BETTER 0
|
||||
#define HIGHER_IS_BETTER 1
|
||||
|
||||
#if USE_ATF
|
||||
|
||||
#include <ATF/ATF.h>
|
||||
#define test_start() ATFTestStart()
|
||||
#define test_finish() ATFTestFinish()
|
||||
#define vlog( ... ) ATFLogInfo(__VA_ARGS__)
|
||||
#define vlog_error( ... ) ATFLogError(__VA_ARGS__)
|
||||
#define vlog_perf( _number, _higherIsBetter, _units, _nameFmt, ... ) ATFLogPerformanceNumber(_number, _higherIsBetter, _units, _nameFmt, __VA_ARGS__ )
|
||||
|
||||
#else
|
||||
#include "../../test_common/harness/errorHelpers.h"
|
||||
#endif
|
||||
|
||||
#if defined (_MSC_VER )
|
||||
//Deal with missing scalbn on windows
|
||||
#define scalbnf( _a, _i ) ldexpf( _a, _i )
|
||||
#define scalbn( _a, _i ) ldexp( _a, _i )
|
||||
#define scalbnl( _a, _i ) ldexpl( _a, _i )
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
float Abs_Error( float test, double reference );
|
||||
float Ulp_Error( float test, double reference );
|
||||
//float Ulp_Error_Half( float test, double reference );
|
||||
float Ulp_Error_Double( double test, long double reference );
|
||||
#ifdef __cplusplus
|
||||
} //extern "C"
|
||||
#endif
|
||||
|
||||
uint64_t GetTime( void );
|
||||
double SubtractTime( uint64_t endTime, uint64_t startTime );
|
||||
int MakeKernel( const char **c, cl_uint count, const char *name, cl_kernel *k, cl_program *p );
|
||||
int MakeKernels( const char **c, cl_uint count, const char *name, cl_uint kernel_count, cl_kernel *k, cl_program *p );
|
||||
|
||||
// used to convert a bucket of bits into a search pattern through double
|
||||
static inline double DoubleFromUInt32( uint32_t bits );

// Spread a 32-bit pattern over a 64-bit double:
//   - the top 20 bits of 'bits' become the top 20 bits of the double,
//   - the low 12 bits of 'bits' become the low 12 bits of the double,
//   - the region in between is filled with copies of bit 11 of 'bits'
//     (done by subtracting that bit, shifted up one place, which borrows
//     through the zeroed middle when it is set).
// e.g. 0x89abcdef -> 0x89abc00000000def before the fill step.
static inline double DoubleFromUInt32( uint32_t bits )
{
    union{ uint64_t u; double d;} pun;

    const uint64_t low12  = bits & 0xfffU;
    const uint64_t high20 = (uint64_t) (bits & ~0xfffU) << 32;

    pun.u = (high20 | low12) - ((bits & 0x800U) << 1);

    return pun.d;
}
|
||||
|
||||
void _LogBuildError( cl_program p, int line, const char *file );
|
||||
#define LogBuildError( program ) _LogBuildError( program, __LINE__, __FILE__ )
|
||||
|
||||
#define PERF_LOOP_COUNT 100
|
||||
|
||||
// Note: though this takes a double, this is for use with single precision tests
|
||||
// Returns non-zero when x, converted to float, is a non-zero subnormal.
static inline int IsFloatSubnormal( double x )
{
#if 2 == FLT_RADIX
    // Inspect the bit pattern rather than comparing floats, so the answer
    // does not change if the host flushes subnormals to zero.
    union{ float d; uint32_t u;} pun;
    pun.d = fabsf((float)x);

    // Subtracting one wraps zero round to 0xffffffff, which leaves exactly
    // the patterns 1 .. 0x007fffff below the limit.
    const uint32_t shifted = pun.u - 1;
    return shifted < 0x007fffffU;
#else
    // rely on floating point hardware for non-radix2 non-IEEE-754 hardware -- will fail if you flush subnormals to zero
    return x != 0.0 && fabs(x) < (double) FLT_MIN;
#endif
}
|
||||
|
||||
|
||||
// Returns non-zero when x, converted to double, is a non-zero subnormal.
static inline int IsDoubleSubnormal( long double x )
{
#if 2 == FLT_RADIX
    // Inspect the bit pattern rather than comparing doubles, so the answer
    // does not change if the host flushes subnormals to zero.
    union{ double d; uint64_t u;} pun;
    pun.d = fabs((double) x);

    // Subtracting one wraps zero round to all-ones, which leaves exactly the
    // patterns 1 .. 0x000fffffffffffff below the limit.
    const uint64_t shifted = pun.u - 1;
    return shifted < 0x000fffffffffffffULL;
#else
    // rely on floating point hardware for non-radix2 non-IEEE-754 hardware -- will fail if you flush subnormals to zero
    return x != 0.0 && fabs(x) < (double) DBL_MIN;
#endif
}
|
||||
|
||||
//The spec is fairly clear that we may enforce a hard cutoff to prevent premature flushing to zero.
|
||||
// However, to avoid conflict for 1.0, we are letting results at TYPE_MIN + ulp_limit to be flushed to zero.
|
||||
// Returns non-zero when |x|, reduced by 'ulps' times 2^-149, falls below
// 2^-126, i.e. when the result is close enough to the subnormal range to be
// treated as flushable.
static inline int IsFloatResultSubnormal( double x, float ulps )
{
    const double allowance = MAKE_HEX_DOUBLE( 0x1.0p-149, 0x1, -149) * (double) ulps;
    const double reduced = fabs(x) - allowance;

    return reduced < MAKE_HEX_DOUBLE( 0x1.0p-126, 0x1, -126 );
}
|
||||
|
||||
// Returns non-zero when x reduced by the absolute error bound abs_err falls
// below 2^-126.
// NOTE(review): unlike IsFloatResultSubnormal, x is used as-is (no fabs), so
// every negative x returns non-zero - confirm this is intended.
static inline int IsFloatResultSubnormalAbsError( double x , float abs_err)
{
    const double reduced = x - abs_err;

    return reduced < MAKE_HEX_DOUBLE( 0x1.0p-126, 0x1, -126 );
}
|
||||
|
||||
// Returns non-zero when |x|, reduced by 'ulps' times 2^-1074, falls below
// 2^-1022, i.e. when the result is close enough to the subnormal range to be
// treated as flushable.
static inline int IsDoubleResultSubnormal( long double x, float ulps )
{
    const long double allowance = MAKE_HEX_LONG( 0x1.0p-1074, 0x1, -1074) * (long double) ulps;
    const long double reduced = fabsl(x) - allowance;

    return reduced < MAKE_HEX_LONG( 0x1.0p-1022, 0x1, -1022 );
}
|
||||
|
||||
static inline int IsFloatInfinity(double x)
|
||||
{
|
||||
union { cl_float d; cl_uint u; } u;
|
||||
u.d = (cl_float) x;
|
||||
return ((u.u & 0x7fffffffU) == 0x7F800000U);
|
||||
}
|
||||
|
||||
static inline int IsFloatMaxFloat(double x)
|
||||
{
|
||||
union { cl_float d; cl_uint u; } u;
|
||||
u.d = (cl_float) x;
|
||||
return ((u.u & 0x7fffffffU) == 0x7F7FFFFFU);
|
||||
}
|
||||
|
||||
static inline int IsFloatNaN(double x)
|
||||
{
|
||||
union { cl_float d; cl_uint u; } u;
|
||||
u.d = (cl_float) x;
|
||||
return ((u.u & 0x7fffffffU) > 0x7F800000U);
|
||||
}
|
||||
|
||||
extern cl_uint RoundUpToNextPowerOfTwo( cl_uint x );
|
||||
|
||||
// Windows (since long double got deprecated) sets the x87 to 53-bit precision
|
||||
// (that's x87 default state). This causes problems with the tests that
|
||||
// convert long and ulong to float and double or otherwise deal with values
|
||||
// that need more precision than 53-bit. So, set the x87 to 64-bit precision.
|
||||
// Sets the x87 precision-control field to 64-bit (double extended) precision.
// Only two build configurations do anything: MinGW, and the Intel compiler on
// Windows. Everywhere else the function body is empty.
static inline void Force64BitFPUPrecision(void)
|
||||
{
|
||||
#if __MINGW32__
|
||||
// The usual method is to use _controlfp as follows:
|
||||
// #include <float.h>
|
||||
// _controlfp(_PC_64, _MCW_PC);
|
||||
//
|
||||
// _controlfp is available on MinGW32 but not on MinGW64. Instead of having
|
||||
// divergent code just use inline assembly which works for both.
|
||||
// orig_cw receives the current control word; new_cw is the value loaded back.
unsigned short int orig_cw = 0;
|
||||
unsigned short int new_cw = 0;
|
||||
// Read the current control word into orig_cw.
__asm__ __volatile__ ("fstcw %0":"=m" (orig_cw));
|
||||
new_cw = orig_cw | 0x0300; // set precision to 64-bit
|
||||
// Load the modified control word.
__asm__ __volatile__ ("fldcw %0"::"m" (new_cw));
|
||||
#elif defined( _WIN32 ) && defined( __INTEL_COMPILER )
|
||||
// Unfortunately, usual method (`_controlfp( _PC_64, _MCW_PC );') does *not* work on win.x64:
|
||||
// > On the x64 architecture, changing the floating point precision is not supported.
|
||||
// (Taken from http://msdn.microsoft.com/en-us/library/e9b52ceh%28v=vs.100%29.aspx)
|
||||
// NOTE(review): cw is an int but the control word is presumably 16 bits wide,
// so only the low half of cw would be meaningful here - confirm.
int cw;
|
||||
__asm { fnstcw cw }; // Get current value of FPU control word.
|
||||
// Clears bits 8-9, then sets both of them.
cw = cw & 0xfffffcff | ( 3 << 8 ); // Set Precision Control to Double Extended Precision.
|
||||
__asm { fldcw cw }; // Set new value of FPU control word.
|
||||
#else
|
||||
/* Implement for other platforms if needed */
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
#else
|
||||
extern
|
||||
#endif
|
||||
void memset_pattern4(void *dest, const void *src_pattern, size_t bytes );
|
||||
|
||||
// Overlay of a 32-bit signed integer and a float, used to read or write the
// bit pattern of a float (see compareFloats).
typedef union
{
    int32_t i;  // the bits, viewed as a signed integer
    float   f;  // the same bits, viewed as a float
} int32f_t;
|
||||
|
||||
// Overlay of a 64-bit signed integer and a double, used to read or write the
// bit pattern of a double (see compareDoubles).
typedef union
{
    int64_t l;  // the bits, viewed as a signed integer
    double  d;  // the same bits, viewed as a double
} int64d_t;
|
||||
|
||||
void MulD(double *rhi, double *rlo, double u, double v);
|
||||
void AddD(double *rhi, double *rlo, double a, double b);
|
||||
void MulDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl);
|
||||
void AddDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl);
|
||||
void DivideDD(double *chi, double *clo, double a, double b);
|
||||
int compareFloats(float x, float y);
|
||||
int compareDoubles(double x, double y);
|
||||
|
||||
void logFunctionInfo(const char *fname, unsigned int float_size, unsigned int isFastRelaxed);
|
||||
|
||||
#endif /* UTILITY_H */
|
||||
|
||||
|
||||
1537
test_conformance/math_brute_force/binary.c
Normal file
1537
test_conformance/math_brute_force/binary.c
Normal file
File diff suppressed because it is too large
Load Diff
1455
test_conformance/math_brute_force/binaryOperator.c
Normal file
1455
test_conformance/math_brute_force/binaryOperator.c
Normal file
File diff suppressed because it is too large
Load Diff
1214
test_conformance/math_brute_force/binary_i.c
Normal file
1214
test_conformance/math_brute_force/binary_i.c
Normal file
File diff suppressed because it is too large
Load Diff
1133
test_conformance/math_brute_force/binary_two_results_i.c
Normal file
1133
test_conformance/math_brute_force/binary_two_results_i.c
Normal file
File diff suppressed because it is too large
Load Diff
629
test_conformance/math_brute_force/i_unary.c
Normal file
629
test_conformance/math_brute_force/i_unary.c
Normal file
@@ -0,0 +1,629 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#include "Utility.h"
|
||||
|
||||
#include <string.h>
|
||||
#include "FunctionList.h"
|
||||
|
||||
int TestFunc_Int_Float(const Func *f, MTdata);
|
||||
int TestFunc_Int_Double(const Func *f, MTdata);
|
||||
|
||||
#if defined( __cplusplus)
|
||||
extern "C"
|
||||
#endif
|
||||
const vtbl _i_unary = { "i_unary", TestFunc_Int_Float, TestFunc_Int_Double };
|
||||
|
||||
|
||||
static int BuildKernel( const char *name, int vectorSize, cl_kernel *k, cl_program *p );
|
||||
static int BuildKernelDouble( const char *name, int vectorSize, cl_kernel *k, cl_program *p );
|
||||
|
||||
// Build the single-precision test kernel for one vector size.
//   name        OpenCL function to call inside the kernel
//   vectorSize  index into sizeNames / sizeValues
//   k, p        receive the kernel and program created by MakeKernel
// Returns whatever MakeKernel returns.
//
// Two kernel sources are assembled. The generic one indexes typed vector
// pointers directly. The one used when sizeValues[vectorSize] == 3 takes
// scalar pointers, goes through vload3/vstore3, and has a separate path for
// the last work-item.
static int BuildKernel( const char *name, int vectorSize, cl_kernel *k, cl_program *p )
{
    const char *genericSrc[] = {
        "__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in)\n"
        "{\n"
        " int i = get_global_id(0);\n"
        " out[i] = ", name, "( in[i] );\n"
        "}\n"
    };

    const char *vec3Src[] = {
        "__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global float* in)\n"
        "{\n"
        " size_t i = get_global_id(0);\n"
        " if( i + 1 < get_global_size(0) )\n"
        " {\n"
        " float3 f0 = vload3( 0, in + 3 * i );\n"
        " int3 i0 = ", name, "( f0 );\n"
        " vstore3( i0, 0, out + 3*i );\n"
        " }\n"
        " else\n"
        " {\n"
        " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
        " float3 f0;\n"
        " switch( parity )\n"
        " {\n"
        " case 1:\n"
        " f0 = (float3)( in[3*i], NAN, NAN ); \n"
        " break;\n"
        " case 0:\n"
        " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
        " break;\n"
        " }\n"
        " int3 i0 = ", name, "( f0 );\n"
        " switch( parity )\n"
        " {\n"
        " case 0:\n"
        " out[3*i+1] = i0.y; \n"
        " // fall through\n"
        " case 1:\n"
        " out[3*i] = i0.x; \n"
        " break;\n"
        " }\n"
        " }\n"
        "}\n"
    };

    // Kernel entry point name, e.g. "math_kernel" followed by the size suffix.
    char testName[32];
    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );

    if( sizeValues[vectorSize] == 3 )
        return MakeKernel( vec3Src, (cl_uint) (sizeof(vec3Src)/sizeof(vec3Src[0])), testName, k, p );

    return MakeKernel( genericSrc, (cl_uint) (sizeof(genericSrc)/sizeof(genericSrc[0])), testName, k, p );
}
|
||||
|
||||
// Build the double-precision test kernel for one vector size.
//   name        OpenCL function to call inside the kernel
//   vectorSize  index into sizeNames / sizeValues
//   k, p        receive the kernel and program created by MakeKernel
// Returns whatever MakeKernel returns.
//
// Both sources start by enabling cl_khr_fp64. The generic one indexes typed
// vector pointers directly. The one used when sizeValues[vectorSize] == 3
// takes scalar pointers, goes through vload3/vstore3, and has a separate path
// for the last work-item.
static int BuildKernelDouble( const char *name, int vectorSize, cl_kernel *k, cl_program *p )
{
    const char *genericSrc[] = {
        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
        "__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in)\n"
        "{\n"
        " int i = get_global_id(0);\n"
        " out[i] = ", name, "( in[i] );\n"
        "}\n"
    };

    const char *vec3Src[] = {
        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
        "__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global double* in)\n"
        "{\n"
        " size_t i = get_global_id(0);\n"
        " if( i + 1 < get_global_size(0) )\n"
        " {\n"
        " double3 f0 = vload3( 0, in + 3 * i );\n"
        " int3 i0 = ", name, "( f0 );\n"
        " vstore3( i0, 0, out + 3*i );\n"
        " }\n"
        " else\n"
        " {\n"
        " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
        " double3 f0;\n"
        " switch( parity )\n"
        " {\n"
        " case 1:\n"
        " f0 = (double3)( in[3*i], NAN, NAN ); \n"
        " break;\n"
        " case 0:\n"
        " f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
        " break;\n"
        " }\n"
        " int3 i0 = ", name, "( f0 );\n"
        " switch( parity )\n"
        " {\n"
        " case 0:\n"
        " out[3*i+1] = i0.y; \n"
        " // fall through\n"
        " case 1:\n"
        " out[3*i] = i0.x; \n"
        " break;\n"
        " }\n"
        " }\n"
        "}\n"
    };

    // Kernel entry point name, e.g. "math_kernel" followed by the size suffix.
    char testName[32];
    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );

    if( sizeValues[vectorSize] == 3 )
        return MakeKernel( vec3Src, (cl_uint) (sizeof(vec3Src)/sizeof(vec3Src[0])), testName, k, p );

    return MakeKernel( genericSrc, (cl_uint) (sizeof(genericSrc)/sizeof(genericSrc[0])), testName, k, p );
}
|
||||
|
||||
typedef struct BuildKernelInfo
|
||||
{
|
||||
cl_uint offset; // the first vector size to build
|
||||
cl_kernel *kernels;
|
||||
cl_program *programs;
|
||||
const char *nameInCode;
|
||||
}BuildKernelInfo;
|
||||
|
||||
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );

// ThreadPool_Do job: builds the float kernel for vector-size index
// (info->offset + job_id). 'p' points to a BuildKernelInfo.
// Returns the result of BuildKernel.
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
{
    BuildKernelInfo *job = (BuildKernelInfo*) p;
    const cl_uint sizeIndex = job->offset + job_id;
    cl_kernel *kernelSlot = job->kernels + sizeIndex;
    cl_program *programSlot = job->programs + sizeIndex;

    return BuildKernel( job->nameInCode, sizeIndex, kernelSlot, programSlot );
}
|
||||
|
||||
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );

// ThreadPool_Do job: builds the double kernel for vector-size index
// (info->offset + job_id). 'p' points to a BuildKernelInfo.
// Returns the result of BuildKernelDouble.
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
{
    BuildKernelInfo *job = (BuildKernelInfo*) p;
    const cl_uint sizeIndex = job->offset + job_id;
    cl_kernel *kernelSlot = job->kernels + sizeIndex;
    cl_program *programSlot = job->programs + sizeIndex;

    return BuildKernelDouble( job->nameInCode, sizeIndex, kernelSlot, programSlot );
}
|
||||
|
||||
// Brute-force test of a function that takes a float and returns an int.
//
//   f  table entry for the function under test (name, kernel name, host
//      reference f->func.i_f, ftz flag)
//   d  random number generator state, used only for the timing run
//
// Builds one kernel per vector size in [gMinVectorSizeIndex,
// gMaxVectorSizeIndex), then walks the 32-bit input space in chunks of
// 'step' bit patterns. Each chunk is run through every kernel and the device
// results are compared for exact equality with the host reference. When ftz
// applies and the input is subnormal, the result for +0.0 or -0.0 is also
// accepted. If gMeasureTimes is set, each kernel is then timed on random
// input.
//
// Returns 0 on success, otherwise the first error encountered (-1 for a
// result mismatch). Once the kernels are built, every exit path restores the
// floating-point state and releases the kernels and programs.
int TestFunc_Int_Float(const Func *f, MTdata d)
{
    uint64_t i;
    uint32_t j, k;
    int error;
    cl_program programs[ VECTOR_SIZE_COUNT ];
    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
    int ftz = f->ftz || 0 == (gFloatCapabilities & CL_FP_DENORM) || gForceFTZ;
    size_t bufferSize = (gWimpyMode)?gWimpyBufferSize:BUFFER_SIZE;
    uint64_t step = bufferSize / sizeof( float );
    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( float )) + 1);

    logFunctionInfo(f->name,sizeof(cl_float),gTestFastRelaxed);
    if( gWimpyMode )
    {
        step = (1ULL<<32) * gWimpyReductionFactor / (512);
    }

    // This test is not using ThreadPool so we need to disable FTZ here
    // for reference computations
    FPU_mode_type oldMode;
    DisableFTZ(&oldMode);

    Force64BitFPUPrecision();

    // Init the kernels
    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode };
    if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
    {
        // Some kernels/programs may not have been created, so the release
        // loop at 'exit' is skipped; the FP state still has to be put back.
        RestoreFPState(&oldMode);
        return error;
    }

    for( i = 0; i < (1ULL<<32); i += step )
    {
        //Init input array
        uint32_t *p = (uint32_t *)gIn;
        if( gWimpyMode )
        {
            for( j = 0; j < bufferSize / sizeof( float ); j++ )
                p[j] = (uint32_t) i + j * scale;
        }
        else
        {
            for( j = 0; j < bufferSize / sizeof( float ); j++ )
                p[j] = (uint32_t) i + j;
        }
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            goto exit;
        }

        // write garbage into output arrays
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            uint32_t pattern = 0xffffdead;
            memset_pattern4(gOut[j], &pattern, bufferSize);
            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
            {
                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
                goto exit;
            }
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
            {
                vlog_error( "FAILED -- could not execute kernel\n" );
                goto exit;
            }
        }

        // Get that moving
        if( (error = clFlush(gQueue) ))
            vlog( "clFlush failed\n" );

        //Calculate the correctly rounded reference result
        int *r = (int *)gOut_Ref;
        float *s = (float *)gIn;
        for( j = 0; j < bufferSize / sizeof( float ); j++ )
            r[j] = f->func.i_f( s[j] );

        // Read the data back
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
            {
                vlog_error( "ReadArray failed %d\n", error );
                goto exit;
            }
        }

        if( gSkipCorrectnessTesting )
            break;

        //Verify data
        uint32_t *t = (uint32_t *)gOut_Ref;
        for( j = 0; j < bufferSize / sizeof( float ); j++ )
        {
            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
            {
                uint32_t *q = (uint32_t *)(gOut[k]);
                // If we aren't getting the correctly rounded result
                if( t[j] != q[j] )
                {
                    // A device that flushes the subnormal input to zero may
                    // legitimately return the result for +0.0 or -0.0.
                    if( ftz && IsFloatSubnormal(s[j]))
                    {
                        unsigned int correct0 = f->func.i_f( 0.0 );
                        unsigned int correct1 = f->func.i_f( -0.0 );
                        if( q[j] == correct0 || q[j] == correct1 )
                            continue;
                    }

                    uint32_t err = t[j] - q[j];
                    if( q[j] > t[j] )
                        err = q[j] - t[j];
                    vlog_error( "\nERROR: %s%s: %d ulp error at %a (0x%8.8x): *%d vs. %d\n", f->name, sizeNames[k], err, ((float*) gIn)[j], ((cl_uint*) gIn)[j], t[j], q[j] );
                    error = -1;
                    goto exit;
                }
            }
        }

        if( 0 == (i & 0x0fffffff) )
        {
            if (gVerboseBruteForce)
            {
                // i and step are 64-bit and bufferSize is a size_t: the
                // conversions must match the argument types.
                vlog("base:%14llu step:%10llu bufferSize:%10zu \n", (unsigned long long) i, (unsigned long long) step, bufferSize);
            } else
            {
                vlog("." );
            }
            fflush(stdout);
        }
    }

    if( ! gSkipCorrectnessTesting )
    {
        if( gWimpyMode )
            vlog( "Wimp pass" );
        else
            vlog( "passed" );
    }

    if( gMeasureTimes )
    {
        //Init input array
        uint32_t *p = (uint32_t *)gIn;
        for( j = 0; j < bufferSize / sizeof( float ); j++ )
            p[j] = genrand_int32(d);
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            goto exit;
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            double sum = 0.0;
            double bestTime = INFINITY;
            for( k = 0; k < PERF_LOOP_COUNT; k++ )
            {
                uint64_t startTime = GetTime();
                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
                {
                    vlog_error( "FAILED -- could not execute kernel\n" );
                    goto exit;
                }

                // Make sure OpenCL is done
                if( (error = clFinish(gQueue) ) )
                {
                    vlog_error( "Error %d at clFinish\n", error );
                    goto exit;
                }

                uint64_t endTime = GetTime();
                double time = SubtractTime( endTime, startTime );
                sum += time;
                if( time < bestTime )
                    bestTime = time;
            }

            if( gReportAverageTimes )
                bestTime = sum / PERF_LOOP_COUNT;
            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
        }
    }

    vlog( "\n" );

exit:
    RestoreFPState(&oldMode);

    // Release
    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
    {
        clReleaseKernel(kernels[k]);
        clReleaseProgram(programs[k]);
    }

    return error;
}
|
||||
|
||||
// Brute-force test of a builtin that takes a double and returns an int.
//
// The loop walks the 32-bit index space [0, 2^32) in chunks of `step`; each
// index is turned into a double input by DoubleFromUInt32(). For every chunk
// the kernels for all enabled vector sizes are run on the device, the
// reference is computed on the host with f->dfunc.i_f, and the device results
// must match the reference exactly (with an allowance for flushed subnormal
// inputs when running in flush-to-zero mode).
//
// When gMeasureTimes is set, each kernel is additionally timed on random input.
//
// Parameters:
//   f - description of the function under test (name, reference, ftz flag)
//   d - random number generator state, used only for the timing input
//
// Returns 0 on success, an OpenCL error code if an API call fails, or -1 if a
// device result does not match the reference.
//
// All failure paths leave through `exit` so that the FP state saved by
// DisableFTZ() is restored and the kernels/programs are released.
int TestFunc_Int_Double(const Func *f, MTdata d)
{
    uint64_t i;
    uint32_t j, k;
    int error;
    // NULL-initialised so the cleanup code can tell which handles exist.
    cl_program programs[ VECTOR_SIZE_COUNT ] = { NULL };
    cl_kernel kernels[ VECTOR_SIZE_COUNT ] = { NULL };
    int ftz = f->ftz || gForceFTZ;
    size_t bufferSize = (gWimpyMode)?gWimpyBufferSize:BUFFER_SIZE;
    uint64_t step = bufferSize / sizeof( cl_double );
    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( cl_double )) + 1);

    logFunctionInfo(f->name,sizeof(cl_double),gTestFastRelaxed);
    if( gWimpyMode )
    {
        step = (1ULL<<32) * gWimpyReductionFactor / (512);
    }

    // This test is not using ThreadPool so we need to disable FTZ here
    // for reference computations
    FPU_mode_type oldMode;
    DisableFTZ(&oldMode);

    Force64BitFPUPrecision();

    // Init the kernels
    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode };
    if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                &build_info ) ))
    {
        goto exit;
    }

    for( i = 0; i < (1ULL<<32); i += step )
    {
        //Init input array
        double *p = (double *)gIn;
        if( gWimpyMode )
        {
            for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
                p[j] = DoubleFromUInt32( (uint32_t) i + j * scale );
        }
        else
        {
            for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
                p[j] = DoubleFromUInt32( (uint32_t) i + j );
        }

        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            goto exit;
        }

        // write garbage into output arrays
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            uint32_t pattern = 0xffffdead;
            memset_pattern4(gOut[j], &pattern, bufferSize);
            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
            {
                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
                goto exit;
            }
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // bufferSize / vectorSize rounded up
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
            {
                vlog_error( "FAILED -- could not execute kernel\n" );
                goto exit;
            }
        }

        // Get that moving
        if( (error = clFlush(gQueue) ))
            vlog( "clFlush failed\n" );

        //Calculate the correctly rounded reference result
        int *r = (int *)gOut_Ref;
        double *s = (double *)gIn;
        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
            r[j] = f->dfunc.i_f( s[j] );

        // Read the data back
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
            {
                vlog_error( "ReadArray failed %d\n", error );
                goto exit;
            }
        }

        // One pass is enough to exercise the kernels when results are not checked
        if( gSkipCorrectnessTesting )
            break;

        //Verify data
        uint32_t *t = (uint32_t *)gOut_Ref;
        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
        {
            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
            {
                uint32_t *q = (uint32_t *)(gOut[k]);
                // If we aren't getting the correctly rounded result
                if( t[j] != q[j] )
                {
                    // In flush-to-zero mode a subnormal input may be treated as +0 or -0
                    if( ftz && IsDoubleSubnormal(s[j]))
                    {
                        unsigned int correct0 = f->dfunc.i_f( 0.0 );
                        unsigned int correct1 = f->dfunc.i_f( -0.0 );
                        if( q[j] == correct0 || q[j] == correct1 )
                            continue;
                    }

                    uint32_t err = t[j] - q[j];
                    if( q[j] > t[j] )
                        err = q[j] - t[j];
                    vlog_error( "\nERROR: %sD%s: %d ulp error at %.13la: *%d vs. %d\n", f->name, sizeNames[k], err, ((double*) gIn)[j], t[j], q[j] );
                    error = -1;
                    goto exit;
                }
            }
        }

        if( 0 == (i & 0x0fffffff) )
        {
            if (gVerboseBruteForce)
            {
                // i and step are 64-bit; cast so the arguments match the conversion specifiers
                vlog("base:%14llu step:%10llu bufferSize:%10zu \n", (unsigned long long) i, (unsigned long long) step, bufferSize);
            } else
            {
                vlog("." );
            }
            fflush(stdout);
        }
    }

    if( ! gSkipCorrectnessTesting )
    {
        if( gWimpyMode )
            vlog( "Wimp pass" );
        else
            vlog( "passed" );
    }

    if( gMeasureTimes )
    {
        //Init input array
        double *p = (double *)gIn;
        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
            p[j] = DoubleFromUInt32( genrand_int32(d) );
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            goto exit;
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            double sum = 0.0;
            double bestTime = INFINITY;
            for( k = 0; k < PERF_LOOP_COUNT; k++ )
            {
                uint64_t startTime = GetTime();
                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
                {
                    vlog_error( "FAILED -- could not execute kernel\n" );
                    goto exit;
                }

                // Make sure OpenCL is done
                if( (error = clFinish(gQueue) ) )
                {
                    vlog_error( "Error %d at clFinish\n", error );
                    goto exit;
                }

                uint64_t endTime = GetTime();
                double time = SubtractTime( endTime, startTime );
                sum += time;
                if( time < bestTime )
                    bestTime = time;
            }

            if( gReportAverageTimes )
                bestTime = sum / PERF_LOOP_COUNT;
            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
        }
    }

    vlog( "\n" );

exit:
    RestoreFPState(&oldMode);
    // Release whatever was actually created
    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
    {
        if( kernels[k] )
            clReleaseKernel(kernels[k]);
        if( programs[k] )
            clReleaseProgram(programs[k]);
    }

    return error;
}
|
||||
|
||||
|
||||
|
||||
1215
test_conformance/math_brute_force/macro_binary.c
Normal file
1215
test_conformance/math_brute_force/macro_binary.c
Normal file
File diff suppressed because it is too large
Load Diff
965
test_conformance/math_brute_force/macro_unary.c
Normal file
965
test_conformance/math_brute_force/macro_unary.c
Normal file
@@ -0,0 +1,965 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#include "Utility.h"
|
||||
|
||||
#include <string.h>
|
||||
#include "FunctionList.h"
|
||||
|
||||
int TestMacro_Int_Float(const Func *f, MTdata);
|
||||
int TestMacro_Int_Double(const Func *f, MTdata);
|
||||
|
||||
#if defined( __cplusplus)
|
||||
extern "C"
|
||||
#endif
|
||||
const vtbl _macro_unary = { "macro_unary", TestMacro_Int_Float, TestMacro_Int_Double };
|
||||
|
||||
static int BuildKernel( const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p );
|
||||
static int BuildKernelDouble( const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p );
|
||||
|
||||
// Assemble the OpenCL C source for the float -> int test kernel of one vector
// size and hand it to MakeKernels().
//
//   name         - text spliced into the kernel source as the function being called
//   vectorSize   - index into sizeNames[] / sizeValues[]
//   kernel_count - forwarded to MakeKernels()
//   k, p         - forwarded to MakeKernels() (kernel and program outputs)
//
// Returns whatever MakeKernels() returns.
static int BuildKernel( const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p )
{
    const char *const widthSuffix = sizeNames[vectorSize];

    // Source used for every vector width except 3: one work-item per vector element.
    const char *genericSrc[] = { "__kernel void math_kernel", widthSuffix, "( __global int", widthSuffix, "* out, __global float", widthSuffix, "* in)\n"
                                 "{\n"
                                 " int i = get_global_id(0);\n"
                                 " out[i] = ", name, "( in[i] );\n"
                                 "}\n"
                               };

    // Source used for width 3: vload3/vstore3 on scalar pointers, with the last
    // work-item handling the leftover elements.
    const char *vec3Src[] = { "__kernel void math_kernel", widthSuffix, "( __global int* out, __global float* in)\n"
                              "{\n"
                              " size_t i = get_global_id(0);\n"
                              " if( i + 1 < get_global_size(0) )\n"
                              " {\n"
                              " float3 f0 = vload3( 0, in + 3 * i );\n"
                              " int3 i0 = ", name, "( f0 );\n"
                              " vstore3( i0, 0, out + 3*i );\n"
                              " }\n"
                              " else\n"
                              " {\n"
                              " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
                              " int3 i0;\n"
                              " float3 f0;\n"
                              " switch( parity )\n"
                              " {\n"
                              " case 1:\n"
                              " f0 = (float3)( in[3*i], 0xdead, 0xdead ); \n"
                              " break;\n"
                              " case 0:\n"
                              " f0 = (float3)( in[3*i], in[3*i+1], 0xdead ); \n"
                              " break;\n"
                              " }\n"
                              " i0 = ", name, "( f0 );\n"
                              " switch( parity )\n"
                              " {\n"
                              " case 0:\n"
                              " out[3*i+1] = i0.y; \n"
                              " // fall through\n"
                              " case 1:\n"
                              " out[3*i] = i0.x; \n"
                              " break;\n"
                              " }\n"
                              " }\n"
                              "}\n"
                            };

    // Pick the source fragments that match the requested width.
    const int isVec3 = ( 3 == sizeValues[vectorSize] );
    const char **sourceParts = isVec3 ? vec3Src : genericSrc;
    const size_t partCount = isVec3 ? sizeof(vec3Src)/sizeof(vec3Src[0])
                                    : sizeof(genericSrc)/sizeof(genericSrc[0]);

    char kernelName[32];
    snprintf( kernelName, sizeof( kernelName ) -1, "math_kernel%s", widthSuffix );

    return MakeKernels( sourceParts, (cl_uint) partCount, kernelName, kernel_count, k, p );
}
|
||||
|
||||
// Assemble the OpenCL C source for the double -> long test kernel of one
// vector size (with cl_khr_fp64 enabled) and hand it to MakeKernels().
//
//   name         - text spliced into the kernel source as the function being called
//   vectorSize   - index into sizeNames[] / sizeValues[]
//   kernel_count - forwarded to MakeKernels()
//   k, p         - forwarded to MakeKernels() (kernel and program outputs)
//
// Returns whatever MakeKernels() returns.
static int BuildKernelDouble( const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p )
{
    const char *const widthSuffix = sizeNames[vectorSize];

    // Source used for every vector width except 3.
    const char *genericSrc[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                                 "__kernel void math_kernel", widthSuffix, "( __global long", widthSuffix, "* out, __global double", widthSuffix, "* in)\n"
                                 "{\n"
                                 " int i = get_global_id(0);\n"
                                 " out[i] = ", name, "( in[i] );\n"
                                 "}\n"
                               };

    // Source used for width 3: vload3/vstore3 on scalar pointers, with the last
    // work-item handling the leftover elements.
    const char *vec3Src[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                              "__kernel void math_kernel", widthSuffix, "( __global long* out, __global double* in)\n"
                              "{\n"
                              " size_t i = get_global_id(0);\n"
                              " if( i + 1 < get_global_size(0) )\n"
                              " {\n"
                              " double3 d0 = vload3( 0, in + 3 * i );\n"
                              " long3 l0 = ", name, "( d0 );\n"
                              " vstore3( l0, 0, out + 3*i );\n"
                              " }\n"
                              " else\n"
                              " {\n"
                              " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
                              " double3 d0;\n"
                              " switch( parity )\n"
                              " {\n"
                              " case 1:\n"
                              " d0 = (double3)( in[3*i], NAN, NAN ); \n"
                              " break;\n"
                              " case 0:\n"
                              " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
                              " break;\n"
                              " }\n"
                              " long3 l0 = ", name, "( d0 );\n"
                              " switch( parity )\n"
                              " {\n"
                              " case 0:\n"
                              " out[3*i+1] = l0.y; \n"
                              " // fall through\n"
                              " case 1:\n"
                              " out[3*i] = l0.x; \n"
                              " break;\n"
                              " }\n"
                              " }\n"
                              "}\n"
                            };

    // Pick the source fragments that match the requested width.
    const int isVec3 = ( 3 == sizeValues[vectorSize] );
    const char **sourceParts = isVec3 ? vec3Src : genericSrc;
    const size_t partCount = isVec3 ? sizeof(vec3Src)/sizeof(vec3Src[0])
                                    : sizeof(genericSrc)/sizeof(genericSrc[0]);

    char kernelName[32];
    snprintf( kernelName, sizeof( kernelName ) -1, "math_kernel%s", widthSuffix );

    return MakeKernels( sourceParts, (cl_uint) partCount, kernelName, kernel_count, k, p );
}
|
||||
|
||||
typedef struct BuildKernelInfo
|
||||
{
|
||||
cl_uint offset; // the first vector size to build
|
||||
cl_uint kernel_count;
|
||||
cl_kernel **kernels;
|
||||
cl_program *programs;
|
||||
const char *nameInCode;
|
||||
}BuildKernelInfo;
|
||||
|
||||
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );

// Thread-pool job: build the float kernels for vector-size index (offset + job_id).
// p points to a BuildKernelInfo; the result of BuildKernel() is returned unchanged.
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
{
    const BuildKernelInfo *buildInfo = (const BuildKernelInfo *) p;
    const cl_uint vectorIndex = buildInfo->offset + job_id;

    return BuildKernel( buildInfo->nameInCode,
                        vectorIndex,
                        buildInfo->kernel_count,
                        buildInfo->kernels[vectorIndex],
                        &buildInfo->programs[vectorIndex] );
}
|
||||
|
||||
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );

// Thread-pool job: build the double kernels for vector-size index (offset + job_id).
// p points to a BuildKernelInfo; the result of BuildKernelDouble() is returned unchanged.
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
{
    const BuildKernelInfo *buildInfo = (const BuildKernelInfo *) p;
    const cl_uint vectorIndex = buildInfo->offset + job_id;

    return BuildKernelDouble( buildInfo->nameInCode,
                              vectorIndex,
                              buildInfo->kernel_count,
                              buildInfo->kernels[vectorIndex],
                              &buildInfo->programs[vectorIndex] );
}
|
||||
|
||||
//Thread specific data for a worker thread
|
||||
typedef struct ThreadInfo
|
||||
{
|
||||
cl_mem inBuf; // input buffer for the thread
|
||||
cl_mem outBuf[ VECTOR_SIZE_COUNT ]; // output buffers for the thread
|
||||
cl_command_queue tQueue; // per thread command queue to improve performance
|
||||
}ThreadInfo;
|
||||
|
||||
typedef struct TestInfo
|
||||
{
|
||||
size_t subBufferSize; // Size of the sub-buffer in elements
|
||||
const Func *f; // A pointer to the function info
|
||||
cl_program programs[ VECTOR_SIZE_COUNT ]; // programs for various vector sizes
|
||||
cl_kernel *k[VECTOR_SIZE_COUNT ]; // arrays of thread-specific kernels for each worker thread: k[vector_size][thread_id]
|
||||
ThreadInfo *tinfo; // An array of thread specific information for each worker thread
|
||||
cl_uint threadCount; // Number of worker threads
|
||||
cl_uint step; // step between each chunk and the next.
|
||||
cl_uint scale; // stride between individual test values
|
||||
int ftz; // non-zero if running in flush to zero mode
|
||||
|
||||
}TestInfo;
|
||||
|
||||
static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p );
|
||||
|
||||
int TestMacro_Int_Float(const Func *f, MTdata d)
|
||||
{
|
||||
TestInfo test_info;
|
||||
cl_int error;
|
||||
size_t i, j;
|
||||
|
||||
logFunctionInfo(f->name,sizeof(cl_float),gTestFastRelaxed);
|
||||
|
||||
// Init test_info
|
||||
memset( &test_info, 0, sizeof( test_info ) );
|
||||
test_info.threadCount = GetThreadCount();
|
||||
test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
|
||||
test_info.scale = 1;
|
||||
if (gWimpyMode )
|
||||
{
|
||||
test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
|
||||
test_info.scale = (cl_uint) sizeof(cl_float) * 2 * gWimpyReductionFactor;
|
||||
}
|
||||
test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
|
||||
test_info.f = f;
|
||||
test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
|
||||
// cl_kernels aren't thread safe, so we make one for each vector size for every thread
|
||||
for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
|
||||
{
|
||||
size_t array_size = test_info.threadCount * sizeof( cl_kernel );
|
||||
test_info.k[i] = (cl_kernel*)malloc( array_size );
|
||||
if( NULL == test_info.k[i] )
|
||||
{
|
||||
vlog_error( "Error: Unable to allocate storage for kernels!\n" );
|
||||
error = CL_OUT_OF_HOST_MEMORY;
|
||||
goto exit;
|
||||
}
|
||||
memset( test_info.k[i], 0, array_size );
|
||||
}
|
||||
test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
|
||||
if( NULL == test_info.tinfo )
|
||||
{
|
||||
vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
|
||||
error = CL_OUT_OF_HOST_MEMORY;
|
||||
goto exit;
|
||||
}
|
||||
memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
|
||||
for( i = 0; i < test_info.threadCount; i++ )
|
||||
{
|
||||
cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) };
|
||||
test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
|
||||
if( error || NULL == test_info.tinfo[i].inBuf)
|
||||
{
|
||||
vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
|
||||
goto exit;
|
||||
}
|
||||
|
||||
for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
|
||||
{
|
||||
test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
|
||||
if( error || NULL == test_info.tinfo[i].outBuf[j] )
|
||||
{
|
||||
vlog_error( "Error: Unable to create sub-buffer of gOutBuffer for region {%zd, %zd}\n", region.origin, region.size );
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
test_info.tinfo[i].tQueue = clCreateCommandQueueWithProperties(gContext, gDevice, 0, &error);
|
||||
if( NULL == test_info.tinfo[i].tQueue || error )
|
||||
{
|
||||
vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
// Init the kernels
|
||||
{
|
||||
BuildKernelInfo build_info = { gMinVectorSizeIndex, test_info.threadCount, test_info.k, test_info.programs, f->nameInCode };
|
||||
if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
|
||||
goto exit;
|
||||
}
|
||||
|
||||
if( !gSkipCorrectnessTesting )
|
||||
{
|
||||
error = ThreadPool_Do( TestFloat, (cl_uint) ((1ULL<<32) / test_info.step), &test_info );
|
||||
|
||||
if( error )
|
||||
goto exit;
|
||||
|
||||
if( gWimpyMode )
|
||||
vlog( "Wimp pass" );
|
||||
else
|
||||
vlog( "passed" );
|
||||
}
|
||||
|
||||
if( gMeasureTimes )
|
||||
{
|
||||
//Init input array
|
||||
cl_uint *p = (cl_uint *)gIn;
|
||||
for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ )
|
||||
p[j] = genrand_int32(d);
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
|
||||
{
|
||||
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
|
||||
return error;
|
||||
}
|
||||
|
||||
|
||||
// Run the kernels
|
||||
for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
|
||||
{
|
||||
size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
|
||||
size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; // BUFFER_SIZE / vectorSize rounded up
|
||||
if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
|
||||
|
||||
double sum = 0.0;
|
||||
double bestTime = INFINITY;
|
||||
for( i = 0; i < PERF_LOOP_COUNT; i++ )
|
||||
{
|
||||
uint64_t startTime = GetTime();
|
||||
if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
|
||||
{
|
||||
vlog_error( "FAILED -- could not execute kernel\n" );
|
||||
goto exit;
|
||||
}
|
||||
|
||||
// Make sure OpenCL is done
|
||||
if( (error = clFinish(gQueue) ) )
|
||||
{
|
||||
vlog_error( "Error %d at clFinish\n", error );
|
||||
goto exit;
|
||||
}
|
||||
|
||||
uint64_t endTime = GetTime();
|
||||
double time = SubtractTime( endTime, startTime );
|
||||
sum += time;
|
||||
if( time < bestTime )
|
||||
bestTime = time;
|
||||
}
|
||||
|
||||
if( gReportAverageTimes )
|
||||
bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) );
|
||||
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
|
||||
}
|
||||
}
|
||||
|
||||
vlog( "\n" );
|
||||
|
||||
exit:
|
||||
for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
|
||||
{
|
||||
clReleaseProgram(test_info.programs[i]);
|
||||
if( test_info.k[i] )
|
||||
{
|
||||
for( j = 0; j < test_info.threadCount; j++ )
|
||||
clReleaseKernel(test_info.k[i][j]);
|
||||
|
||||
free( test_info.k[i] );
|
||||
}
|
||||
}
|
||||
if( test_info.tinfo )
|
||||
{
|
||||
for( i = 0; i < test_info.threadCount; i++ )
|
||||
{
|
||||
clReleaseMemObject(test_info.tinfo[i].inBuf);
|
||||
for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
|
||||
clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
|
||||
clReleaseCommandQueue(test_info.tinfo[i].tQueue);
|
||||
}
|
||||
|
||||
free( test_info.tinfo );
|
||||
}
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
// Thread-pool job: test one chunk of float inputs on one worker thread.
//
// The chunk consists of job->subBufferSize 32-bit patterns starting at
// job_id * job->step and spaced job->scale apart. The patterns are written to
// the thread's input sub-buffer, the kernels for all enabled vector sizes are
// run on the thread's own queue, and the results are compared with the host
// reference: the scalar kernel (vector-size index 0) against the reference
// value, all other vector sizes against the negated reference value. In
// flush-to-zero mode a subnormal input may also produce the result for +0 or -0.
//
// Parameters:
//   job_id    - index of the chunk to test
//   thread_id - index of the worker thread (selects tinfo, kernels, host slices)
//   data      - pointer to the shared TestInfo
//
// Returns CL_SUCCESS, an OpenCL error code, or -1 when a result mismatches.
// A mismatch is kept in `error` across the unmap/flush cleanup so that it is
// actually reported to the caller.
static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
{
    const TestInfo *job = (const TestInfo *) data;
    size_t buffer_elements = job->subBufferSize;
    size_t buffer_size = buffer_elements * sizeof( cl_float );
    cl_uint scale = job->scale;
    cl_uint base = job_id * (cl_uint) job->step;
    ThreadInfo *tinfo = job->tinfo + thread_id;
    fptr func = job->f->func;
    int ftz = job->ftz;
    cl_uint j, k;
    cl_int error = CL_SUCCESS;
    cl_int cleanupError;    // status of the cleanup calls; kept apart from the test verdict
    const char *name = job->f->name;

    int signbit_test = 0;
    if(!strcmp(name, "signbit"))
        signbit_test = 1;

    #define ref_func(s) ( signbit_test ? func.i_f_f( s ) : func.i_f( s ) )

    // start the map of the output arrays
    cl_event e[ VECTOR_SIZE_COUNT ];
    cl_int *out[ VECTOR_SIZE_COUNT ];
    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
    {
        out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
        if( error || NULL == out[j])
        {
            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
            return error;
        }
    }

    // Get that moving
    if( (error = clFlush(tinfo->tQueue) ))
        vlog( "clFlush failed\n" );

    // Write the new values to the input array
    cl_uint *p = (cl_uint*) gIn + thread_id * buffer_elements;
    for( j = 0; j < buffer_elements; j++ )
        p[j] = base + j * scale;

    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
    {
        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
        return error;
    }

    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
    {
        //Wait for the map to finish
        if( (error = clWaitForEvents(1, e + j) ))
        {
            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
            return error;
        }
        if( (error = clReleaseEvent( e[j] ) ))
        {
            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
            return error;
        }

        // Fill the result buffer with garbage, so that old results don't carry over
        uint32_t pattern = 0xffffdead;
        memset_pattern4(out[j], &pattern, buffer_size);
        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
        {
            vlog_error( "Error: clEnqueueUnmapMemObject failed! err: %d\n", error );
            return error;
        }

        // run the kernel
        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
        cl_program program = job->programs[j];

        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }

        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
        {
            vlog_error( "FAILED -- could not execute kernel\n" );
            return error;
        }
    }

    // Get that moving
    if( (error = clFlush(tinfo->tQueue) ))
        vlog( "clFlush 2 failed\n" );

    if( gSkipCorrectnessTesting )
        return CL_SUCCESS;

    //Calculate the correctly rounded reference result
    cl_int *r = (cl_int *)gOut_Ref + thread_id * buffer_elements;
    float *s = (float *)p;
    for( j = 0; j < buffer_elements; j++ )
        r[j] = ref_func( s[j] );

    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
    {
        out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
        if( error || NULL == out[j] )
        {
            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
            return error;
        }
    }
    // Wait for the last buffer
    out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
    if( error || NULL == out[j] )
    {
        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
        return error;
    }

    //Verify data
    cl_int *t = (cl_int *)r;
    for( j = 0; j < buffer_elements; j++ )
    {
        // Scalar kernel: only present (and only mapped) when index 0 is enabled
        if( gMinVectorSizeIndex == 0 )
        {
            cl_int *q = out[0];

            // If we aren't getting the correctly rounded result
            if( t[j] != q[j] )
            {
                int flushedOk = 0;
                if( ftz && IsFloatSubnormal( s[j]) )
                {
                    int correct = ref_func( +0.0f );
                    int correct2 = ref_func( -0.0f );
                    flushedOk = ( correct == q[j] || correct2 == q[j] );
                }

                if( !flushedOk )
                {
                    uint32_t err = t[j] - q[j];
                    if( q[j] > t[j] )
                        err = q[j] - t[j];
                    vlog_error( "\nERROR: %s: %d ulp error at %a: *%d vs. %d\n", name, err, ((float*) s)[j], t[j], q[j] );
                    error = -1;
                    goto exit;
                }
            }
        }

        // Vector kernels: always checked, even when the scalar result was
        // accepted as a flushed subnormal. Expected value is the negated reference.
        for( k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++ )
        {
            cl_int *q = out[k];
            // If we aren't getting the correctly rounded result
            if( -t[j] != q[j] )
            {
                if( ftz )
                {
                    if( IsFloatSubnormal( s[j]))
                    {
                        int correct = -ref_func( +0.0f );
                        int correct2 = -ref_func( -0.0f );
                        if( correct == q[j] || correct2 == q[j] )
                            continue;
                    }
                }

                uint32_t err = -t[j] - q[j];
                if( q[j] > -t[j] )
                    err = q[j] + t[j];
                vlog_error( "\nERROR: %s%s: %d ulp error at %a: *%d vs. %d\n", name, sizeNames[k], err, ((float*) s)[j], -t[j], q[j] );
                error = -1;
                goto exit;
            }
        }
    }

exit:
    // Cleanup status goes into cleanupError so a verification failure stored
    // in `error` is not overwritten by a successful unmap/flush.
    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
    {
        if( (cleanupError = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
        {
            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, cleanupError );
            return cleanupError;
        }
    }

    if( (cleanupError = clFlush(tinfo->tQueue) ))
    {
        vlog( "clFlush 3 failed\n" );
        if( CL_SUCCESS == error )
            error = cleanupError;
    }

    if( 0 == ( base & 0x0fffffff) )
    {
        if (gVerboseBruteForce)
        {
            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->threadCount);
        } else
        {
            vlog("." );
        }
        fflush(stdout);
    }

    return error;
}
|
||||
|
||||
static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data );
|
||||
|
||||
int TestMacro_Int_Double(const Func *f, MTdata d)
|
||||
{
|
||||
TestInfo test_info;
|
||||
cl_int error;
|
||||
size_t i, j;
|
||||
|
||||
logFunctionInfo(f->name,sizeof(cl_double),gTestFastRelaxed);
|
||||
// Init test_info
|
||||
memset( &test_info, 0, sizeof( test_info ) );
|
||||
test_info.threadCount = GetThreadCount();
|
||||
test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
|
||||
test_info.scale = 1;
|
||||
if (gWimpyMode )
|
||||
{
|
||||
test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
|
||||
test_info.scale = (cl_uint) sizeof(cl_double) * 2 * gWimpyReductionFactor;
|
||||
}
|
||||
|
||||
test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
|
||||
test_info.f = f;
|
||||
test_info.ftz = f->ftz || gForceFTZ;
|
||||
|
||||
// cl_kernels aren't thread safe, so we make one for each vector size for every thread
|
||||
for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
|
||||
{
|
||||
size_t array_size = test_info.threadCount * sizeof( cl_kernel );
|
||||
test_info.k[i] = (cl_kernel*)malloc( array_size );
|
||||
if( NULL == test_info.k[i] )
|
||||
{
|
||||
vlog_error( "Error: Unable to allocate storage for kernels!\n" );
|
||||
error = CL_OUT_OF_HOST_MEMORY;
|
||||
goto exit;
|
||||
}
|
||||
memset( test_info.k[i], 0, array_size );
|
||||
}
|
||||
test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
|
||||
if( NULL == test_info.tinfo )
|
||||
{
|
||||
vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
|
||||
error = CL_OUT_OF_HOST_MEMORY;
|
||||
goto exit;
|
||||
}
|
||||
memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
|
||||
for( i = 0; i < test_info.threadCount; i++ )
|
||||
{
|
||||
cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) };
|
||||
test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
|
||||
if( error || NULL == test_info.tinfo[i].inBuf)
|
||||
{
|
||||
vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
|
||||
goto exit;
|
||||
}
|
||||
|
||||
for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
|
||||
{
|
||||
/* Qualcomm fix: 9461 read-write flags must be compatible with parent buffer */
|
||||
test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
|
||||
/* Qualcomm fix: end */
|
||||
if( error || NULL == test_info.tinfo[i].outBuf[j] )
|
||||
{
|
||||
vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
test_info.tinfo[i].tQueue = clCreateCommandQueueWithProperties(gContext, gDevice, 0, &error);
|
||||
if( NULL == test_info.tinfo[i].tQueue || error )
|
||||
{
|
||||
vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
// Init the kernels
|
||||
{
|
||||
BuildKernelInfo build_info = { gMinVectorSizeIndex, test_info.threadCount, test_info.k, test_info.programs, f->nameInCode };
|
||||
if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
|
||||
goto exit;
|
||||
}
|
||||
|
||||
if( !gSkipCorrectnessTesting )
|
||||
{
|
||||
error = ThreadPool_Do( TestDouble, (cl_uint) ((1ULL<<32) / test_info.step), &test_info );
|
||||
|
||||
if( error )
|
||||
goto exit;
|
||||
|
||||
if( gWimpyMode )
|
||||
vlog( "Wimp pass" );
|
||||
else
|
||||
vlog( "passed" );
|
||||
}
|
||||
|
||||
if( gMeasureTimes )
|
||||
{
|
||||
//Init input array
|
||||
cl_ulong *p = (cl_ulong *)gIn;
|
||||
for( j = 0; j < BUFFER_SIZE / sizeof( cl_double ); j++ )
|
||||
p[j] = DoubleFromUInt32(genrand_int32(d));
|
||||
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
|
||||
{
|
||||
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
|
||||
return error;
|
||||
}
|
||||
|
||||
|
||||
// Run the kernels
|
||||
for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
|
||||
{
|
||||
size_t vectorSize = sizeValues[j] * sizeof(cl_double);
|
||||
size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
|
||||
if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
|
||||
if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
|
||||
|
||||
double sum = 0.0;
|
||||
double bestTime = INFINITY;
|
||||
for( i = 0; i < PERF_LOOP_COUNT; i++ )
|
||||
{
|
||||
uint64_t startTime = GetTime();
|
||||
if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
|
||||
{
|
||||
vlog_error( "FAILED -- could not execute kernel\n" );
|
||||
goto exit;
|
||||
}
|
||||
|
||||
// Make sure OpenCL is done
|
||||
if( (error = clFinish(gQueue) ) )
|
||||
{
|
||||
vlog_error( "Error %d at clFinish\n", error );
|
||||
goto exit;
|
||||
}
|
||||
|
||||
uint64_t endTime = GetTime();
|
||||
double time = SubtractTime( endTime, startTime );
|
||||
sum += time;
|
||||
if( time < bestTime )
|
||||
bestTime = time;
|
||||
}
|
||||
|
||||
if( gReportAverageTimes )
|
||||
bestTime = sum / PERF_LOOP_COUNT;
|
||||
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) );
|
||||
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
|
||||
}
|
||||
for( ; j < gMaxVectorSizeIndex; j++ )
|
||||
vlog( "\t -- " );
|
||||
}
|
||||
|
||||
vlog( "\n" );
|
||||
|
||||
exit:
|
||||
for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
|
||||
{
|
||||
clReleaseProgram(test_info.programs[i]);
|
||||
if( test_info.k[i] )
|
||||
{
|
||||
for( j = 0; j < test_info.threadCount; j++ )
|
||||
clReleaseKernel(test_info.k[i][j]);
|
||||
|
||||
free( test_info.k[i] );
|
||||
}
|
||||
}
|
||||
if( test_info.tinfo )
|
||||
{
|
||||
for( i = 0; i < test_info.threadCount; i++ )
|
||||
{
|
||||
clReleaseMemObject(test_info.tinfo[i].inBuf);
|
||||
for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
|
||||
clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
|
||||
clReleaseCommandQueue(test_info.tinfo[i].tQueue);
|
||||
}
|
||||
|
||||
free( test_info.tinfo );
|
||||
}
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
 * Thread-pool worker that tests one chunk of the 32-bit input space.
 *
 *   job_id     selects the chunk: the first input seed is job_id * job->step.
 *   thread_id  selects this worker's ThreadInfo (queue, sub-buffers), its
 *              kernels, and its slice of the shared gIn / gOut_Ref arrays.
 *   data       the TestInfo describing the function under test.
 *
 * For every enabled vector size the output sub-buffer is filled with a garbage
 * pattern, the kernel is run on this thread's input slice, and the results are
 * compared against dfunc.i_f() computed on the host.  The scalar kernel must
 * match the reference value; vector kernels must match its negation
 * (presumably because OpenCL vector relational built-ins return -1 for true
 * -- confirm against the function list).
 *
 * Returns CL_SUCCESS on success, the failing OpenCL error code if an API call
 * fails, or -1 if a result mismatch is found.
 */
static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
{
    const TestInfo *job = (const TestInfo *) data;
    size_t      buffer_elements = job->subBufferSize;
    size_t      buffer_size = buffer_elements * sizeof( cl_double );
    cl_uint     scale = job->scale;
    cl_uint     base = job_id * (cl_uint) job->step;
    ThreadInfo  *tinfo = job->tinfo + thread_id;
    dptr        dfunc = job->f->dfunc;
    cl_uint     j, k;
    cl_int      error;
    int         ftz = job->ftz;
    const char  *name = job->f->name;

    Force64BitFPUPrecision();

    // start the map of the output arrays
    cl_event e[ VECTOR_SIZE_COUNT ];
    cl_long  *out[ VECTOR_SIZE_COUNT ];
    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
    {
        out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
        if( error || NULL == out[j])
        {
            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
            return error;
        }
    }

    // Get that moving
    if( (error = clFlush(tinfo->tQueue) ))
        vlog( "clFlush failed\n" );

    // Write the new values to this thread's slice of the input array
    cl_double *p = (cl_double*) gIn + thread_id * buffer_elements;
    for( j = 0; j < buffer_elements; j++ )
        p[j] = DoubleFromUInt32( base + j * scale);

    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
    {
        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
        return error;
    }

    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
    {
        //Wait for the map to finish
        if( (error = clWaitForEvents(1, e + j) ))
        {
            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
            return error;
        }
        if( (error = clReleaseEvent( e[j] ) ))
        {
            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
            return error;
        }

        // Fill the result buffer with garbage, so that old results don't carry over
        uint32_t pattern = 0xffffdead;
        memset_pattern4(out[j], &pattern, buffer_size);
        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
        {
            // Name the call that actually failed (this used to say clEnqueueMapBuffer).
            vlog_error( "Error: clEnqueueUnmapMemObject failed! err: %d\n", error );
            return error;
        }

        // run the kernel
        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id];  //each worker thread has its own copy of the cl_kernel
        cl_program program = job->programs[j];

        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }

        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
        {
            vlog_error( "FAILED -- could not execute kernel\n" );
            return error;
        }
    }

    // Get that moving
    if( (error = clFlush(tinfo->tQueue) ))
        vlog( "clFlush 2 failed\n" );

    if( gSkipCorrectnessTesting )
        return CL_SUCCESS;

    //Calculate the correctly rounded reference result
    cl_long *r = (cl_long *)gOut_Ref + thread_id * buffer_elements;
    cl_double *s = (cl_double *)p;
    for( j = 0; j < buffer_elements; j++ )
        r[j] = dfunc.i_f( s[j] );

    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
    {
        out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
        if( error || NULL == out[j] )
        {
            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
            return error;
        }
    }
    // Wait for the last buffer (j still indexes the last enabled vector size)
    out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
    if( error || NULL == out[j] )
    {
        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
        return error;
    }

    //Verify data
    cl_long *t = (cl_long *)r;
    for( j = 0; j < buffer_elements; j++ )
    {
        cl_long *q = out[0];

        // Scalar result: must equal the reference value
        if( gMinVectorSizeIndex == 0 && t[j] != q[j])
        {
            // In flush-to-zero mode a subnormal input may be treated as +0 or -0
            if( ftz )
            {
                if( IsDoubleSubnormal( s[j]) )
                {
                    cl_long correct = dfunc.i_f( +0.0f );
                    cl_long correct2 = dfunc.i_f( -0.0f );
                    // NOTE(review): this continue belongs to the outer element loop, so
                    // the vector sizes are not checked for this element -- confirm intent.
                    if( correct == q[j] || correct2 == q[j] )
                        continue;
                }
            }

            cl_ulong err = t[j] - q[j];
            if( q[j] > t[j] )
                err = q[j] - t[j];
            // Report the input from this thread's slice (s), not from the start of gIn,
            // and print the 64-bit values with matching length modifiers.
            vlog_error( "\nERROR: %sD: %llu ulp error at %.13la: *%lld vs. %lld\n", name, (unsigned long long) err, s[j], (long long) t[j], (long long) q[j] );
            return -1;
        }

        // Vector results: must equal the negated reference value
        for( k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++ )
        {
            q = out[k];
            if( -t[j] != q[j] )
            {
                if( ftz )
                {
                    if( IsDoubleSubnormal( s[j]))
                    {
                        int64_t correct = -dfunc.i_f( +0.0f );
                        int64_t correct2 = -dfunc.i_f( -0.0f );
                        if( correct == q[j] || correct2 == q[j] )
                            continue;
                    }
                }

                cl_ulong err = -t[j] - q[j];
                if( q[j] > -t[j] )
                    err = q[j] + t[j];
                vlog_error( "\nERROR: %sD%s: %llu ulp error at %.13la: *%lld vs. %lld\n", name, sizeNames[k], (unsigned long long) err, s[j], (long long) -t[j], (long long) q[j] );
                return -1;
            }
        }
    }

    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
    {
        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
        {
            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
            return error;
        }
    }

    if( (error = clFlush(tinfo->tQueue) ))
        vlog( "clFlush 3 failed\n" );

    // Progress output once every 2^28 input seeds
    if( 0 == ( base & 0x0fffffff) )
    {
        if (gVerboseBruteForce)
        {
            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->threadCount);
        } else
        {
            vlog("." );
        }
        fflush(stdout);
    }

    return CL_SUCCESS;
}
|
||||
|
||||
|
||||
|
||||
|
||||
1131
test_conformance/math_brute_force/mad.c
Normal file
1131
test_conformance/math_brute_force/mad.c
Normal file
File diff suppressed because it is too large
Load Diff
1811
test_conformance/math_brute_force/main.c
Normal file
1811
test_conformance/math_brute_force/main.c
Normal file
File diff suppressed because it is too large
Load Diff
5499
test_conformance/math_brute_force/reference_math.c
Normal file
5499
test_conformance/math_brute_force/reference_math.c
Normal file
File diff suppressed because it is too large
Load Diff
232
test_conformance/math_brute_force/reference_math.h
Normal file
232
test_conformance/math_brute_force/reference_math.h
Normal file
@@ -0,0 +1,232 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#ifndef REFERENCE_MATH_H
#define REFERENCE_MATH_H

// Prototypes of the host-side reference implementations used by the
// math_brute_force tests.  cl_uint / cl_ulong come from the OpenCL headers.
#if defined( __APPLE__ )
    #include <OpenCL/opencl.h>
#else
    #include <CL/cl.h>
#endif

// -- for testing float --
// (most of these take and return double; a few are float-typed)
double reference_sinh( double x );
double reference_sqrt( double x );
double reference_tanh( double x );
double reference_acos( double );
double reference_asin( double );
double reference_atan( double );
double reference_atan2( double, double );
double reference_ceil( double );
double reference_cosh( double );
double reference_exp( double );
double reference_fabs( double );
double reference_acospi( double );
double reference_asinpi( double );
double reference_atanpi( double );
double reference_atan2pi( double, double );
double reference_cospi( double );
double reference_divide( double, double );
double reference_fract( double, double * );
float  reference_fma( float, float, float, int );
double reference_mad( double, double, double );
double reference_nextafter(double, double );
double reference_recip( double );
double reference_rootn( double, int );
double reference_rsqrt( double );
double reference_sincos( double, double * );
double reference_sinpi( double );
double reference_tanpi( double );
double reference_pow(double x, double y);
double reference_pown( double, int );
double reference_powr( double, double );
double reference_cos( double );
double reference_sin( double );
double reference_tan( double );
double reference_log( double );
double reference_log10( double );
double reference_modf( double, double *n );

// reference_divide is declared once, above; a second identical prototype was dropped here.
double reference_fdim( double, double );
double reference_add( double, double );
double reference_subtract( double, double );
double reference_multiply( double, double );
double reference_remquo( double, double, int* );
double reference_lgamma_r( double, int* );

int reference_isequal( double, double );
int reference_isfinite( double );
int reference_isgreater( double, double );
int reference_isgreaterequal( double, double );
int reference_isinf( double );
int reference_isless( double, double );
int reference_islessequal( double, double );
int reference_islessgreater( double, double );
int reference_isnan( double );
int reference_isnormal( double );
int reference_isnotequal( double, double );
int reference_isordered( double, double );
int reference_isunordered( double, double );
int reference_signbit( float );

double reference_acosh( double x );
double reference_asinh( double x );
double reference_atanh( double x );
double reference_cbrt(double x);
float  reference_copysign( float x, float y);
double reference_copysignd( double x, double y);
double reference_exp10( double );
double reference_exp2( double x );
double reference_expm1( double x );
double reference_fmax( double x, double y );
double reference_fmin( double x, double y );
double reference_hypot( double x, double y );
double reference_lgamma( double x);
int    reference_ilogb( double );
double reference_log2( double x );
double reference_log1p( double x );
double reference_logb( double x );
double reference_maxmag( double x, double y );
double reference_minmag( double x, double y );
double reference_nan( cl_uint x );
double reference_reciprocal( double x );
double reference_remainder( double x, double y );
double reference_rint( double x );
double reference_round( double x );
double reference_trunc( double x );
double reference_floor( double x );
double reference_fmod( double x, double y );
double reference_frexp( double x, int *n );
double reference_ldexp( double x, int n );

double reference_assignment( double x );
int    reference_not( double x );

// -- for testing fast-relaxed --
double reference_relaxed_mad( double, double, double );
double reference_relaxed_divide( double x, double y );
double reference_relaxed_sin( double x );
double reference_relaxed_cos( double x );
double reference_relaxed_sincos( double x, double * y);
double reference_relaxed_tan( double x );
double reference_relaxed_exp( double x );
double reference_relaxed_exp2( double x );
double reference_relaxed_exp10( double x );
double reference_relaxed_log( double x );
double reference_relaxed_log2( double x );
double reference_relaxed_pow( double x, double y);
double reference_relaxed_reciprocal( double x );

// -- for testing double --
// (long double counterparts of the functions above, suffixed with 'l')
long double reference_sinhl( long double x );
long double reference_sqrtl( long double x );
long double reference_tanhl( long double x );
long double reference_acosl( long double );
long double reference_asinl( long double );
long double reference_atanl( long double );
long double reference_atan2l( long double, long double );
long double reference_ceill( long double );
long double reference_coshl( long double );
long double reference_expl( long double );
long double reference_fabsl( long double );
long double reference_acospil( long double );
long double reference_asinpil( long double );
long double reference_atanpil( long double );
long double reference_atan2pil( long double, long double );
long double reference_cospil( long double );
long double reference_dividel( long double, long double );
long double reference_fractl( long double, long double * );
long double reference_fmal( long double, long double, long double );
long double reference_madl( long double, long double, long double );
long double reference_nextafterl(long double, long double );
long double reference_recipl( long double );
long double reference_rootnl( long double, int );
long double reference_rsqrtl( long double );
long double reference_sincosl( long double, long double * );
long double reference_sinpil( long double );
long double reference_tanpil( long double );
long double reference_powl(long double x, long double y);
long double reference_pownl( long double, int );
long double reference_powrl( long double, long double );
long double reference_cosl( long double );
long double reference_sinl(long double );
long double reference_tanl( long double );
long double reference_logl( long double );
long double reference_log10l( long double );
long double reference_modfl( long double, long double *n );

// reference_dividel is declared once, above; a second identical prototype was dropped here.
long double reference_fdiml( long double, long double );
long double reference_addl( long double, long double );
long double reference_subtractl( long double, long double );
long double reference_multiplyl( long double, long double );
long double reference_remquol( long double, long double, int* );
long double reference_lgamma_rl( long double, int* );

int reference_isequall( long double, long double );
int reference_isfinitel( long double );
int reference_isgreaterl( long double, long double );
int reference_isgreaterequall( long double, long double );
int reference_isinfl( long double );
int reference_islessl( long double, long double );
int reference_islessequall( long double, long double );
int reference_islessgreaterl( long double, long double );
int reference_isnanl( long double );
int reference_isnormall( long double );
int reference_isnotequall( long double, long double );
int reference_isorderedl( long double, long double );
int reference_isunorderedl( long double, long double );
int reference_signbitl( long double );

long double reference_acoshl( long double x );
long double reference_asinhl( long double x );
long double reference_atanhl( long double x );
long double reference_cbrtl(long double x);
long double reference_copysignl( long double x, long double y);
long double reference_exp10l( long double );
long double reference_exp2l( long double x );
long double reference_expm1l( long double x );
long double reference_fmaxl( long double x, long double y );
long double reference_fminl( long double x, long double y );
long double reference_hypotl( long double x, long double y );
long double reference_lgammal( long double x);
int         reference_ilogbl( long double );
long double reference_log2l( long double x );
long double reference_log1pl( long double x );
long double reference_logbl( long double x );
long double reference_maxmagl( long double x, long double y );
long double reference_minmagl( long double x, long double y );
long double reference_nanl( cl_ulong x );
long double reference_reciprocall( long double x );
long double reference_remainderl( long double x, long double y );
long double reference_rintl( long double x );
long double reference_roundl( long double x );
long double reference_truncl( long double x );
long double reference_floorl( long double x );
long double reference_fmodl( long double x, long double y );
long double reference_frexpl( long double x, int *n );
long double reference_ldexpl( long double x, int n );

long double reference_assignmentl( long double x );
int         reference_notl( long double x );

#endif
|
||||
|
||||
|
||||
@@ -0,0 +1,110 @@
|
||||
#! /usr/bin/python
|
||||
|
||||
# // OpenCL Conformance Tests
|
||||
# //
|
||||
# // Copyright: (c) 2009-2013 by Apple Inc. All Rights Reserved.
|
||||
# //
|
||||
|
||||
import os, re, sys, subprocess, time
|
||||
|
||||
# A script to run the entirety of math_brute_force, with each separate job run in parallel.
|
||||
|
||||
def DEBUG(text, level=1):
    """Print `text` when the module-level DEBUG_LEVEL is at least `level`."""
    if level > DEBUG_LEVEL:
        return
    print(text)
|
||||
|
||||
def write_info(text):
|
||||
print text,
|
||||
if (ATF):
|
||||
ATF_log.write("<Info>"+text+"</Info>\n")
|
||||
ATF_log.flush()
|
||||
|
||||
def write_error(text):
|
||||
print "ERROR:" + text,
|
||||
if (ATF):
|
||||
ATF_log.write("<Error>"+text+"</Error>\n")
|
||||
ATF_log.flush()
|
||||
|
||||
def start_atf():
    """Open the ATF XML log if the ATF_RESULTSDIRECTORY environment variable is set.

    Sets the module globals ATF (enabled flag) and ATF_log (open file object).
    ATF is left False when the variable is unset or the log cannot be opened.
    """
    global ATF, ATF_log
    DEBUG("start_atf()")
    output_path = os.environ.get("ATF_RESULTSDIRECTORY")
    if output_path is None:
        ATF = False
        DEBUG("\tATF not defined",0)
        return
    ATF = True
    ATF_output_file_name = "TestLog.xml"
    try:
        # os.path.join inserts the separator when the directory has no trailing one
        ATF_log = open(os.path.join(output_path, ATF_output_file_name), "w")
    except IOError:
        DEBUG("Could not open ATF file " + ATF_output_file_name, 0)
        ATF = False
        return
    DEBUG("ATF Enabled")
    # Generate the XML header
    ATF_log.write("<Log>\n")
    ATF_log.write("<TestStart/>\n")
    DEBUG("Done start_atf()")
|
||||
|
||||
def stop_atf():
    """Write the closing XML tags to the ATF log and close it; no-op when ATF logging is off."""
    DEBUG("stop_atf()")
    if (ATF):
        # ATF is only the enabled flag (a bool); the file object is ATF_log.
        ATF_log.write("<TestFinish/>\n")
        ATF_log.write("</Log>\n")
        ATF_log.close()
|
||||
|
||||
def get_time():
    """Return the current local time formatted as '<weekday name> HH:MM:SS'."""
    now = time.localtime()
    stamp_format = "%A %H:%M:%S"
    return time.strftime(stamp_format, now)
|
||||
|
||||
def start_test(id):
    """Launch test number `id` as a shell subprocess and register it in running_tests.

    The command is the module-level `test` string followed by `id` twice.
    If Popen raises OSError the failure is reported through write_error and
    nothing is registered.
    """
    id_text = str(id)
    DEBUG("start_test(" + id_text + ")")
    command = " ".join([test, id_text, id_text])
    try:
        write_info(get_time() + " Executing " + command + "...")
        process = subprocess.Popen(command,
                                   stderr=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   shell=True)
    except OSError:
        write_error("Failed to execute " + command)
        return
    running_tests[id] = process
    running_count = len(running_tests.keys())
    DEBUG("start_test(" + id_text + ") added: " + str(running_tests[id]) +
          ", now " + str(running_count) + " tests running")
|
||||
|
||||
|
||||
|
||||
|
||||
# ---- Configuration and shared state ----
DEBUG_LEVEL = 2              # DEBUG() prints messages whose level is <= this
test = "./bruteforce -w"     # command line prefix; the test id is appended twice
instances = 4                # maximum number of tests running at once
max_test_ID = 12             # ids 0..max_test_ID are run
running_tests = {}           # test id -> subprocess.Popen
ATF_log = None
ATF = False

# Start the ATF log
start_atf()
next_test = 0
next_test_to_finish = 0

# Keep going until every test has been started and every test has been reaped.
while (next_test <= max_test_ID) or (next_test_to_finish <= max_test_ID):
    # If we want to run more tests, start them
    while (len(running_tests) < instances) and (next_test <= max_test_ID):
        start_test(next_test)
        next_test = next_test + 1
        time.sleep(1)
    # Check if the oldest test has finished
    p = running_tests.get(next_test_to_finish)
    if p is None:
        # start_test() could not launch this id (it already reported the error),
        # so there is nothing to wait for; move on instead of raising KeyError.
        next_test_to_finish = next_test_to_finish + 1
        continue
    # NOTE(review): the pipes are only read after the child exits; a child that
    # fills its stdout/stderr pipe would presumably block forever -- confirm.
    if p.poll() is not None:
        write_info(get_time() + " Test " + str(next_test_to_finish) +" finished.")
        del running_tests[next_test_to_finish]
        next_test_to_finish = next_test_to_finish + 1
        # Write the results from the test out
        for line in p.stdout.readlines():
            write_info(line)
        for line in p.stderr.readlines():
            write_error(line)

    time.sleep(1)


# Stop the ATF log
stop_atf()
|
||||
1361
test_conformance/math_brute_force/ternary.c
Normal file
1361
test_conformance/math_brute_force/ternary.c
Normal file
File diff suppressed because it is too large
Load Diff
1191
test_conformance/math_brute_force/unary.c
Normal file
1191
test_conformance/math_brute_force/unary.c
Normal file
File diff suppressed because it is too large
Load Diff
993
test_conformance/math_brute_force/unary_two_results.c
Normal file
993
test_conformance/math_brute_force/unary_two_results.c
Normal file
@@ -0,0 +1,993 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#include "Utility.h"
|
||||
|
||||
#include <string.h>
|
||||
#include "FunctionList.h"
|
||||
|
||||
int TestFunc_Float2_Float(const Func *f, MTdata);
|
||||
int TestFunc_Double2_Double(const Func *f, MTdata);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C"
|
||||
#endif
|
||||
const vtbl _unary_two_results = { "unary_two_results", TestFunc_Float2_Float, TestFunc_Double2_Double };
|
||||
|
||||
static int BuildKernel( const char *name, int vectorSize, cl_kernel *k, cl_program *p );
|
||||
static int BuildKernelDouble( const char *name, int vectorSize, cl_kernel *k, cl_program *p );
|
||||
// Assemble the float test kernel for vector-size index `vectorSize` and build it.
// `name` is the OpenCL function under test; it is called as name( in, &out2 ).
// The built kernel and program are returned through `k` and `p`; the return
// value is whatever MakeKernel() returns.
static int BuildKernel( const char *name, int vectorSize, cl_kernel *k, cl_program *p )
{
    const char *sizeName = sizeNames[vectorSize];

    // Source for every vector size except 3: one work-item per vector element.
    const char *genericSource[] = {
        "__kernel void math_kernel", sizeName,
        "( __global float", sizeName,
        "* out, __global float", sizeName,
        "* out2, __global float", sizeName,
        "* in)\n"
        "{\n"
        " int i = get_global_id(0);\n"
        " out[i] = ", name, "( in[i], out2 + i );\n"
        "}\n"
    };

    // Source for 3-element vectors: uses vload3/vstore3 on scalar pointers and
    // handles the leftover elements in the last work-item separately.
    const char *vec3Source[] = {
        "__kernel void math_kernel", sizeName,
        "( __global float* out, __global float* out2, __global float* in)\n"
        "{\n"
        " size_t i = get_global_id(0);\n"
        " if( i + 1 < get_global_size(0) )\n"
        " {\n"
        " float3 f0 = vload3( 0, in + 3 * i );\n"
        " float3 iout = NAN;\n"
        " f0 = ", name, "( f0, &iout );\n"
        " vstore3( f0, 0, out + 3*i );\n"
        " vstore3( iout, 0, out2 + 3*i );\n"
        " }\n"
        " else\n"
        " {\n"
        " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
        " float3 iout = NAN;\n"
        " float3 f0;\n"
        " switch( parity )\n"
        " {\n"
        " case 1:\n"
        " f0 = (float3)( in[3*i], NAN, NAN ); \n"
        " break;\n"
        " case 0:\n"
        " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
        " break;\n"
        " }\n"
        " f0 = ", name, "( f0, &iout );\n"
        " switch( parity )\n"
        " {\n"
        " case 0:\n"
        " out[3*i+1] = f0.y; \n"
        " out2[3*i+1] = iout.y; \n"
        " // fall through\n"
        " case 1:\n"
        " out[3*i] = f0.x; \n"
        " out2[3*i] = iout.x; \n"
        " break;\n"
        " }\n"
        " }\n"
        "}\n"
    };

    const char **source;
    size_t sourceCount;
    if( sizeValues[vectorSize] == 3 )
    {
        source = vec3Source;
        sourceCount = sizeof(vec3Source) / sizeof(vec3Source[0]);
    }
    else
    {
        source = genericSource;
        sourceCount = sizeof(genericSource) / sizeof(genericSource[0]);
    }

    // Kernel entry point name, e.g. "math_kernel" followed by the size suffix
    char testName[32];
    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeName );

    return MakeKernel(source, (cl_uint) sourceCount, testName, k, p);
}
|
||||
|
||||
// Assemble the double test kernel for vector-size index `vectorSize` and build it.
// Same layout as the float builder, but the source enables cl_khr_fp64 and uses
// double types.  `name` is the OpenCL function under test; the built kernel and
// program are returned through `k` and `p`; the return value is whatever
// MakeKernel() returns.
static int BuildKernelDouble( const char *name, int vectorSize, cl_kernel *k, cl_program *p )
{
    const char *sizeName = sizeNames[vectorSize];

    // Source for every vector size except 3: one work-item per vector element.
    const char *genericSource[] = {
        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
        "__kernel void math_kernel", sizeName,
        "( __global double", sizeName,
        "* out, __global double", sizeName,
        "* out2, __global double", sizeName,
        "* in)\n"
        "{\n"
        " int i = get_global_id(0);\n"
        " out[i] = ", name, "( in[i], out2 + i );\n"
        "}\n"
    };

    // Source for 3-element vectors: uses vload3/vstore3 on scalar pointers and
    // handles the leftover elements in the last work-item separately.
    const char *vec3Source[] = {
        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
        "__kernel void math_kernel", sizeName,
        "( __global double* out, __global double* out2, __global double* in)\n"
        "{\n"
        " size_t i = get_global_id(0);\n"
        " if( i + 1 < get_global_size(0) )\n"
        " {\n"
        " double3 f0 = vload3( 0, in + 3 * i );\n"
        " double3 iout = NAN;\n"
        " f0 = ", name, "( f0, &iout );\n"
        " vstore3( f0, 0, out + 3*i );\n"
        " vstore3( iout, 0, out2 + 3*i );\n"
        " }\n"
        " else\n"
        " {\n"
        " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
        " double3 iout = NAN;\n"
        " double3 f0;\n"
        " switch( parity )\n"
        " {\n"
        " case 1:\n"
        " f0 = (double3)( in[3*i], NAN, NAN ); \n"
        " break;\n"
        " case 0:\n"
        " f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
        " break;\n"
        " }\n"
        " f0 = ", name, "( f0, &iout );\n"
        " switch( parity )\n"
        " {\n"
        " case 0:\n"
        " out[3*i+1] = f0.y; \n"
        " out2[3*i+1] = iout.y; \n"
        " // fall through\n"
        " case 1:\n"
        " out[3*i] = f0.x; \n"
        " out2[3*i] = iout.x; \n"
        " break;\n"
        " }\n"
        " }\n"
        "}\n"
    };

    const char **source;
    size_t sourceCount;
    if( sizeValues[vectorSize] == 3 )
    {
        source = vec3Source;
        sourceCount = sizeof(vec3Source) / sizeof(vec3Source[0]);
    }
    else
    {
        source = genericSource;
        sourceCount = sizeof(genericSource) / sizeof(genericSource[0]);
    }

    // Kernel entry point name, e.g. "math_kernel" followed by the size suffix
    char testName[32];
    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeName );

    return MakeKernel(source, (cl_uint) sourceCount, testName, k, p);
}
|
||||
|
||||
typedef struct BuildKernelInfo
|
||||
{
|
||||
cl_uint offset; // the first vector size to build
|
||||
cl_kernel *kernels;
|
||||
cl_program *programs;
|
||||
const char *nameInCode;
|
||||
}BuildKernelInfo;
|
||||
|
||||
// Thread-pool job: builds the single-precision kernel for vector size index (offset + job_id).
// p points at the BuildKernelInfo describing where the kernel and program are stored.
// Returns whatever BuildKernel returns.
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
{
    BuildKernelInfo *buildInfo = (BuildKernelInfo*) p;
    cl_uint sizeIndex = buildInfo->offset + job_id;
    cl_kernel *kernelSlot = buildInfo->kernels + sizeIndex;
    cl_program *programSlot = buildInfo->programs + sizeIndex;

    return BuildKernel( buildInfo->nameInCode, sizeIndex, kernelSlot, programSlot );
}
|
||||
|
||||
// Thread-pool job: builds the double-precision kernel for vector size index (offset + job_id).
// p points at the BuildKernelInfo describing where the kernel and program are stored.
// Returns whatever BuildKernelDouble returns.
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
{
    BuildKernelInfo *buildInfo = (BuildKernelInfo*) p;
    cl_uint sizeIndex = buildInfo->offset + job_id;
    cl_kernel *kernelSlot = buildInfo->kernels + sizeIndex;
    cl_program *programSlot = buildInfo->programs + sizeIndex;

    return BuildKernelDouble( buildInfo->nameInCode, sizeIndex, kernelSlot, programSlot );
}
|
||||
|
||||
// Brute-force test of a single-precision function that produces two float results
// (invoked as name( in, &out2 ) by the generated kernels).
//
// The 32-bit input bit patterns are visited in chunks: each chunk is written to gInBuffer,
// every built vector-size kernel is run on it, host reference values are computed with
// f->func.f_fpf (f->rfunc.f_fpf when gTestFastRelaxed is set) and both outputs are compared
// against the allowed error, with the flush-to-zero retries applied when ftz is in effect.
// When gMeasureTimes is set the kernels are additionally timed on random input.
//
// f  - description of the function under test
// d  - random number generator state, used only for the timing input
//
// Returns the last error value: 0 when everything succeeded, the OpenCL error code of a
// failing API call, or -1 when a result was outside the tolerance.
int TestFunc_Float2_Float(const Func *f, MTdata d)
{
    uint64_t i;
    uint32_t j, k;
    int error;
    cl_program programs[ VECTOR_SIZE_COUNT ];
    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
    float maxError0 = 0.0f;
    float maxError1 = 0.0f;
    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
    float maxErrorVal0 = 0.0f;
    float maxErrorVal1 = 0.0f;
    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
    uint64_t step = bufferSize / sizeof( float );
    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( float )) + 1);
    cl_uchar overflow[BUFFER_SIZE / sizeof( float )];
    int isFract = 0 == strcmp( "fract", f->nameInCode );
    int skipNanInf = isFract && ! gInfNanSupport;
    float float_ulps;

    // Host rounding mode bookkeeping. Kept at function scope so that the exit path can
    // restore the mode when a failure leaves the loop while round-toward-zero is active.
    RoundingMode oldRoundMode = kRoundToNearestEven;
    int roundModeChanged = 0;

    logFunctionInfo(f->name,sizeof(cl_float),gTestFastRelaxed);
    if( gWimpyMode )
    {
        step = (1ULL<<32) * gWimpyReductionFactor / (512);
    }

    if( gIsEmbedded )
        float_ulps = f->float_embedded_ulps;
    else
        float_ulps = f->float_ulps;

    if (gTestFastRelaxed)
        float_ulps = f->relaxed_error;

    // Init the kernels
    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode };
    if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
        return error;

    for( i = 0; i < (1ULL<<32); i += step )
    {
        //Init input array
        uint32_t *p = (uint32_t *)gIn;
        if( gWimpyMode )
        {
            for( j = 0; j < bufferSize / sizeof( float ); j++ )
            {
                p[j] = (uint32_t) i + j * scale;
                if ( gTestFastRelaxed && strcmp(f->name,"sincos") == 0 )
                {
                    float pj = *(float *)&p[j];
                    // Store NaN as a float value. Assigning NAN to the uint32_t element
                    // would convert it to an integer instead of producing a NaN input.
                    if(fabs(pj) > M_PI)
                        ((float *)p)[j] = NAN;
                }
            }
        }
        else
        {
            for( j = 0; j < bufferSize / sizeof( float ); j++ )
            {
                p[j] = (uint32_t) i + j;
                if ( gTestFastRelaxed && strcmp(f->name,"sincos") == 0 )
                {
                    float pj = *(float *)&p[j];
                    // See above: the NaN must be written through a float lvalue.
                    if(fabs(pj) > M_PI)
                        ((float *)p)[j] = NAN;
                }
            }
        }
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            goto exit;  // kernels and programs are already built and must be released
        }

        // write garbage into output arrays
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            uint32_t pattern = 0xffffdead;
            memset_pattern4(gOut[j], &pattern, bufferSize);
            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
            {
                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
                goto exit;
            }

            memset_pattern4(gOut2[j], &pattern, bufferSize);
            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL)))
            {
                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
                goto exit;
            }
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg(kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
            {
                vlog_error( "FAILED -- could not execute kernel\n" );
                goto exit;
            }
        }

        // Get that moving
        if( (error = clFlush(gQueue) ))
            vlog( "clFlush failed\n" );

        FPU_mode_type oldMode;
        if( isFract )
        {
            //Calculate the correctly rounded reference result
            memset( &oldMode, 0, sizeof( oldMode ) );
            if( ftz )
                ForceFTZ( &oldMode );

            // Set the rounding mode to match the device
            if (gIsInRTZMode)
            {
                oldRoundMode = set_round(kRoundTowardZero, kfloat);
                roundModeChanged = 1;
            }
        }

        //Calculate the correctly rounded reference result
        float *r = (float *)gOut_Ref;
        float *r2 = (float *)gOut_Ref2;
        float *s = (float *)gIn;

        if( skipNanInf )
        {
            for( j = 0; j < bufferSize / sizeof( float ); j++ )
            {
                double dd;
                feclearexcept(FE_OVERFLOW);

                if( gTestFastRelaxed )
                    r[j] = (float) f->rfunc.f_fpf( s[j], &dd );
                else
                    r[j] = (float) f->func.f_fpf( s[j], &dd );

                r2[j] = (float) dd;
                // Remember per element whether the reference computation overflowed
                overflow[j] = FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
            }
        }
        else
        {
            for( j = 0; j < bufferSize / sizeof( float ); j++ )
            {
                double dd;
                if( gTestFastRelaxed )
                    r[j] = (float) f->rfunc.f_fpf( s[j], &dd );
                else
                    r[j] = (float) f->func.f_fpf( s[j], &dd );

                r2[j] = (float) dd;
            }
        }

        if( isFract && ftz )
            RestoreFPState( &oldMode );

        // Read the data back
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
            {
                vlog_error( "ReadArray failed %d\n", error );
                goto exit;
            }
            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
            {
                vlog_error( "ReadArray2 failed %d\n", error );
                goto exit;
            }
        }

        if( gSkipCorrectnessTesting )
        {
            if( roundModeChanged )
            {
                (void)set_round(oldRoundMode, kfloat);
                roundModeChanged = 0;
            }
            break;
        }

        //Verify data
        uint32_t *t = (uint32_t *)gOut_Ref;
        uint32_t *t2 = (uint32_t *)gOut_Ref2;
        for( j = 0; j < bufferSize / sizeof( float ); j++ )
        {
            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
            {
                uint32_t *q = (uint32_t *)gOut[k];
                uint32_t *q2 = (uint32_t *)gOut2[k];

                // If we aren't getting the correctly rounded result
                if( t[j] != q[j] || t2[j] != q2[j] )
                {
                    double correct, correct2;
                    float err, err2;
                    float test = ((float*) q)[j];
                    float test2 = ((float*) q2)[j];

                    if( gTestFastRelaxed )
                        correct = f->rfunc.f_fpf( s[j], &correct2 );
                    else
                        correct = f->func.f_fpf( s[j], &correct2 );

                    // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
                    if (gTestFastRelaxed || skipNanInf)
                    {
                        if (skipNanInf && overflow[j])
                            continue;

                        // Note: no double rounding here.  Reference functions calculate in single precision.
                        if( IsFloatInfinity(correct) || IsFloatNaN(correct) ||
                            IsFloatInfinity(correct2)|| IsFloatNaN(correct2) ||
                            IsFloatInfinity(s[j]) || IsFloatNaN(s[j]) )
                            continue;
                    }

                    typedef int (*CheckForSubnormal) (double,float); // If we are in fast relaxed math, we have a different calculation for the subnormal threshold.
                    CheckForSubnormal isFloatResultSubnormalPtr;
                    if( gTestFastRelaxed )
                    {
                        err = Abs_Error( test, correct);
                        err2 = Abs_Error( test2, correct2);
                        isFloatResultSubnormalPtr = &IsFloatResultSubnormalAbsError;
                    }
                    else
                    {
                        err = Ulp_Error( test, correct );
                        err2 = Ulp_Error( test2, correct2 );
                        isFloatResultSubnormalPtr = &IsFloatResultSubnormal;
                    }
                    int fail = ! (fabsf(err) <= float_ulps && fabsf(err2) <= float_ulps);

                    if( ftz )
                    {
                        // retry per section 6.5.3.2
                        if( (*isFloatResultSubnormalPtr)(correct, float_ulps) )
                        {
                            if( (*isFloatResultSubnormalPtr) (correct2, float_ulps ))
                            {
                                fail = fail && ! ( test == 0.0f && test2 == 0.0f );
                                if( ! fail )
                                {
                                    err = 0.0f;
                                    err2 = 0.0f;
                                }
                            }
                            else
                            {
                                fail = fail && ! ( test == 0.0f && fabsf(err2) <= float_ulps);
                                if( ! fail )
                                    err = 0.0f;
                            }
                        }
                        else if( (*isFloatResultSubnormalPtr)(correct2, float_ulps ) )
                        {
                            fail = fail && ! ( test2 == 0.0f && fabsf(err) <= float_ulps);
                            if( ! fail )
                                err2 = 0.0f;
                        }

                        // retry per section 6.5.3.3
                        if( IsFloatSubnormal( s[j] ) )
                        {
                            double correctp, correctn;
                            double correct2p, correct2n;
                            float errp, err2p, errn, err2n;

                            if( skipNanInf )
                                feclearexcept(FE_OVERFLOW);
                            if ( gTestFastRelaxed )
                            {
                                correctp = f->rfunc.f_fpf( 0.0, &correct2p );
                                correctn = f->rfunc.f_fpf( -0.0, &correct2n );
                            }
                            else
                            {
                                correctp = f->func.f_fpf( 0.0, &correct2p );
                                correctn = f->func.f_fpf( -0.0, &correct2n );
                            }

                            // Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
                            if( skipNanInf )
                            {
                                if( fetestexcept(FE_OVERFLOW) )
                                    continue;

                                // Note: no double rounding here.  Reference functions calculate in single precision.
                                if( IsFloatInfinity(correctp) || IsFloatNaN(correctp) ||
                                    IsFloatInfinity(correctn) || IsFloatNaN(correctn) ||
                                    IsFloatInfinity(correct2p) || IsFloatNaN(correct2p) ||
                                    IsFloatInfinity(correct2n) || IsFloatNaN(correct2n) )
                                    continue;
                            }

                            // The second-result errors are measured against the second device
                            // output (test2), matching how err2 is computed above.
                            if ( gTestFastRelaxed )
                            {
                                errp = Abs_Error( test, correctp );
                                err2p = Abs_Error( test2, correct2p );
                                errn = Abs_Error( test, correctn );
                                err2n = Abs_Error( test2, correct2n );
                            }
                            else
                            {
                                errp = Ulp_Error( test, correctp );
                                err2p = Ulp_Error( test2, correct2p );
                                errn = Ulp_Error( test, correctn );
                                err2n = Ulp_Error( test2, correct2n );
                            }

                            fail = fail && ((!(fabsf(errp) <= float_ulps)) && (!(fabsf(err2p) <= float_ulps)) &&
                                            ((!(fabsf(errn) <= float_ulps)) && (!(fabsf(err2n) <= float_ulps))) );
                            if( fabsf( errp ) < fabsf(err ) )
                                err = errp;
                            if( fabsf( errn ) < fabsf(err ) )
                                err = errn;
                            if( fabsf( err2p ) < fabsf(err2 ) )
                                err2 = err2p;
                            if( fabsf( err2n ) < fabsf(err2 ) )
                                err2 = err2n;

                            // retry per section 6.5.3.4
                            if( (*isFloatResultSubnormalPtr)( correctp, float_ulps ) || (*isFloatResultSubnormalPtr)( correctn, float_ulps ) )
                            {
                                if( (*isFloatResultSubnormalPtr)( correct2p, float_ulps ) || (*isFloatResultSubnormalPtr)( correct2n, float_ulps ) )
                                {
                                    fail = fail && !( test == 0.0f && test2 == 0.0f);
                                    if( ! fail )
                                        err = err2 = 0.0f;
                                }
                                else
                                {
                                    fail = fail && ! (test == 0.0f && fabsf(err2) <= float_ulps);
                                    if( ! fail )
                                        err = 0.0f;
                                }
                            }
                            else if( (*isFloatResultSubnormalPtr)( correct2p, float_ulps ) || (*isFloatResultSubnormalPtr)( correct2n, float_ulps ) )
                            {
                                fail = fail && ! (test2 == 0.0f && (fabsf(err) <= float_ulps));
                                if( ! fail )
                                    err2 = 0.0f;
                            }
                        }
                    }
                    if( fabsf(err ) > maxError0 )
                    {
                        maxError0 = fabsf(err);
                        maxErrorVal0 = s[j];
                    }
                    if( fabsf(err2 ) > maxError1 )
                    {
                        maxError1 = fabsf(err2);
                        maxErrorVal1 = s[j];
                    }
                    if( fail )
                    {
                        vlog_error( "\nERROR: %s%s: {%f, %f} ulp error at %a: *{%a, %a} vs. {%a, %a}\n", f->name, sizeNames[k], err, err2, ((float*) gIn)[j], ((float*) gOut_Ref)[j], ((float*) gOut_Ref2)[j], test, test2 );
                        error = -1;
                        goto exit;
                    }
                }
            }
        }

        if( roundModeChanged )
        {
            (void)set_round(oldRoundMode, kfloat);
            roundModeChanged = 0;
        }

        if( 0 == (i & 0x0fffffff) )
        {
            if (gVerboseBruteForce)
            {
                // i stays below 2^32 inside this loop; the casts make the arguments match the conversion specifiers
                vlog("base:%14u step:%10zu bufferSize:%10zu \n", (unsigned int) i, (size_t) step, bufferSize);
            } else
            {
                vlog(".");
            }
            fflush(stdout);
        }
    }

    if( ! gSkipCorrectnessTesting )
    {
        if( gWimpyMode )
            vlog( "Wimp pass" );
        else
            vlog( "passed" );
    }

    if( gMeasureTimes )
    {
        //Init input array
        uint32_t *p = (uint32_t *)gIn;
        for( j = 0; j < bufferSize / sizeof( float ); j++ )
            p[j] = genrand_int32(d);
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            goto exit;  // release the kernels and programs instead of leaking them
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j]) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            double sum = 0.0;
            double bestTime = INFINITY;
            for( k = 0; k < PERF_LOOP_COUNT; k++ )
            {
                uint64_t startTime = GetTime();
                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
                {
                    vlog_error( "FAILED -- could not execute kernel\n" );
                    goto exit;
                }

                // Make sure OpenCL is done
                if( (error = clFinish(gQueue) ) )
                {
                    vlog_error( "Error %d at clFinish\n", error );
                    goto exit;
                }

                uint64_t endTime = GetTime();
                double time = SubtractTime( endTime, startTime );
                sum += time;
                if( time < bestTime )
                    bestTime = time;
            }

            if( gReportAverageTimes )
                bestTime = sum / PERF_LOOP_COUNT;
            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
        }
    }

    if( ! gSkipCorrectnessTesting )
        vlog( "\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0, maxErrorVal1 );
    vlog( "\n" );

exit:
    // A failure may have left the loop while round-toward-zero was still active
    if( roundModeChanged )
        (void)set_round(oldRoundMode, kfloat);

    // Release
    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
    {
        clReleaseKernel(kernels[k]);
        clReleaseProgram(programs[k]);
    }

    return error;
}
|
||||
|
||||
// Brute-force test of a double-precision function that produces two double results
// (invoked as name( in, &out2 ) by the generated kernels).
//
// Inputs are derived from 32-bit patterns through DoubleFromUInt32 and visited in chunks:
// each chunk is written to gInBuffer, every built vector-size kernel is run on it, host
// reference values are computed with f->dfunc.f_fpf and both outputs are compared against
// f->double_ulps, with the flush-to-zero retries applied when ftz is in effect.
// When gMeasureTimes is set the kernels are additionally timed on random input.
//
// f  - description of the function under test
// d  - random number generator state, used only for the timing input
//
// Returns the last error value: 0 when everything succeeded, the OpenCL error code of a
// failing API call, or -1 when a result was outside the tolerance.
int TestFunc_Double2_Double(const Func *f, MTdata d)
{
    uint64_t i;
    uint32_t j, k;
    int error;
    cl_program programs[ VECTOR_SIZE_COUNT ];
    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
    float maxError0 = 0.0f;
    float maxError1 = 0.0f;
    int ftz = f->ftz || gForceFTZ;
    double maxErrorVal0 = 0.0f;
    double maxErrorVal1 = 0.0f;
    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
    uint64_t step = bufferSize / sizeof( cl_double );
    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( cl_double )) + 1);

    logFunctionInfo(f->name,sizeof(cl_double),gTestFastRelaxed);
    if( gWimpyMode )
    {
        step = (1ULL<<32) * gWimpyReductionFactor / (512);
    }

    Force64BitFPUPrecision();

    // Init the kernels
    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode };
    if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                &build_info ) ))
    {
        return error;
    }

    for( i = 0; i < (1ULL<<32); i += step )
    {
        //Init input array
        double *p = (double *)gIn;
        if( gWimpyMode )
        {
            for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
                p[j] = DoubleFromUInt32((uint32_t) i + j * scale);
        }
        else
        {
            for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
                p[j] = DoubleFromUInt32((uint32_t) i + j);
        }
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            goto exit;  // kernels and programs are already built and must be released
        }

        // write garbage into output arrays
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            uint32_t pattern = 0xffffdead;
            memset_pattern4(gOut[j], &pattern, bufferSize);
            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
            {
                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
                goto exit;
            }

            memset_pattern4(gOut2[j], &pattern, bufferSize);
            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL)))
            {
                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
                goto exit;
            }
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg(kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
            {
                vlog_error( "FAILED -- could not execute kernel\n" );
                goto exit;
            }
        }

        // Get that moving
        if( (error = clFlush(gQueue) ))
            vlog( "clFlush failed\n" );

        //Calculate the correctly rounded reference result
        double *r = (double *)gOut_Ref;
        double *r2 = (double *)gOut_Ref2;
        double *s = (double *)gIn;
        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
        {
            long double dd;
            r[j] = (double) f->dfunc.f_fpf( s[j], &dd );
            r2[j] = (double) dd;
        }

        // Read the data back
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
            {
                vlog_error( "ReadArray failed %d\n", error );
                goto exit;
            }
            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
            {
                vlog_error( "ReadArray2 failed %d\n", error );
                goto exit;
            }
        }

        if( gSkipCorrectnessTesting )
            break;

        //Verify data
        uint64_t *t = (uint64_t *)gOut_Ref;
        uint64_t *t2 = (uint64_t *)gOut_Ref2;
        for( j = 0; j < bufferSize / sizeof( double ); j++ )
        {
            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
            {
                uint64_t *q = (uint64_t *)(gOut[k]);
                uint64_t *q2 = (uint64_t *)(gOut2[k]);

                // If we aren't getting the correctly rounded result
                if( t[j] != q[j] || t2[j] != q2[j] )
                {
                    double test = ((double*) q)[j];
                    double test2 = ((double*) q2)[j];
                    long double correct2;
                    long double correct = f->dfunc.f_fpf( s[j], &correct2 );
                    float err = Ulp_Error_Double( test, correct );
                    float err2 = Ulp_Error_Double( test2, correct2 );
                    int fail = ! (fabsf(err) <= f->double_ulps && fabsf(err2) <= f->double_ulps);
                    if( ftz )
                    {
                        // retry per section 6.5.3.2
                        if( IsDoubleResultSubnormal(correct, f->double_ulps ) )
                        {
                            if( IsDoubleResultSubnormal( correct2, f->double_ulps ) )
                            {
                                fail = fail && ! ( test == 0.0f && test2 == 0.0f );
                                if( ! fail )
                                {
                                    err = 0.0f;
                                    err2 = 0.0f;
                                }
                            }
                            else
                            {
                                fail = fail && ! ( test == 0.0f && fabsf(err2) <= f->double_ulps);
                                if( ! fail )
                                    err = 0.0f;
                            }
                        }
                        else if( IsDoubleResultSubnormal( correct2, f->double_ulps ) )
                        {
                            fail = fail && ! ( test2 == 0.0f && fabsf(err) <= f->double_ulps);
                            if( ! fail )
                                err2 = 0.0f;
                        }

                        // retry per section 6.5.3.3
                        if( IsDoubleSubnormal( s[j] ) )
                        {
                            long double correct2p, correct2n;
                            long double correctp = f->dfunc.f_fpf( 0.0, &correct2p );
                            long double correctn = f->dfunc.f_fpf( -0.0, &correct2n );
                            // The second-result errors are measured against the second device
                            // output (test2), matching how err2 is computed above.
                            float errp = Ulp_Error_Double( test, correctp );
                            float err2p = Ulp_Error_Double( test2, correct2p );
                            float errn = Ulp_Error_Double( test, correctn );
                            float err2n = Ulp_Error_Double( test2, correct2n );
                            fail = fail && ((!(fabsf(errp) <= f->double_ulps)) && (!(fabsf(err2p) <= f->double_ulps)) &&
                                            ((!(fabsf(errn) <= f->double_ulps)) && (!(fabsf(err2n) <= f->double_ulps))) );
                            if( fabsf( errp ) < fabsf(err ) )
                                err = errp;
                            if( fabsf( errn ) < fabsf(err ) )
                                err = errn;
                            if( fabsf( err2p ) < fabsf(err2 ) )
                                err2 = err2p;
                            if( fabsf( err2n ) < fabsf(err2 ) )
                                err2 = err2n;

                            // retry per section 6.5.3.4
                            if( IsDoubleResultSubnormal( correctp, f->double_ulps ) || IsDoubleResultSubnormal( correctn, f->double_ulps ) )
                            {
                                if( IsDoubleResultSubnormal( correct2p, f->double_ulps ) || IsDoubleResultSubnormal( correct2n, f->double_ulps ) )
                                {
                                    fail = fail && !( test == 0.0f && test2 == 0.0f);
                                    if( ! fail )
                                        err = err2 = 0.0f;
                                }
                                else
                                {
                                    fail = fail && ! (test == 0.0f && fabsf(err2) <= f->double_ulps);
                                    if( ! fail )
                                        err = 0.0f;
                                }
                            }
                            else if( IsDoubleResultSubnormal( correct2p, f->double_ulps ) || IsDoubleResultSubnormal( correct2n, f->double_ulps ) )
                            {
                                fail = fail && ! (test2 == 0.0f && (fabsf(err) <= f->double_ulps));
                                if( ! fail )
                                    err2 = 0.0f;
                            }
                        }
                    }
                    if( fabsf(err ) > maxError0 )
                    {
                        maxError0 = fabsf(err);
                        maxErrorVal0 = s[j];
                    }
                    if( fabsf(err2 ) > maxError1 )
                    {
                        maxError1 = fabsf(err2);
                        maxErrorVal1 = s[j];
                    }
                    if( fail )
                    {
                        vlog_error( "\nERROR: %sD%s: {%f, %f} ulp error at %.13la: *{%.13la, %.13la} vs. {%.13la, %.13la}\n", f->name, sizeNames[k], err, err2, ((double*) gIn)[j], ((double*) gOut_Ref)[j], ((double*) gOut_Ref2)[j], test, test2 );
                        error = -1;
                        goto exit;
                    }
                }
            }
        }

        if( 0 == (i & 0x0fffffff) )
        {
            if (gVerboseBruteForce)
            {
                // i stays below 2^32 inside this loop; the casts make the arguments match the conversion specifiers
                vlog("base:%14u step:%10zu bufferSize:%10zu \n", (unsigned int) i, (size_t) step, bufferSize);
            } else
            {
                vlog("." );
            }
            fflush(stdout);
        }
    }

    if( ! gSkipCorrectnessTesting )
    {
        if( gWimpyMode )
            vlog( "Wimp pass" );
        else
            vlog( "passed" );
    }

    if( gMeasureTimes )
    {
        //Init input array
        double *p = (double*) gIn;
        for( j = 0; j < bufferSize / sizeof( double ); j++ )
            p[j] = DoubleFromUInt32(genrand_int32(d) );
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            goto exit;  // release the kernels and programs instead of leaking them
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j]) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            double sum = 0.0;
            double bestTime = INFINITY;
            for( k = 0; k < PERF_LOOP_COUNT; k++ )
            {
                uint64_t startTime = GetTime();
                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
                {
                    vlog_error( "FAILED -- could not execute kernel\n" );
                    goto exit;
                }

                // Make sure OpenCL is done
                if( (error = clFinish(gQueue) ) )
                {
                    vlog_error( "Error %d at clFinish\n", error );
                    goto exit;
                }

                uint64_t endTime = GetTime();
                double time = SubtractTime( endTime, startTime );
                sum += time;
                if( time < bestTime )
                    bestTime = time;
            }

            if( gReportAverageTimes )
                bestTime = sum / PERF_LOOP_COUNT;
            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
        }
        for( ; j < gMaxVectorSizeIndex; j++ )
            vlog( "\t     -- " );
    }

    if( ! gSkipCorrectnessTesting )
        vlog( "\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0, maxErrorVal1 );
    vlog( "\n" );

exit:
    // Release
    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
    {
        clReleaseKernel(kernels[k]);
        clReleaseProgram(programs[k]);
    }

    return error;
}
|
||||
|
||||
|
||||
|
||||
802
test_conformance/math_brute_force/unary_two_results_i.c
Normal file
802
test_conformance/math_brute_force/unary_two_results_i.c
Normal file
@@ -0,0 +1,802 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#include "Utility.h"
|
||||
|
||||
#include <limits.h>
|
||||
#include <string.h>
|
||||
#include "FunctionList.h"
|
||||
|
||||
int TestFunc_FloatI_Float(const Func *f, MTdata);
|
||||
int TestFunc_DoubleI_Double(const Func *f, MTdata);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C"
|
||||
#endif
|
||||
const vtbl _unary_two_results_i = { "unary_two_results_i", TestFunc_FloatI_Float, TestFunc_DoubleI_Double };
|
||||
|
||||
static int BuildKernel( const char *name, int vectorSize, cl_kernel *k, cl_program *p );
|
||||
static int BuildKernelDouble( const char *name, int vectorSize, cl_kernel *k, cl_program *p );
|
||||
|
||||
// Assembles the single-precision OpenCL kernel source for `name` at the vector size
// selected by `vectorSize` (an index into sizeNames / sizeValues) and passes it to MakeKernel.
//
// The generated kernel calls name( in, &intOut ) and writes the float result to `out`
// and the int result to `out2`. Three-element vectors use a separate kernel that goes
// through vload3/vstore3 and handles the leftover elements in the last work-item.
//
// name       - function name spliced into the kernel source
// vectorSize - vector size index
// k, p       - forwarded to MakeKernel, which receives the kernel and program
//
// Returns the value returned by MakeKernel.
static int BuildKernel( const char *name, int vectorSize, cl_kernel *k, cl_program *p )
{
    // Source pieces for every vector size except 3
    const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global int", sizeNames[vectorSize], "* out2, __global float", sizeNames[vectorSize], "* in)\n"
                        "{\n"
                        " int i = get_global_id(0);\n"
                        " out[i] = ", name, "( in[i], out2 + i );\n"
                        "}\n"
                      };

    // Source pieces for the 3-element vector case
    const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global int* out2, __global float* in)\n"
                         "{\n"
                         " size_t i = get_global_id(0);\n"
                         " if( i + 1 < get_global_size(0) )\n"
                         " {\n"
                         " float3 f0 = vload3( 0, in + 3 * i );\n"
                         " int3 iout = INT_MIN;\n"
                         " f0 = ", name, "( f0, &iout );\n"
                         " vstore3( f0, 0, out + 3*i );\n"
                         " vstore3( iout, 0, out2 + 3*i );\n"
                         " }\n"
                         " else\n"
                         " {\n"
                         " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
                         " int3 iout = INT_MIN;\n"
                         " float3 f0;\n"
                         " switch( parity )\n"
                         " {\n"
                         " case 1:\n"
                         " f0 = (float3)( in[3*i], NAN, NAN ); \n"
                         " break;\n"
                         " case 0:\n"
                         " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
                         " break;\n"
                         " }\n"
                         " f0 = ", name, "( f0, &iout );\n"
                         " switch( parity )\n"
                         " {\n"
                         " case 0:\n"
                         " out[3*i+1] = f0.y; \n"
                         " out2[3*i+1] = iout.y; \n"
                         " // fall through\n"
                         " case 1:\n"
                         " out[3*i] = f0.x; \n"
                         " out2[3*i] = iout.x; \n"
                         " break;\n"
                         " }\n"
                         " }\n"
                         "}\n"
                       };

    // Pick the piece list that matches the requested vector width
    const int isVec3 = ( sizeValues[vectorSize] == 3 );
    const char **pieces = isVec3 ? c3 : c;
    size_t pieceCount = isVec3 ? sizeof(c3)/sizeof(c3[0])
                               : sizeof(c)/sizeof(c[0]);

    char testName[32];
    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );

    return MakeKernel(pieces, (cl_uint) pieceCount, testName, k, p);
}
|
||||
|
||||
/*
 * Assemble the OpenCL C source of the double-precision test kernel for one
 * vector width and pass it to MakeKernel.
 *
 *   name        name of the OpenCL built-in under test; spliced verbatim into
 *               the kernel source
 *   vectorSize  index into sizeNames[] / sizeValues[] selecting the width
 *   k, p        forwarded unchanged to MakeKernel
 *
 * Returns whatever MakeKernel returns.
 */
static int BuildKernelDouble( const char *name, int vectorSize, cl_kernel *k, cl_program *p )
{
    // Every width except 3: one work-item handles one (vector) element.
    const char *plainSource[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global int", sizeNames[vectorSize], "* out2, __global double", sizeNames[vectorSize], "* in)\n"
                            "{\n"
                            " int i = get_global_id(0);\n"
                            " out[i] = ", name, "( in[i], out2 + i );\n"
                            "}\n"
                        };

    // Width 3: the buffers are addressed as packed scalars through
    // vload3/vstore3.  The last work-item only touches the one or two
    // leftover elements (chosen by the parity of its index) and pads the
    // unused input lanes with NAN.
    const char *vec3Source[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                        "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global int* out2, __global double* in)\n"
                            "{\n"
                            " size_t i = get_global_id(0);\n"
                            " if( i + 1 < get_global_size(0) )\n"
                            " {\n"
                            " double3 f0 = vload3( 0, in + 3 * i );\n"
                            " int3 iout = INT_MIN;\n"
                            " f0 = ", name, "( f0, &iout );\n"
                            " vstore3( f0, 0, out + 3*i );\n"
                            " vstore3( iout, 0, out2 + 3*i );\n"
                            " }\n"
                            " else\n"
                            " {\n"
                            " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
                            " int3 iout = INT_MIN;\n"
                            " double3 f0;\n"
                            " switch( parity )\n"
                            " {\n"
                            " case 1:\n"
                            " f0 = (double3)( in[3*i], NAN, NAN ); \n"
                            " break;\n"
                            " case 0:\n"
                            " f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
                            " break;\n"
                            " }\n"
                            " f0 = ", name, "( f0, &iout );\n"
                            " switch( parity )\n"
                            " {\n"
                            " case 0:\n"
                            " out[3*i+1] = f0.y; \n"
                            " out2[3*i+1] = iout.y; \n"
                            " // fall through\n"
                            " case 1:\n"
                            " out[3*i] = f0.x; \n"
                            " out2[3*i] = iout.x; \n"
                            " break;\n"
                            " }\n"
                            " }\n"
                            "}\n"
                        };

    const char **fragments;
    size_t fragmentCount;

    if( 3 == sizeValues[vectorSize] )
    {
        fragments = vec3Source;
        fragmentCount = sizeof(vec3Source)/sizeof(vec3Source[0]);
    }
    else
    {
        fragments = plainSource;
        fragmentCount = sizeof(plainSource)/sizeof(plainSource[0]);
    }

    // The kernel entry point is "math_kernel" followed by the width suffix.
    char kernelName[32];
    snprintf( kernelName, sizeof( kernelName ) -1, "math_kernel%s", sizeNames[vectorSize] );

    return MakeKernel(fragments, (cl_uint) fragmentCount, kernelName, k, p);
}
|
||||
|
||||
typedef struct BuildKernelInfo
|
||||
{
|
||||
cl_uint offset; // the first vector size to build
|
||||
cl_kernel *kernels;
|
||||
cl_program *programs;
|
||||
const char *nameInCode;
|
||||
}BuildKernelInfo;
|
||||
|
||||
/*
 * Thread-pool job: build the single-precision kernel for one vector size.
 * `p` points at a BuildKernelInfo; the vector-size index is the info's
 * offset plus job_id.  thread_id is not used.  Returns BuildKernel's result.
 */
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
{
    BuildKernelInfo *request = (BuildKernelInfo*) p;
    cl_uint sizeIndex = request->offset + job_id;
    cl_kernel *kernelSlot = request->kernels + sizeIndex;
    cl_program *programSlot = request->programs + sizeIndex;

    return BuildKernel( request->nameInCode, sizeIndex, kernelSlot, programSlot );
}
|
||||
|
||||
/*
 * Thread-pool job: build the double-precision kernel for one vector size.
 * `p` points at a BuildKernelInfo; the vector-size index is the info's
 * offset plus job_id.  thread_id is not used.  Returns BuildKernelDouble's
 * result.
 */
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
{
    BuildKernelInfo *request = (BuildKernelInfo*) p;
    cl_uint sizeIndex = request->offset + job_id;
    cl_kernel *kernelSlot = request->kernels + sizeIndex;
    cl_program *programSlot = request->programs + sizeIndex;

    return BuildKernelDouble( request->nameInCode, sizeIndex, kernelSlot, programSlot );
}
|
||||
|
||||
/*
 * Return the magnitude of a signed 64-bit value as an unsigned 64-bit value.
 *
 * The negation is carried out on the unsigned representation, so it is well
 * defined for every input, including the most negative cl_long (whose
 * magnitude is not representable as a cl_long).  This replaces a sign-mask
 * trick that depended on arithmetic right shift of negative values
 * (implementation-defined) and overflowed in signed arithmetic for the most
 * negative input.
 */
cl_ulong abs_cl_long( cl_long i );
cl_ulong abs_cl_long( cl_long i )
{
    cl_ulong magnitude = (cl_ulong) i;

    // Unsigned subtraction wraps modulo 2^64, which yields -i for negative i.
    return ( i < 0 ) ? (cl_ulong) 0 - magnitude : magnitude;
}
|
||||
|
||||
/*
 * Brute-force test of a single-precision function that returns a float and
 * writes an int through a pointer argument (reference: f->func.f_fpI).
 *
 * For each pass the input buffer gIn is filled with consecutive (or, in
 * wimpy mode, strided) 32-bit patterns starting at i, the kernels for every
 * vector size index in [gMinVectorSizeIndex, gMaxVectorSizeIndex) are run,
 * and both outputs are compared against the host reference.  A mismatch is
 * tolerated when the float result is within float_ulps and the int result is
 * within maxiError, or when one of the flush-to-zero retries accepts it.
 * When gMeasureTimes is set, the kernels are additionally timed on random
 * input.
 *
 *   f  descriptor of the function under test
 *   d  random number generator state, used for the timing input only
 *
 * Returns 0 on success, -1 on a verification failure, or the failing call's
 * error code.  Once the kernels are built, every exit path releases them.
 */
int TestFunc_FloatI_Float(const Func *f, MTdata d)
{
    uint64_t i;
    uint32_t j, k;
    int error;
    cl_program programs[ VECTOR_SIZE_COUNT ];
    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
    float maxError = 0.0f;
    int64_t maxError2 = 0;
    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
    float maxErrorVal = 0.0f;
    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
    const size_t elementCount = bufferSize / sizeof( float );
    float float_ulps;
    uint64_t step = elementCount;
    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( float )) + 1);
    cl_ulong maxiError;

    logFunctionInfo(f->name,sizeof(cl_float),gTestFastRelaxed);
    if( gWimpyMode )
    {
        step = (1ULL<<32) * gWimpyReductionFactor / (512);
    }
    if( gIsEmbedded )
        float_ulps = f->float_embedded_ulps;
    else
        float_ulps = f->float_ulps;

    // The integer result must match exactly unless the float tolerance is
    // unbounded, in which case any integer result is accepted.
    maxiError = float_ulps == INFINITY ? CL_ULONG_MAX : 0;

    // Init the kernels
    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode };
    if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
        return error;

    for( i = 0; i < (1ULL<<32); i += step )
    {
        //Init input array
        uint32_t *p = (uint32_t *)gIn;
        if( gWimpyMode )
        {
            for( j = 0; j < elementCount; j++ )
                p[j] = (uint32_t) i + j * scale;
        }
        else
        {
            for( j = 0; j < elementCount; j++ )
                p[j] = (uint32_t) i + j;
        }
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            // Kernels and programs already exist here; release them on the way out.
            goto exit;
        }

        // write garbage into output arrays
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            uint32_t pattern = 0xffffdead;
            memset_pattern4(gOut[j], &pattern, bufferSize);
            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
            {
                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%u) ***\n", error, j );
                goto exit;
            }

            memset_pattern4(gOut2[j], &pattern, bufferSize);
            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL) ))
            {
                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%u) ***\n", error, j );
                goto exit;
            }
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
            // One work-item per vector, rounded up so a partial trailing vector is covered.
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
            {
                vlog_error( "FAILED -- could not execute kernel\n" );
                goto exit;
            }
        }

        // Get that moving
        if( (error = clFlush(gQueue) ))
            vlog( "clFlush failed\n" );

        //Calculate the correctly rounded reference result
        float *r = (float *)gOut_Ref;
        int *r2 = (int *)gOut_Ref2;
        float *s = (float *)gIn;
        for( j = 0; j < elementCount; j++ )
            r[j] = (float) f->func.f_fpI( s[j], r2+j );

        // Read the data back
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
            {
                vlog_error( "ReadArray failed %d\n", error );
                goto exit;
            }
            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
            {
                vlog_error( "ReadArray2 failed %d\n", error );
                goto exit;
            }
        }

        if( gSkipCorrectnessTesting )
            break;

        //Verify data
        uint32_t *t = (uint32_t *)gOut_Ref;
        int32_t *t2 = (int32_t *)gOut_Ref2;
        for( j = 0; j < elementCount; j++ )
        {
            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
            {
                uint32_t *q = (uint32_t *)(gOut[k]);
                int32_t *q2 = (int32_t *)(gOut2[k]);

                // If we aren't getting the correctly rounded result
                if( t[j] != q[j] || t2[j] != q2[j] )
                {
                    float test = ((float*) q)[j];
                    int correct2 = INT_MIN;
                    double correct = f->func.f_fpI( s[j], &correct2 );
                    float err = Ulp_Error( test, correct );
                    cl_long iErr = (int64_t) q2[j] - (int64_t) correct2;
                    int fail = ! (fabsf(err) <= float_ulps && abs_cl_long( iErr ) <= maxiError );
                    if( ftz )
                    {
                        // retry per section 6.5.3.2
                        if( IsFloatResultSubnormal(correct, float_ulps ) )
                        {
                            fail = fail && ! ( test == 0.0f && iErr == 0 );
                            if( ! fail )
                                err = 0.0f;
                        }

                        // retry per section 6.5.3.3
                        if( IsFloatSubnormal( s[j] ) )
                        {
                            int correct5, correct6;
                            double correct3 = f->func.f_fpI( 0.0, &correct5 );
                            double correct4 = f->func.f_fpI( -0.0, &correct6 );
                            float err2 = Ulp_Error( test, correct3 );
                            float err3 = Ulp_Error( test, correct4 );
                            cl_long iErr2 = (long long) q2[j] - (long long) correct5;
                            cl_long iErr3 = (long long) q2[j] - (long long) correct6;

                            // Did +0 work?
                            if( fabsf(err2) <= float_ulps && abs_cl_long( iErr2 ) <= maxiError )
                            {
                                err = err2;
                                iErr = iErr2;
                                fail = 0;
                            }
                            // Did -0 work?
                            else if(fabsf(err3) <= float_ulps && abs_cl_long( iErr3 ) <= maxiError)
                            {
                                err = err3;
                                iErr = iErr3;
                                fail = 0;
                            }

                            // retry per section 6.5.3.4
                            // NOTE(review): correct2 is the *integer* result for s[j];
                            // the first check below may have been meant for correct4
                            // (the -0 reference).  Left unchanged because it affects
                            // pass/fail decisions -- confirm before altering.
                            if( fail && (IsFloatResultSubnormal(correct2, float_ulps ) || IsFloatResultSubnormal(correct3, float_ulps )) )
                            {
                                fail = fail && ! ( test == 0.0f && (abs_cl_long( iErr2 ) <= maxiError || abs_cl_long( iErr3 ) <= maxiError) );
                                if( ! fail )
                                {
                                    err = 0.0f;
                                    iErr = 0;
                                }
                            }
                        }
                    }
                    if( fabsf(err ) > maxError )
                    {
                        maxError = fabsf(err);
                        maxErrorVal = s[j];
                    }
                    if( llabs(iErr) > maxError2 )
                    {
                        maxError2 = llabs(iErr );
                    }

                    if( fail )
                    {
                        vlog_error( "\nERROR: %s%s: {%f, %d} ulp error at %a: *{%a, %d} vs. {%a, %d}\n", f->name, sizeNames[k], err, (int) iErr, ((float*) gIn)[j], ((float*) gOut_Ref)[j], ((int*) gOut_Ref2)[j], test, q2[j] );
                        error = -1;
                        goto exit;
                    }
                }
            }
        }

        // Progress output every 2^28 input values.
        if( 0 == (i & 0x0fffffff) )
        {
            if (gVerboseBruteForce)
            {
                // i is below 2^32 inside this loop, so the narrowing cast is lossless.
                vlog("base:%14u step:%10zu bufferSize:%10zu \n", (unsigned int) i, (size_t) step, bufferSize);
            } else
            {
                vlog("." );
            }
            fflush(stdout);
        }
    }

    if( ! gSkipCorrectnessTesting )
    {
        if( gWimpyMode )
            vlog( "Wimp pass" );
        else
            vlog( "passed" );
    }

    if( gMeasureTimes )
    {
        //Init input array
        uint32_t *p = (uint32_t *)gIn;
        for( j = 0; j < elementCount; j++ )
            p[j] = genrand_int32(d);
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            // Release the kernels and programs instead of returning directly.
            goto exit;
        }


        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            double sum = 0.0;
            double bestTime = INFINITY;
            for( k = 0; k < PERF_LOOP_COUNT; k++ )
            {
                uint64_t startTime = GetTime();
                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
                {
                    vlog_error( "FAILED -- could not execute kernel\n" );
                    goto exit;
                }

                // Make sure OpenCL is done
                if( (error = clFinish(gQueue) ) )
                {
                    vlog_error( "Error %d at clFinish\n", error );
                    goto exit;
                }

                uint64_t endTime = GetTime();
                double time = SubtractTime( endTime, startTime );
                sum += time;
                if( time < bestTime )
                    bestTime = time;
            }

            if( gReportAverageTimes )
                bestTime = sum / PERF_LOOP_COUNT;
            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / elementCount;
            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
        }
    }

    if( ! gSkipCorrectnessTesting )
        vlog( "\t{%8.2f, %lld} @ %a", maxError, (long long) maxError2, maxErrorVal );
    vlog( "\n" );

exit:
    // Release
    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
    {
        clReleaseKernel(kernels[k]);
        clReleaseProgram(programs[k]);
    }

    return error;
}
|
||||
|
||||
/*
 * Brute-force test of a double-precision function that returns a double and
 * writes an int through a pointer argument (reference: f->dfunc.f_fpI).
 *
 * For each pass the input buffer gIn is filled with doubles produced by
 * DoubleFromUInt32 from consecutive (or, in wimpy mode, strided) 32-bit
 * values starting at i, the kernels for every vector size index in
 * [gMinVectorSizeIndex, gMaxVectorSizeIndex) are run, and both outputs are
 * compared against the host reference.  A mismatch is tolerated when the
 * double result is within f->double_ulps and the int result is within
 * maxiError, or when one of the flush-to-zero retries accepts it.  When
 * gMeasureTimes is set, the kernels are additionally timed on random input.
 *
 *   f  descriptor of the function under test
 *   d  random number generator state, used for the timing input only
 *
 * Returns 0 on success, -1 on a verification failure, or the failing call's
 * error code.  Once the kernels are built, every exit path releases them.
 */
int TestFunc_DoubleI_Double(const Func *f, MTdata d)
{
    uint64_t i;
    uint32_t j, k;
    int error;
    cl_program programs[ VECTOR_SIZE_COUNT ];
    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
    float maxError = 0.0f;
    int64_t maxError2 = 0;
    int ftz = f->ftz || gForceFTZ;
    double maxErrorVal = 0.0f;
    // The integer result must match exactly unless the ulp tolerance is
    // unbounded, in which case any integer result is accepted.
    cl_ulong maxiError = f->double_ulps == INFINITY ? CL_ULONG_MAX : 0;
    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
    const size_t elementCount = bufferSize / sizeof( double );

    uint64_t step = elementCount;
    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( double )) + 1);

    logFunctionInfo(f->name,sizeof(cl_double),gTestFastRelaxed);
    if( gWimpyMode )
    {
        step = (1ULL<<32) * gWimpyReductionFactor / (512);
    }

    Force64BitFPUPrecision();

    // Init the kernels
    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode };
    if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                &build_info ) ))
    {
        return error;
    }

    for( i = 0; i < (1ULL<<32); i += step )
    {
        //Init input array
        double *p = (double *)gIn;
        if( gWimpyMode )
        {
            for( j = 0; j < elementCount; j++ )
                p[j] = DoubleFromUInt32((uint32_t) i + j * scale);
        }
        else
        {
            for( j = 0; j < elementCount; j++ )
                p[j] = DoubleFromUInt32((uint32_t) i + j);
        }
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            // Kernels and programs already exist here; release them on the way out.
            goto exit;
        }

        // write garbage into output arrays
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            uint32_t pattern = 0xffffdead;
            memset_pattern4(gOut[j], &pattern, bufferSize);
            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
            {
                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%u) ***\n", error, j );
                goto exit;
            }

            memset_pattern4(gOut2[j], &pattern, bufferSize);
            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL) ))
            {
                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%u) ***\n", error, j );
                goto exit;
            }
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
            // One work-item per vector, rounded up so a partial trailing vector is covered.
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
            {
                vlog_error( "FAILED -- could not execute kernel\n" );
                goto exit;
            }
        }

        // Get that moving
        if( (error = clFlush(gQueue) ))
            vlog( "clFlush failed\n" );

        //Calculate the correctly rounded reference result
        double *r = (double *)gOut_Ref;
        int *r2 = (int *)gOut_Ref2;
        double *s = (double *)gIn;
        for( j = 0; j < elementCount; j++ )
            r[j] = (double) f->dfunc.f_fpI( s[j], r2+j );

        // Read the data back
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
            {
                vlog_error( "ReadArray failed %d\n", error );
                goto exit;
            }
            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
            {
                vlog_error( "ReadArray2 failed %d\n", error );
                goto exit;
            }
        }

        if( gSkipCorrectnessTesting )
            break;

        //Verify data
        uint64_t *t = (uint64_t *)gOut_Ref;
        int32_t *t2 = (int32_t *)gOut_Ref2;
        for( j = 0; j < elementCount; j++ )
        {
            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
            {
                uint64_t *q = (uint64_t *)(gOut[k]);
                int32_t *q2 = (int32_t *)(gOut2[k]);

                // If we aren't getting the correctly rounded result
                if( t[j] != q[j] || t2[j] != q2[j] )
                {
                    double test = ((double*) q)[j];
                    int correct2 = INT_MIN;
                    long double correct = f->dfunc.f_fpI( s[j], &correct2 );
                    float err = Ulp_Error_Double( test, correct );
                    cl_long iErr = (long long) q2[j] - (long long) correct2;
                    int fail = ! (fabsf(err) <= f->double_ulps && abs_cl_long( iErr ) <= maxiError );
                    if( ftz )
                    {
                        // retry per section 6.5.3.2
                        if( IsDoubleResultSubnormal(correct, f->double_ulps ) )
                        {
                            fail = fail && ! ( test == 0.0f && iErr == 0 );
                            if( ! fail )
                                err = 0.0f;
                        }

                        // retry per section 6.5.3.3
                        if( IsDoubleSubnormal( s[j] ) )
                        {
                            int correct5, correct6;
                            long double correct3 = f->dfunc.f_fpI( 0.0, &correct5 );
                            long double correct4 = f->dfunc.f_fpI( -0.0, &correct6 );
                            float err2 = Ulp_Error_Double( test, correct3 );
                            float err3 = Ulp_Error_Double( test, correct4 );
                            cl_long iErr2 = (long long) q2[j] - (long long) correct5;
                            cl_long iErr3 = (long long) q2[j] - (long long) correct6;

                            // Did +0 work?
                            if( fabsf(err2) <= f->double_ulps && abs_cl_long( iErr2 ) <= maxiError )
                            {
                                err = err2;
                                iErr = iErr2;
                                fail = 0;
                            }
                            // Did -0 work?
                            else if(fabsf(err3) <= f->double_ulps && abs_cl_long( iErr3 ) <= maxiError)
                            {
                                err = err3;
                                iErr = iErr3;
                                fail = 0;
                            }

                            // retry per section 6.5.3.4
                            // NOTE(review): correct2 is the *integer* result for s[j];
                            // the first check below may have been meant for correct4
                            // (the -0 reference).  Left unchanged because it affects
                            // pass/fail decisions -- confirm before altering.
                            if( fail && (IsDoubleResultSubnormal( correct2, f->double_ulps ) || IsDoubleResultSubnormal( correct3, f->double_ulps )) )
                            {
                                fail = fail && ! ( test == 0.0f && (abs_cl_long( iErr2 ) <= maxiError || abs_cl_long( iErr3 ) <= maxiError) );
                                if( ! fail )
                                {
                                    err = 0.0f;
                                    iErr = 0;
                                }
                            }
                        }
                    }
                    if( fabsf(err ) > maxError )
                    {
                        maxError = fabsf(err);
                        maxErrorVal = s[j];
                    }
                    if( llabs(iErr) > maxError2 )
                    {
                        maxError2 = llabs(iErr );
                    }

                    if( fail )
                    {
                        vlog_error( "\nERROR: %sD%s: {%f, %d} ulp error at %.13la: *{%.13la, %d} vs. {%.13la, %d}\n", f->name, sizeNames[k], err, (int) iErr, ((double*) gIn)[j], ((double*) gOut_Ref)[j], ((int*) gOut_Ref2)[j], test, q2[j] );
                        error = -1;
                        goto exit;
                    }
                }
            }
        }

        // Progress output every 2^28 input values.
        if( 0 == (i & 0x0fffffff) )
        {
            if (gVerboseBruteForce)
            {
                // i is below 2^32 inside this loop, so the narrowing cast is lossless.
                vlog("base:%14u step:%10zu bufferSize:%10zu \n", (unsigned int) i, (size_t) step, bufferSize);
            } else
            {
                vlog("." );
            }
            fflush(stdout);
        }
    }

    if( ! gSkipCorrectnessTesting )
    {
        if( gWimpyMode )
            vlog( "Wimp pass" );
        else
            vlog( "passed" );
    }

    if( gMeasureTimes )
    {
        //Init input array
        double *p = (double *)gIn;

        for( j = 0; j < elementCount; j++ )
            p[j] = DoubleFromUInt32(genrand_int32(d));
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            // Release the kernels and programs instead of returning directly.
            goto exit;
        }


        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            double sum = 0.0;
            double bestTime = INFINITY;
            for( k = 0; k < PERF_LOOP_COUNT; k++ )
            {
                uint64_t startTime = GetTime();
                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
                {
                    vlog_error( "FAILED -- could not execute kernel\n" );
                    goto exit;
                }

                // Make sure OpenCL is done
                if( (error = clFinish(gQueue) ) )
                {
                    vlog_error( "Error %d at clFinish\n", error );
                    goto exit;
                }

                uint64_t endTime = GetTime();
                double time = SubtractTime( endTime, startTime );
                sum += time;
                if( time < bestTime )
                    bestTime = time;
            }

            if( gReportAverageTimes )
                bestTime = sum / PERF_LOOP_COUNT;
            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / elementCount;
            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sd%s", f->name, sizeNames[j] );
        }
        // Pads the report for any vector sizes the loop above did not reach.
        for( ; j < gMaxVectorSizeIndex; j++ )
            vlog( "\t     -- " );
    }

    if( ! gSkipCorrectnessTesting )
        vlog( "\t{%8.2f, %lld} @ %a", maxError, (long long) maxError2, maxErrorVal );
    vlog( "\n" );

exit:
    // Release
    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
    {
        clReleaseKernel(kernels[k]);
        clReleaseProgram(programs[k]);
    }

    return error;
}
|
||||
|
||||
|
||||
|
||||
694
test_conformance/math_brute_force/unary_u.c
Normal file
694
test_conformance/math_brute_force/unary_u.c
Normal file
@@ -0,0 +1,694 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#include "Utility.h"
|
||||
|
||||
#include <string.h>
|
||||
#include "FunctionList.h"
|
||||
|
||||
int TestFunc_Float_UInt(const Func *f, MTdata);
|
||||
int TestFunc_Double_ULong(const Func *f, MTdata);
|
||||
|
||||
#if defined( __cplusplus)
|
||||
extern "C"
|
||||
#endif
|
||||
const vtbl _unary_u = { "unary_u", TestFunc_Float_UInt, TestFunc_Double_ULong };
|
||||
|
||||
|
||||
static int BuildKernel( const char *name, int vectorSize, cl_kernel *k, cl_program *p );
|
||||
static int BuildKernelDouble( const char *name, int vectorSize, cl_kernel *k, cl_program *p );
|
||||
|
||||
/*
 * Assemble the OpenCL C source of the uint -> float test kernel for one
 * vector width and pass it to MakeKernel.
 *
 *   name        name of the OpenCL built-in under test; spliced verbatim into
 *               the kernel source
 *   vectorSize  index into sizeNames[] / sizeValues[] selecting the width
 *   k, p        forwarded unchanged to MakeKernel
 *
 * Returns whatever MakeKernel returns.
 */
static int BuildKernel( const char *name, int vectorSize, cl_kernel *k, cl_program *p )
{
    // Every width except 3: one work-item handles one (vector) element.
    const char *plainSource[] = {
                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global uint", sizeNames[vectorSize], "* in)\n"
                            "{\n"
                            " int i = get_global_id(0);\n"
                            " out[i] = ", name, "( in[i] );\n"
                            "}\n"
                        };

    // Width 3: the buffers are addressed as packed scalars through
    // vload3/vstore3.  The last work-item only touches the one or two
    // leftover elements (chosen by the parity of its index) and pads the
    // unused input lanes with 0xdead.
    const char *vec3Source[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global uint* in)\n"
                            "{\n"
                            " size_t i = get_global_id(0);\n"
                            " if( i + 1 < get_global_size(0) )\n"
                            " {\n"
                            " uint3 u0 = vload3( 0, in + 3 * i );\n"
                            " float3 f0 = ", name, "( u0 );\n"
                            " vstore3( f0, 0, out + 3*i );\n"
                            " }\n"
                            " else\n"
                            " {\n"
                            " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
                            " uint3 u0;\n"
                            " float3 f0;\n"
                            " switch( parity )\n"
                            " {\n"
                            " case 1:\n"
                            " u0 = (uint3)( in[3*i], 0xdead, 0xdead ); \n"
                            " break;\n"
                            " case 0:\n"
                            " u0 = (uint3)( in[3*i], in[3*i+1], 0xdead ); \n"
                            " break;\n"
                            " }\n"
                            " f0 = ", name, "( u0 );\n"
                            " switch( parity )\n"
                            " {\n"
                            " case 0:\n"
                            " out[3*i+1] = f0.y; \n"
                            " // fall through\n"
                            " case 1:\n"
                            " out[3*i] = f0.x; \n"
                            " break;\n"
                            " }\n"
                            " }\n"
                            "}\n"
                        };

    const char **fragments;
    size_t fragmentCount;

    if( 3 == sizeValues[vectorSize] )
    {
        fragments = vec3Source;
        fragmentCount = sizeof(vec3Source)/sizeof(vec3Source[0]);
    }
    else
    {
        fragments = plainSource;
        fragmentCount = sizeof(plainSource)/sizeof(plainSource[0]);
    }

    // The kernel entry point is "math_kernel" followed by the width suffix.
    char kernelName[32];
    snprintf( kernelName, sizeof( kernelName ) -1, "math_kernel%s", sizeNames[vectorSize] );

    return MakeKernel(fragments, (cl_uint) fragmentCount, kernelName, k, p);
}
|
||||
|
||||
/*
 * Assemble the OpenCL C source of the ulong -> double test kernel for one
 * vector width and pass it to MakeKernel.
 *
 *   name        name of the OpenCL built-in under test; spliced verbatim into
 *               the kernel source
 *   vectorSize  index into sizeNames[] / sizeValues[] selecting the width
 *   k, p        forwarded unchanged to MakeKernel
 *
 * Returns whatever MakeKernel returns.
 */
static int BuildKernelDouble( const char *name, int vectorSize, cl_kernel *k, cl_program *p )
{
    // Every width except 3: one work-item handles one (vector) element.
    const char *plainSource[] = {
                            "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global ulong", sizeNames[vectorSize], "* in)\n"
                            "{\n"
                            " int i = get_global_id(0);\n"
                            " out[i] = ", name, "( in[i] );\n"
                            "}\n"
                        };

    // Width 3: the buffers are addressed as packed scalars through
    // vload3/vstore3.  The last work-item only touches the one or two
    // leftover elements (chosen by the parity of its index) and pads the
    // unused input lanes with 0xdeaddeaddeaddead.
    const char *vec3Source[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
                            "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global ulong* in)\n"
                            "{\n"
                            " size_t i = get_global_id(0);\n"
                            " if( i + 1 < get_global_size(0) )\n"
                            " {\n"
                            " ulong3 u0 = vload3( 0, in + 3 * i );\n"
                            " double3 f0 = ", name, "( u0 );\n"
                            " vstore3( f0, 0, out + 3*i );\n"
                            " }\n"
                            " else\n"
                            " {\n"
                            " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
                            " ulong3 u0;\n"
                            " switch( parity )\n"
                            " {\n"
                            " case 1:\n"
                            " u0 = (ulong3)( in[3*i], 0xdeaddeaddeaddeadUL, 0xdeaddeaddeaddeadUL ); \n"
                            " break;\n"
                            " case 0:\n"
                            " u0 = (ulong3)( in[3*i], in[3*i+1], 0xdeaddeaddeaddeadUL ); \n"
                            " break;\n"
                            " }\n"
                            " double3 f0 = ", name, "( u0 );\n"
                            " switch( parity )\n"
                            " {\n"
                            " case 0:\n"
                            " out[3*i+1] = f0.y; \n"
                            " // fall through\n"
                            " case 1:\n"
                            " out[3*i] = f0.x; \n"
                            " break;\n"
                            " }\n"
                            " }\n"
                            "}\n"
                        };

    const char **fragments;
    size_t fragmentCount;

    if( 3 == sizeValues[vectorSize] )
    {
        fragments = vec3Source;
        fragmentCount = sizeof(vec3Source)/sizeof(vec3Source[0]);
    }
    else
    {
        fragments = plainSource;
        fragmentCount = sizeof(plainSource)/sizeof(plainSource[0]);
    }

    // The kernel entry point is "math_kernel" followed by the width suffix.
    char kernelName[32];
    snprintf( kernelName, sizeof( kernelName ) -1, "math_kernel%s", sizeNames[vectorSize] );

    return MakeKernel(fragments, (cl_uint) fragmentCount, kernelName, k, p);
}
|
||||
|
||||
typedef struct BuildKernelInfo
|
||||
{
|
||||
cl_uint offset; // the first vector size to build
|
||||
cl_kernel *kernels;
|
||||
cl_program *programs;
|
||||
const char *nameInCode;
|
||||
}BuildKernelInfo;
|
||||
|
||||
/*
 * Thread-pool job: build the uint -> float kernel for one vector size.
 * `p` points at a BuildKernelInfo; the vector-size index is the info's
 * offset plus job_id.  thread_id is not used.  Returns BuildKernel's result.
 */
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
{
    BuildKernelInfo *request = (BuildKernelInfo*) p;
    cl_uint sizeIndex = request->offset + job_id;
    cl_kernel *kernelSlot = request->kernels + sizeIndex;
    cl_program *programSlot = request->programs + sizeIndex;

    return BuildKernel( request->nameInCode, sizeIndex, kernelSlot, programSlot );
}
|
||||
|
||||
/*
 * Thread-pool job: build the ulong -> double kernel for one vector size.
 * `p` points at a BuildKernelInfo; the vector-size index is the info's
 * offset plus job_id.  thread_id is not used.  Returns BuildKernelDouble's
 * result.
 */
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
{
    BuildKernelInfo *request = (BuildKernelInfo*) p;
    cl_uint sizeIndex = request->offset + job_id;
    cl_kernel *kernelSlot = request->kernels + sizeIndex;
    cl_program *programSlot = request->programs + sizeIndex;

    return BuildKernelDouble( request->nameInCode, sizeIndex, kernelSlot, programSlot );
}
|
||||
|
||||
/*
 * Brute-force test of a device function that maps a 32-bit unsigned integer
 * to a float.
 *
 * Kernels are built for every vector-size index in
 * [gMinVectorSizeIndex, gMaxVectorSizeIndex).  The input space is then walked
 * from 0 up to 2^32 in increments of `step`; on each pass gIn is filled with
 * consecutive (or, in wimpy mode, strided) bit patterns, every kernel is run,
 * and the device output is compared against the host reference
 * f->func.f_u().  A result that is not bit-identical is accepted when it is
 * within float_ulps, or (flush-to-zero case) when the reference is subnormal
 * and the device returned zero.  If gMeasureTimes is set, each kernel is
 * timed afterwards on random input.
 *
 * f: descriptor of the function under test (name, reference, ulp limits).
 * d: random number generator state, used only for the timing input.
 *
 * Returns 0 on success, the failing OpenCL status, or -1 on an out-of-
 * tolerance result.  Once the kernels exist, every exit path goes through the
 * `exit` label so that kernels and programs are always released.
 */
int TestFunc_Float_UInt(const Func *f, MTdata d)
{
    uint64_t i;
    uint32_t j, k;
    int error;
    cl_program programs[ VECTOR_SIZE_COUNT ];
    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
    float maxError = 0.0f;
    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
    float maxErrorVal = 0.0f;
    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;

    uint64_t step = bufferSize / sizeof( float );
    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( double )) + 1);
    int isRangeLimited = 0;
    float float_ulps;
    float half_sin_cos_tan_limit = 0;

    logFunctionInfo(f->name,sizeof(cl_float),gTestFastRelaxed);
    if( gWimpyMode )
    {
        step = (1ULL<<32) * gWimpyReductionFactor / (512);
    }
    if( gIsEmbedded)
        float_ulps = f->float_embedded_ulps;
    else
        float_ulps = f->float_ulps;

    // Init the kernels (one thread-pool job per vector size).
    // On failure we return directly: some entries of kernels[]/programs[]
    // may never have been written, so the release loop must not run.
    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode };
    if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
        return error;

    if( 0 == strcmp( f->name, "half_sin") || 0 == strcmp( f->name, "half_cos") )
    {
        isRangeLimited = 1;
        half_sin_cos_tan_limit = 1.0f + float_ulps * (FLT_EPSILON/2.0f);     // out of range results from finite inputs must be in [-1,1]
    }
    else if( 0 == strcmp( f->name, "half_tan"))
    {
        isRangeLimited = 1;
        half_sin_cos_tan_limit = INFINITY;      // out of range result from finite inputs must be numeric
    }

    for( i = 0; i < (1ULL<<32); i += step )
    {
        //Init input array
        uint32_t *p = (uint32_t *)gIn;
        if( gWimpyMode )
        {
            // Spread the samples out: consecutive elements are `scale` apart
            for( j = 0; j < bufferSize / sizeof( float ); j++ )
                p[j] = (uint32_t) i + j * scale;
        }
        else
        {
            for( j = 0; j < bufferSize / sizeof( float ); j++ )
                p[j] = (uint32_t) i + j;
        }
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL)))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            goto exit;      // kernels/programs already exist: release them
        }

        // write garbage into output arrays
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            uint32_t pattern = 0xffffdead;
            memset_pattern4(gOut[j], &pattern, bufferSize);
            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL)))
            {
                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
                goto exit;
            }
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // work-items, rounded up
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ))){ LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)))
            {
                vlog_error( "FAILURE -- could not execute kernel\n" );
                goto exit;
            }
        }

        // Get that moving
        if( (error = clFlush(gQueue) ))
            vlog( "clFlush failed\n" );

        //Calculate the correctly rounded reference result
        float *r = (float*) gOut_Ref;
        cl_uint *s = (cl_uint*) gIn;
        for( j = 0; j < bufferSize / sizeof( float ); j++ )
            r[j] = (float) f->func.f_u( s[j] );

        // Read the data back
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)))
            {
                vlog_error( "ReadArray failed %d\n", error );
                goto exit;
            }
        }

        if( gSkipCorrectnessTesting )
            break;

        //Verify data (compare bit patterns first; only compute ulps on mismatch)
        uint32_t *t = (uint32_t*) gOut_Ref;
        for( j = 0; j < bufferSize / sizeof( float ); j++ )
        {
            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
            {
                uint32_t *q = (uint32_t*)(gOut[k]);

                // If we aren't getting the correctly rounded result
                if( t[j] != q[j] )
                {
                    float test = ((float*) q)[j];
                    double correct = f->func.f_u( s[j] );
                    float err = Ulp_Error( test, correct );
                    int fail = ! (fabsf(err) <= float_ulps);

                    // half_sin/cos/tan are only valid between +-2**16, Inf, NaN
                    if( isRangeLimited && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) && fabsf(s[j]) < INFINITY )
                    {
                        if( fabsf( test ) <= half_sin_cos_tan_limit )
                        {
                            err = 0;
                            fail = 0;
                        }
                    }

                    if( fail )
                    {
                        if( ftz )
                        {
                            // retry per section 6.5.3.2
                            if( IsFloatResultSubnormal(correct, float_ulps) )
                            {
                                fail = fail && ( test != 0.0f );
                                if( ! fail )
                                    err = 0.0f;
                            }
                        }
                    }
                    if( fabsf(err ) > maxError )
                    {
                        maxError = fabsf(err);
                        maxErrorVal = s[j];
                    }
                    if( fail )
                    {
                        vlog_error( "\n%s%s: %f ulp error at 0x%8.8x: *%a vs. %a\n", f->name, sizeNames[k], err, ((uint32_t*) gIn)[j], ((float*) gOut_Ref)[j], test );
                        error = -1;
                        goto exit;
                    }
                }
            }
        }

        // Periodic progress report
        if( 0 == (i & 0x0fffffff) )
        {
            if (gVerboseBruteForce)
            {
                // i and step are 64-bit: print them as unsigned long long
                vlog("base:%14llu step:%10llu bufferSize:%10zu \n", (unsigned long long) i, (unsigned long long) step, bufferSize);
            } else
            {
                vlog("." );
            }
            fflush(stdout);
        }
    }

    if( ! gSkipCorrectnessTesting )
    {
        if( gWimpyMode )
            vlog( "Wimp pass" );
        else
            vlog( "passed" );
    }

    if( gMeasureTimes )
    {
        //Init input array
        // NOTE(review): the "exp"/"sin"/"cos"/"tan" branch stores float values
        // into what the rest of this function treats as a uint input buffer;
        // kept as-is, confirm whether that is intended.
        uint32_t *p = (uint32_t*)gIn;
        if( strstr( f->name, "exp" ) || strstr( f->name, "sin" ) || strstr( f->name, "cos" ) || strstr( f->name, "tan" ) )
            for( j = 0; j < bufferSize / sizeof( float ); j++ )
                ((float*)p)[j] = (float) genrand_real1(d);
        else if( strstr( f->name, "log" ) )
            for( j = 0; j < bufferSize / sizeof( float ); j++ )
                p[j] = genrand_int32(d) & 0x7fffffff;
        else
            for( j = 0; j < bufferSize / sizeof( float ); j++ )
                p[j] = genrand_int32(d);
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            goto exit;      // kernels/programs already exist: release them
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            double sum = 0.0;
            double bestTime = INFINITY;
            for( k = 0; k < PERF_LOOP_COUNT; k++ )
            {
                uint64_t startTime = GetTime();
                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
                {
                    vlog_error( "FAILURE -- could not execute kernel\n" );
                    goto exit;
                }

                // Make sure OpenCL is done
                if( (error = clFinish(gQueue) ) )
                {
                    vlog_error( "Error %d at clFinish\n", error );
                    goto exit;
                }

                uint64_t endTime = GetTime();
                double time = SubtractTime( endTime, startTime );
                sum += time;
                if( time < bestTime )
                    bestTime = time;
            }

            // Report either the best or the average run, per gReportAverageTimes
            if( gReportAverageTimes )
                bestTime = sum / PERF_LOOP_COUNT;
            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
        }
    }

    if( ! gSkipCorrectnessTesting )
        vlog( "\t%8.2f @ %a", maxError, maxErrorVal );
    vlog( "\n" );

exit:
    // Release
    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
    {
        clReleaseKernel(kernels[k]);
        clReleaseProgram(programs[k]);
    }

    return error;
}
|
||||
|
||||
/*
 * Returns a 64-bit value assembled from two successive 32-bit draws from the
 * generator d: the first draw fills the low 32 bits, the second the high 32.
 *
 * The draws are made in separate statements on purpose.  When both calls are
 * operands of a single `|` expression, C leaves their relative order
 * unspecified, so the same seed could yield different values on different
 * compilers.
 */
static cl_ulong random64( MTdata d )
{
    cl_ulong low = (cl_ulong) genrand_int32(d);
    cl_ulong high = (cl_ulong) genrand_int32(d);

    return low | (high << 32);
}
|
||||
|
||||
/*
 * Randomised test of a device function that maps a 64-bit unsigned integer
 * to a double.
 *
 * Kernels are built for every vector-size index in
 * [gMinVectorSizeIndex, gMaxVectorSizeIndex).  The loop counter runs from 0
 * up to 2^32 in increments of `step`, but it only controls the number of
 * passes: each pass fills gIn with random 64-bit patterns from d, runs every
 * kernel, and compares the device output against the host reference
 * f->dfunc.f_u().  A result that is not bit-identical is accepted when it is
 * within f->double_ulps, or (flush-to-zero case) when the reference is
 * subnormal and the device returned zero.  If gMeasureTimes is set, each
 * kernel is timed afterwards on random input.
 *
 * f: descriptor of the function under test (name, reference, ulp limits).
 * d: random number generator state.
 *
 * Returns 0 on success, the failing OpenCL status, or -1 on an out-of-
 * tolerance result.  Once the kernels exist, every exit path goes through the
 * `exit` label so that kernels and programs are always released.
 */
int TestFunc_Double_ULong(const Func *f, MTdata d)
{
    uint64_t i;
    uint32_t j, k;
    int error;
    cl_program programs[ VECTOR_SIZE_COUNT ];
    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
    float maxError = 0.0f;
    int ftz = f->ftz || gForceFTZ;
    double maxErrorVal = 0.0f;
    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
    uint64_t step = bufferSize / sizeof( cl_double );

    logFunctionInfo(f->name,sizeof(cl_double),gTestFastRelaxed);
    if( gWimpyMode )
    {
        step = (1ULL<<32) * gWimpyReductionFactor / (512);
    }
    Force64BitFPUPrecision();

    // Init the kernels (one thread-pool job per vector size).
    // On failure we return directly: some entries of kernels[]/programs[]
    // may never have been written, so the release loop must not run.
    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode };
    if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                &build_info ) ))
    {
        return error;
    }

    for( i = 0; i < (1ULL<<32); i += step )
    {
        //Init input array
        cl_ulong *p = (cl_ulong *)gIn;
        for( j = 0; j < bufferSize / sizeof( cl_ulong ); j++ )
            p[j] = random64(d);

        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL)))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            goto exit;      // kernels/programs already exist: release them
        }

        // write garbage into output arrays
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            uint32_t pattern = 0xffffdead;
            memset_pattern4(gOut[j], &pattern, bufferSize);
            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL)))
            {
                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
                goto exit;
            }
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;    // work-items, rounded up
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ))){ LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)))
            {
                vlog_error( "FAILURE -- could not execute kernel\n" );
                goto exit;
            }
        }

        // Get that moving
        if( (error = clFlush(gQueue) ))
            vlog( "clFlush failed\n" );

        //Calculate the correctly rounded reference result
        double *r = (double*) gOut_Ref;
        cl_ulong *s = (cl_ulong*) gIn;
        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
            r[j] = (double) f->dfunc.f_u( s[j] );

        // Read the data back
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)))
            {
                vlog_error( "ReadArray failed %d\n", error );
                goto exit;
            }
        }

        if( gSkipCorrectnessTesting )
            break;

        //Verify data (compare bit patterns first; only compute ulps on mismatch)
        uint64_t *t = (uint64_t*) gOut_Ref;
        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
        {
            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
            {
                uint64_t *q = (uint64_t*)(gOut[k]);

                // If we aren't getting the correctly rounded result
                if( t[j] != q[j] )
                {
                    double test = ((double*) q)[j];
                    long double correct = f->dfunc.f_u( s[j] );
                    float err = Ulp_Error_Double(test, correct);
                    int fail = ! (fabsf(err) <= f->double_ulps);

                    if( fail )
                    {
                        if( ftz )
                        {
                            // retry per section 6.5.3.2
                            if( IsDoubleResultSubnormal(correct, f->double_ulps) )
                            {
                                fail = fail && ( test != 0.0 );
                                if( ! fail )
                                    err = 0.0f;
                            }
                        }
                    }
                    if( fabsf(err ) > maxError )
                    {
                        maxError = fabsf(err);
                        maxErrorVal = s[j];
                    }
                    if( fail )
                    {
                        // %llx expects unsigned long long; uint64_t may be a different type
                        vlog_error( "\n%s%sD: %f ulp error at 0x%16.16llx: *%.13la vs. %.13la\n", f->name, sizeNames[k], err, (unsigned long long) ((uint64_t*) gIn)[j], ((double*) gOut_Ref)[j], test );
                        error = -1;
                        goto exit;
                    }
                }
            }
        }

        // Periodic progress report
        if( 0 == (i & 0x0fffffff) )
        {
            if (gVerboseBruteForce)
            {
                // i and step are 64-bit: print them as unsigned long long
                vlog("base:%14llu step:%10llu bufferSize:%10zu \n", (unsigned long long) i, (unsigned long long) step, bufferSize);
            } else
            {
                vlog("." );
            }
            fflush(stdout);
        }
    }

    if( ! gSkipCorrectnessTesting )
    {
        if( gWimpyMode )
            vlog( "Wimp pass" );
        else
            vlog( "passed" );
    }

    if( gMeasureTimes )
    {
        //Init input array
        // The kernels read 64-bit integers, so store raw random bit patterns
        // exactly as the correctness loop above does.
        cl_ulong *p = (cl_ulong *) gIn;

        for( j = 0; j < bufferSize / sizeof( cl_ulong ); j++ )
            p[j] = random64(d);
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            goto exit;      // kernels/programs already exist: release them
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            double sum = 0.0;
            double bestTime = INFINITY;
            for( k = 0; k < PERF_LOOP_COUNT; k++ )
            {
                uint64_t startTime = GetTime();
                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
                {
                    vlog_error( "FAILURE -- could not execute kernel\n" );
                    goto exit;
                }

                // Make sure OpenCL is done
                if( (error = clFinish(gQueue) ) )
                {
                    vlog_error( "Error %d at clFinish\n", error );
                    goto exit;
                }

                uint64_t endTime = GetTime();
                double time = SubtractTime( endTime, startTime );
                sum += time;
                if( time < bestTime )
                    bestTime = time;
            }

            // Report either the best or the average run, per gReportAverageTimes
            if( gReportAverageTimes )
                bestTime = sum / PERF_LOOP_COUNT;
            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
        }
    }

    if( ! gSkipCorrectnessTesting )
        vlog( "\t%8.2f @ %a", maxError, maxErrorVal );
    vlog( "\n" );

exit:
    // Release
    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
    {
        clReleaseKernel(kernels[k]);
        clReleaseProgram(programs[k]);
    }

    return error;
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user