mirror of
https://github.com/KhronosGroup/OpenCL-CTS.git
synced 2026-03-19 14:09:03 +00:00
The maintenance of the conformance tests is moving to Github. This commit contains all the changes that have been done in Gitlab since the first public release of the conformance tests. Signed-off-by: Kevin Petit <kevin.petit@arm.com>
942 lines
36 KiB
C++
942 lines
36 KiB
C++
//
|
|
// Copyright (c) 2017 The Khronos Group Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
#include "testBase.h"
|
|
#include "../../test_common/harness/typeWrappers.h"
|
|
#include "../../test_common/harness/conversions.h"
|
|
#include "../../test_common/harness/errorHelpers.h"
|
|
|
|
// OpenCL C source for the 4-component cross() test: each work-item reads one
// double4 from each input buffer and stores cross(a, b) into the output buffer.
const char *crossKernelSource_double =
    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
    "__kernel void sample_test(__global double4 *sourceA, "
    "__global double4 *sourceB, __global double4 *destValues)\n"
    "{\n int tid = get_global_id(0);\n"
    " destValues[tid] = cross( sourceA[tid], sourceB[tid] );\n"
    "\n}\n";
|
|
|
|
// OpenCL C source for the 3-component cross() test: buffers are plain double
// arrays, accessed per work-item through vload3/vstore3.
const char *crossKernelSource_doubleV3 =
    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
    "__kernel void sample_test(__global double *sourceA, "
    "__global double *sourceB, __global double *destValues)\n"
    "{\n int tid = get_global_id(0);\n"
    " vstore3( cross( vload3( tid, sourceA), vload3( tid, sourceB) ), tid, destValues);\n"
    "\n}\n";
|
|
|
|
// printf-style kernel template for functions taking two vectors and returning
// a scalar. Placeholders, in order: vector-size suffix for sourceA, the same
// suffix for sourceB, and the name of the built-in function under test.
const char *twoToFloatKernelPattern_double =
    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
    "__kernel void sample_test(__global double%s *sourceA, "
    "__global double%s *sourceB, __global double *destValues)\n"
    "{\n int tid = get_global_id(0);\n"
    " destValues[tid] = %s( sourceA[tid], sourceB[tid] );\n"
    "\n}\n";
|
|
|
|
// Three-component variant of the two-vectors-to-scalar kernel template: the
// buffers are reinterpreted as scalar double arrays and read with vload3.
// Placeholders, in order: size suffix for sourceA, size suffix for sourceB,
// and the name of the built-in function under test.
const char *twoToFloatKernelPattern_doubleV3 =
    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
    "__kernel void sample_test(__global double%s *sourceA, "
    "__global double%s *sourceB, __global double *destValues)\n"
    "{\n int tid = get_global_id(0);\n"
    " destValues[tid] = %s( vload3( tid, (__global double*) sourceA), "
    "vload3( tid, (__global double*) sourceB ) );\n"
    "\n}\n";
|
|
|
|
// printf-style kernel template for functions taking one vector and returning
// a scalar. Placeholders, in order: vector-size suffix for sourceA and the
// name of the built-in function under test.
const char *oneToFloatKernelPattern_double =
    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
    "__kernel void sample_test(__global double%s *sourceA, "
    "__global double *destValues)\n"
    "{\n int tid = get_global_id(0);\n"
    " destValues[tid] = %s( sourceA[tid] );\n"
    "\n}\n";
|
|
|
|
// Three-component variant of the one-vector-to-scalar kernel template: the
// input is read with vload3 from a scalar double view of the buffer.
// Placeholders, in order: size suffix for sourceA and the function name.
const char *oneToFloatKernelPattern_doubleV3 =
    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
    "__kernel void sample_test(__global double%s *sourceA, "
    "__global double *destValues)\n"
    "{\n int tid = get_global_id(0);\n"
    " destValues[tid] = %s( vload3( tid, (__global double*) sourceA) );\n"
    "\n}\n";
|
|
|
|
// printf-style kernel template for functions mapping one vector to one vector.
// Placeholders, in order: size suffix for sourceA, size suffix for destValues,
// and the name of the built-in function under test.
const char *oneToOneKernelPattern_double =
    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
    "__kernel void sample_test(__global double%s *sourceA, "
    "__global double%s *destValues)\n"
    "{\n int tid = get_global_id(0);\n"
    " destValues[tid] = %s( sourceA[tid] );\n"
    "\n}\n";
|
|
|
|
// Three-component variant of the vector-to-vector kernel template: both
// buffers are accessed through scalar double views with vload3/vstore3.
// Placeholders, in order: size suffix for sourceA, size suffix for destValues,
// and the name of the built-in function under test.
const char *oneToOneKernelPattern_doubleV3 =
    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
    "__kernel void sample_test(__global double%s *sourceA, "
    "__global double%s *destValues)\n"
    "{\n int tid = get_global_id(0);\n"
    " vstore3( %s( vload3( tid, (__global double*) sourceA) ), tid, "
    "(__global double*) destValues );\n"
    "\n}\n";
|
|
|
|
// Default number of input vectors (and work-items) used by the tests in this
// file; the cross-product test may halve it to fit device memory limits.
#define TEST_SIZE (1 << 20)
|
|
|
|
// Forward declaration: host reference for length(), defined later in this file.
double verifyLength_double( double *srcA, size_t vecSize );
|
|
// Forward declaration: host reference for distance(), defined later in this file.
double verifyDistance_double( double *srcA, double *srcB, size_t vecSize );
|
|
|
|
|
|
|
|
// Formats a vector as "{ a, b, ... }" using hexadecimal floating point (%a).
//
//   string   - destination buffer; the caller must make it large enough for
//              the formatted text plus the terminating NUL (no bound is
//              checked here).
//   vector   - the values to print.
//   elements - number of values to print; zero produces "{  }" instead of
//              reading vector[0].
void vector2string_double( char *string, double *vector, size_t elements )
{
    char *out = string;

    *out++ = '{';
    *out++ = ' ';

    // The first element has no leading separator; all later ones do.
    for( size_t i = 0; i < elements; i++ )
        out += sprintf( out, ( i == 0 ) ? "%a" : ", %a", vector[ i ] );

    *out++ = ' ';
    *out++ = '}';
    *out = '\0';
}
|
|
|
|
void fillWithTrickyNumbers_double( double *aVectors, double *bVectors, size_t vecSize )
|
|
{
|
|
static const cl_double trickyValues[] = { -FLT_EPSILON, FLT_EPSILON,
|
|
MAKE_HEX_DOUBLE(0x1.0p511, 0x1L, 511), MAKE_HEX_DOUBLE(0x1.8p511, 0x18L, 507), MAKE_HEX_DOUBLE(0x1.0p512, 0x1L, 512), MAKE_HEX_DOUBLE(-0x1.0p511, -0x1L, 511), MAKE_HEX_DOUBLE(-0x1.8p-511, -0x18L, -515), MAKE_HEX_DOUBLE(-0x1.0p512, -0x1L, 512),
|
|
MAKE_HEX_DOUBLE(0x1.0p-511, 0x1L, -511), MAKE_HEX_DOUBLE(0x1.8p-511, 0x18L, -515), MAKE_HEX_DOUBLE(0x1.0p-512, 0x1L, -512), MAKE_HEX_DOUBLE(-0x1.0p-511, -0x1L, -511), MAKE_HEX_DOUBLE(-0x1.8p-511, -0x18L, -515), MAKE_HEX_DOUBLE(-0x1.0p-512, -0x1L, -512),
|
|
DBL_MAX / 2., -DBL_MAX / 2., INFINITY, -INFINITY, 0., -0. };
|
|
static const size_t trickyCount = sizeof( trickyValues ) / sizeof( trickyValues[0] );
|
|
static const size_t stride[4] = {1, trickyCount, trickyCount*trickyCount, trickyCount*trickyCount*trickyCount };
|
|
size_t i, j, k;
|
|
|
|
for( j = 0; j < vecSize; j++ )
|
|
for( k = 0; k < vecSize; k++ )
|
|
for( i = 0; i < trickyCount; i++ )
|
|
aVectors[ j + stride[j] * (i + k*trickyCount)*vecSize] = trickyValues[i];
|
|
|
|
if( bVectors )
|
|
{
|
|
size_t copySize = vecSize * vecSize * trickyCount;
|
|
memset( bVectors, 0, sizeof(double) * copySize );
|
|
memset( aVectors + copySize, 0, sizeof(double) * copySize );
|
|
memcpy( bVectors + copySize, aVectors, sizeof(double) * copySize );
|
|
}
|
|
}
|
|
|
|
|
|
// Host reference for cross(): writes the 3-component cross product of vecA and
// vecB into outVector[0..2], sets outVector[3] to zero, and fills
// errorTolerances[0..2] with an absolute error bound per component. Each bound
// is the square of the largest input magnitude involved in that component,
// scaled by ulpTolerance * FLT_EPSILON.
void cross_product_double( const double *vecA, const double *vecB, double *outVector, double *errorTolerances, double ulpTolerance )
{
    int c;

    // Component c combines elements (c+1)%3 and (c+2)%3 of the inputs.
    for( c = 0; c < 3; c++ )
    {
        const int p = ( c + 1 ) % 3;
        const int q = ( c + 2 ) % 3;
        outVector[ c ] = ( vecA[ p ] * vecB[ q ] ) - ( vecA[ q ] * vecB[ p ] );
    }
    outVector[ 3 ] = 0.0f;

    for( c = 0; c < 3; c++ )
    {
        const int p = ( c + 1 ) % 3;
        const int q = ( c + 2 ) % 3;
        const double largest = fmax( fabs( vecA[ p ] ), fmax( fabs( vecB[ q ] ), fmax( fabs( vecA[ q ] ), fabs( vecB[ p ] ) ) ) );

        // max squared times the ulp tolerance: the worst-case variance
        // expected for this component
        errorTolerances[ c ] = largest * largest * ( ulpTolerance * FLT_EPSILON );
    }
}
|
|
|
|
// Tests the double-precision cross() built-in for 3- and 4-component vectors.
//
// For each vector width it sizes the buffers to respect the device's
// allocation limits, fills two input buffers with random and "tricky" values,
// runs the kernel, and compares every result against cross_product_double()
// using that function's per-component absolute tolerance.
//
// Returns 0 on success and -1 (or the test_error result) on failure.
// deviceID is used for the memory-limit queries; num_elements is unused.
int test_geom_cross_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
{
    cl_int error;
    cl_ulong maxAllocSize, maxGlobalMemSize;

    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( maxAllocSize ), &maxAllocSize, NULL );
    error |= clGetDeviceInfo( deviceID, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof( maxGlobalMemSize ), &maxGlobalMemSize, NULL );
    test_error( error, "Unable to get device config" );

    // Arguments follow the order of the labels in the format string (they
    // were previously swapped, so each limit was printed under the other's name).
    log_info("Device supports:\nCL_DEVICE_MAX_MEM_ALLOC_SIZE: %gMB\nCL_DEVICE_GLOBAL_MEM_SIZE: %gMB\n",
             maxAllocSize/(1024.0*1024.0), maxGlobalMemSize/(1024.0*1024.0));

    if (maxGlobalMemSize > (cl_ulong)SIZE_MAX) {
        maxGlobalMemSize = (cl_ulong)SIZE_MAX;
    }

    const unsigned int adjustment = 32*1024*1024; /* Try to allocate a bit less than the limits */

    for( int vecsize = 3; vecsize <= 4; ++vecsize )
    {
        /* Make sure we adhere to the maximum individual allocation size and global memory size limits. */
        unsigned int size = TEST_SIZE;
        unsigned int bufSize = sizeof(cl_double) * TEST_SIZE * vecsize;

        while ((bufSize > (maxAllocSize - adjustment)) || (3*bufSize > (maxGlobalMemSize - adjustment))) {
            size /= 2;
            bufSize = sizeof(cl_double) * size * vecsize;
        }

        /* Perform the test */
        clProgramWrapper program;
        clKernelWrapper kernel;
        clMemWrapper streams[3];
        cl_double testVector[4];
        unsigned int i;
        size_t threads[1], localThreads[1];
        BufferOwningPtr<cl_double> A(malloc(bufSize));
        BufferOwningPtr<cl_double> B(malloc(bufSize));
        BufferOwningPtr<cl_double> C(malloc(bufSize));
        cl_double *inDataA = A;
        cl_double *inDataB = B;
        cl_double *outData = C;

        // The host buffers are written immediately below, so bail out early
        // if any allocation failed.
        if( inDataA == NULL || inDataB == NULL || outData == NULL )
        {
            log_error("ERROR: Unable to allocate host buffers!\n");
            return -1;
        }

        /* Create kernels */
        if( create_single_kernel_helper( context, &program, &kernel, 1, vecsize == 3 ? &crossKernelSource_doubleV3 : &crossKernelSource_double, "sample_test" ) )
            return -1;

        /* Generate some streams. Note: deliberately do some random data in w to verify that it gets ignored */
        for( i = 0; i < size * vecsize; i++ )
        {
            inDataA[ i ] = get_random_double( -512.f, 512.f, d );
            inDataB[ i ] = get_random_double( -512.f, 512.f, d );
        }
        // NOTE(review): fillWithTrickyNumbers_double does no bounds checking;
        // if `size` was reduced far below TEST_SIZE above, confirm the buffers
        // are still large enough for the indices it writes.
        fillWithTrickyNumbers_double( inDataA, inDataB, vecsize );

        streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), bufSize, inDataA, NULL);
        if( streams[0] == NULL )
        {
            log_error("ERROR: Creating input array A failed!\n");
            return -1;
        }
        streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), bufSize, inDataB, NULL);
        if( streams[1] == NULL )
        {
            log_error("ERROR: Creating input array B failed!\n");
            return -1;
        }
        streams[2] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), bufSize, NULL, NULL);
        if( streams[2] == NULL )
        {
            log_error("ERROR: Creating output array failed!\n");
            return -1;
        }

        /* Assign streams and execute */
        for( i = 0; i < 3; i++ )
        {
            error = clSetKernelArg(kernel, i, sizeof( streams[i] ), &streams[i]);
            test_error( error, "Unable to set indexed kernel arguments" );
        }

        /* Run the kernel */
        threads[0] = size;

        error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
        test_error( error, "Unable to get work group size to use" );

        error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
        test_error( error, "Unable to execute test kernel" );

        /* Now get the results */
        error = clEnqueueReadBuffer( queue, streams[2], true, 0, bufSize, outData, 0, NULL, NULL );
        test_error( error, "Unable to read output array!" );

        /* And verify! */
        for( i = 0; i < size; i++ )
        {
            const cl_double *srcA = inDataA + i * vecsize;
            const cl_double *srcB = inDataB + i * vecsize;
            const cl_double *got = outData + i * vecsize;
            double errorTolerances[ 4 ];

            // On an embedded device w/ round-to-zero, 3 ulps is the worst-case tolerance for cross product
            cross_product_double( srcA, srcB, testVector, errorTolerances, 3.f );

            double errs[] = { fabs( testVector[ 0 ] - got[ 0 ] ),
                              fabs( testVector[ 1 ] - got[ 1 ] ),
                              fabs( testVector[ 2 ] - got[ 2 ] ) };

            if( errs[ 0 ] > errorTolerances[ 0 ] || errs[ 1 ] > errorTolerances[ 1 ] || errs[ 2 ] > errorTolerances[ 2 ] )
            {
                // A 3-component result has no fourth element: index 3 would be
                // the next vector's x (or past the end of the buffer for the
                // last vector), so report 0 in that slot instead.
                const double gotW = ( vecsize == 4 ) ? got[ 3 ] : 0.0;

                log_error( "ERROR: Data sample %d does not validate! Expected (%a,%a,%a,%a), got (%a,%a,%a,%a)\n",
                           (int)i, testVector[0], testVector[1], testVector[2], testVector[3],
                           got[0], got[1], got[2], gotW );
                log_error( " Input: (%a %a %a) and (%a %a %a)\n",
                           srcA[ 0 ], srcA[ 1 ], srcA[ 2 ],
                           srcB[ 0 ], srcB[ 1 ], srcB[ 2 ] );
                log_error( " Errors: (%a out of %a), (%a out of %a), (%a out of %a)\n",
                           errs[ 0 ], errorTolerances[ 0 ], errs[ 1 ], errorTolerances[ 1 ], errs[ 2 ], errorTolerances[ 2 ] );
                log_error(" ulp %g\n", Ulp_Error_Double( got[ 1 ], testVector[ 1 ] ) );
                return -1;
            }
        }
    }
    return 0;
}
|
|
|
|
// Returns the largest absolute value found among the first vecSize elements of
// vecA and vecB. Element 0 of both arrays is always read.
double getMaxValue_double( double vecA[], double vecB[], size_t vecSize )
{
    double largest = fmax( fabs( vecA[ 0 ] ), fabs( vecB[ 0 ] ) );
    size_t idx = 1;

    while( idx < vecSize )
    {
        const double magA = fabs( vecA[ idx ] );
        const double magB = fabs( vecB[ idx ] );
        largest = fmax( magA, fmax( magB, largest ) );
        ++idx;
    }
    return largest;
}
|
|
|
|
// Host reference callback: computes the expected scalar result from two input
// vectors of vecSize components each.
typedef double (*twoToFloatVerifyFn_double)( double *srcA, double *srcB, size_t vecSize );
|
|
|
|
// Builds and runs a kernel applying the built-in `fnName` to TEST_SIZE pairs
// of double vectors, then checks each scalar result against `verifyFn`.
//
//   vecSize  - components per input vector; indexes the size-suffix table and
//              must therefore be in 1..16 (the callers in this file pass 1..4).
//   verifyFn - host reference implementation.
//   ulpLimit - maximum allowed error in ulps. A negative value selects an
//              absolute tolerance derived from the largest input magnitude
//              instead (used by dot).
//
// Returns 0 if every sample validates, -1 (or the test_error result) otherwise.
int test_twoToFloat_kernel_double(cl_command_queue queue, cl_context context, const char *fnName,
                                  size_t vecSize, twoToFloatVerifyFn_double verifyFn, double ulpLimit, MTdata d )
{
    clProgramWrapper program;
    clKernelWrapper kernel;
    clMemWrapper streams[3];
    int error;
    size_t i, threads[1], localThreads[1];
    char kernelSource[10240];
    char *programPtr;
    char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };

    // sizeNames is indexed with vecSize-1 below; reject anything outside it.
    if( vecSize == 0 || vecSize > sizeof( sizeNames ) / sizeof( sizeNames[0] ) )
    {
        log_error( "ERROR: Unsupported vector size %d\n", (int)vecSize );
        return -1;
    }

    BufferOwningPtr<cl_double> A(malloc(sizeof(cl_double) * TEST_SIZE * vecSize));
    BufferOwningPtr<cl_double> B(malloc(sizeof(cl_double) * TEST_SIZE * vecSize));
    BufferOwningPtr<cl_double> C(malloc(sizeof(cl_double) * TEST_SIZE));

    cl_double *inDataA = A;
    cl_double *inDataB = B;
    cl_double *outData = C;

    // The host buffers are filled right away, so fail cleanly on OOM.
    if( inDataA == NULL || inDataB == NULL || outData == NULL )
    {
        log_error("ERROR: Unable to allocate host buffers!\n");
        return -1;
    }

    /* Create the source (bounded so an unexpectedly long fnName cannot overrun the buffer) */
    snprintf( kernelSource, sizeof( kernelSource ), vecSize == 3 ? twoToFloatKernelPattern_doubleV3 : twoToFloatKernelPattern_double, sizeNames[vecSize-1], sizeNames[vecSize-1], fnName );

    /* Create kernels */
    programPtr = kernelSource;
    if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) )
        return -1;

    /* Generate some streams */
    for( i = 0; i < TEST_SIZE * vecSize; i++ )
    {
        inDataA[ i ] = any_double(d);
        inDataB[ i ] = any_double(d);
    }
    fillWithTrickyNumbers_double( inDataA, inDataB, vecSize );

    streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), sizeof(cl_double) * vecSize * TEST_SIZE, inDataA, NULL);
    if( streams[0] == NULL )
    {
        log_error("ERROR: Creating input array A failed!\n");
        return -1;
    }
    streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), sizeof(cl_double) * vecSize * TEST_SIZE, inDataB, NULL);
    if( streams[1] == NULL )
    {
        log_error("ERROR: Creating input array B failed!\n");
        return -1;
    }
    streams[2] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), sizeof(cl_double) * TEST_SIZE, NULL, NULL);
    if( streams[2] == NULL )
    {
        log_error("ERROR: Creating output array failed!\n");
        return -1;
    }

    /* Assign streams and execute */
    for( i = 0; i < 3; i++ )
    {
        error = clSetKernelArg(kernel, (int)i, sizeof( streams[i] ), &streams[i]);
        test_error( error, "Unable to set indexed kernel arguments" );
    }

    /* Run the kernel */
    threads[0] = TEST_SIZE;

    error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
    test_error( error, "Unable to get work group size to use" );

    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
    test_error( error, "Unable to execute test kernel" );

    /* Now get the results */
    error = clEnqueueReadBuffer( queue, streams[2], true, 0, sizeof( cl_double ) * TEST_SIZE, outData, 0, NULL, NULL );
    test_error( error, "Unable to read output array!" );

    /* And verify! */
    for( i = 0; i < TEST_SIZE; i++ )
    {
        cl_double *srcA = inDataA + i * vecSize;
        cl_double *srcB = inDataB + i * vecSize;
        double expected = verifyFn( srcA, srcB, vecSize );

        // Exact matches and NaN-vs-NaN are always accepted.
        if( (double) expected == outData[ i ] )
            continue;
        if( isnan(expected) && isnan( outData[i] ) )
            continue;

        if( ulpLimit < 0 )
        {
            // Limit below zero means we need to test via a computed error (like cross product does)
            double maxValue = getMaxValue_double( srcA, srcB, vecSize );

            // In this case (dot is the only one that gets here), the ulp is 2*vecSize - 1 (n + n-1 max # of errors)
            double errorTolerance = maxValue * maxValue * ( 2.f * (double)vecSize - 1.f ) * FLT_EPSILON;

            double absError = fabs( (double)expected - (double)outData[ i ] );

            // Written as !(a > b) so that a NaN error is accepted, exactly as
            // the `if( error > tolerance )` form it replaces.
            if( !( absError > errorTolerance ) )
                continue;

            log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), sources (%a and %a) error of %g against tolerance %g\n",
                       (int)i, (int)vecSize, expected,
                       outData[ i ],
                       srcA[ 0 ],
                       srcB[ 0 ],
                       (double)absError,
                       (double)errorTolerance );
        }
        else
        {
            double ulps = Ulp_Error_Double( outData[ i ], expected );

            if( !( fabs(ulps) > ulpLimit ) )
                continue;

            log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), sources (%a and %a) ulp of %f\n",
                       (int)i, (int)vecSize, expected,
                       outData[ i ],
                       srcA[ 0 ],
                       srcB[ 0 ],
                       ulps );
        }

        // Both failure paths finish by dumping the full input vectors.
        char vecA[1000], vecB[1000];
        vector2string_double( vecA, srcA, vecSize );
        vector2string_double( vecB, srcB, vecSize );
        log_error( "\tvector A: %s\n\tvector B: %s\n", vecA, vecB );
        return -1;
    }
    return 0;
}
|
|
|
|
// Host reference for dot(): the sum of element-wise products of the first
// vecSize elements of srcA and srcB, accumulated in index order.
double verifyDot_double( double *srcA, double *srcB, size_t vecSize )
{
    double sum = 0.f;
    const double *pa = srcA;
    const double *pb = srcB;

    for( size_t remaining = vecSize; remaining > 0; --remaining )
    {
        sum += (double)( *pa ) * (double)( *pb );
        ++pa;
        ++pb;
    }

    return sum;
}
|
|
|
|
// Runs the dot() test for vector widths 1 through 4. Returns 0 if every width
// passes and -1 if any of them fails; all widths are attempted regardless.
int test_geom_dot_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
{
    // Zero-terminated list of vector widths to exercise.
    const size_t widths[] = { 1, 2, 3, 4, 0 };
    int status = 0;

    for( const size_t *w = widths; *w != 0; ++w )
    {
        // A negative ulp limit selects the absolute-tolerance check.
        const int rc = test_twoToFloat_kernel_double( queue, context, "dot", *w, verifyDot_double, -1.0f /*magic value*/, d );
        if( rc == 0 )
            continue;

        log_error( " dot double vector size %d FAILED\n", (int)*w );
        status = -1;
    }
    return status;
}
|
|
|
|
|
|
// Placeholder for a fast_distance() test. It aborts immediately because there
// is no double precision fast_distance; the loop below is never reached.
int test_geom_fast_distance_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
{
    // Zero-terminated list of vector widths.
    const size_t widths[] = { 1, 2, 3, 4, 0 };
    int status = 0;

    abort(); //there is no double precision fast_distance

    for( const size_t *w = widths; *w != 0; ++w )
    {
        const double n = (double) *w;
        const double nMinus1 = (double) ( *w - 1 );

        double maxUlps = 8192.0f +            // error in sqrt
                         0.5f *               // effect on e of taking sqrt( x + e )
                         ( 1.5f * n +         // cumulative error for multiplications
                           0.5f * nMinus1 );  // cumulative error for additions

        const bool passed = test_twoToFloat_kernel_double( queue, context, "fast_distance", *w, verifyDistance_double, maxUlps, d ) == 0;
        if( passed )
        {
            log_info( " fast_distance double vector size %d passed\n", (int)*w );
        }
        else
        {
            log_error( " fast_distance double vector size %d FAILED\n", (int)*w );
            status = -1;
        }
    }
    return status;
}
|
|
|
|
|
|
double verifyDistance_double( double *srcA, double *srcB, size_t vecSize )
|
|
{
|
|
unsigned int i;
|
|
double diff[4];
|
|
|
|
for( i = 0; i < vecSize; i++ )
|
|
diff[i] = srcA[i] - srcB[i];
|
|
|
|
return verifyLength_double( diff, vecSize );
|
|
}
|
|
|
|
// Runs the distance() test for vector widths 1 through 4, logging a pass/fail
// line per width. Returns 0 if every width passes, -1 otherwise.
int test_geom_distance_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
{
    // Zero-terminated list of vector widths to exercise.
    const size_t widths[] = { 1, 2, 3, 4, 0 };
    int status = 0;

    for( const size_t *w = widths; *w != 0; ++w )
    {
        const double n = (double) *w;
        const double nMinus1 = (double) ( *w - 1 );

        double maxUlps = 3.0f +               // error in sqrt
                         0.5f *               // effect on e of taking sqrt( x + e )
                         ( 1.5f * n +         // cumulative error for multiplications (a-b+0.5ulp)**2 = (a-b)**2 + a*0.5ulp + b*0.5 ulp + 0.5 ulp for multiplication
                           0.5f * nMinus1 );  // cumulative error for additions

        maxUlps *= 2.0; // our reference code may be in error too

        const bool passed = test_twoToFloat_kernel_double( queue, context, "distance", *w, verifyDistance_double, maxUlps, d ) == 0;
        if( passed )
        {
            log_info( " distance double vector size %d passed\n", (int)*w );
        }
        else
        {
            log_error( " distance double vector size %d FAILED\n", (int)*w );
            status = -1;
        }
    }
    return status;
}
|
|
|
|
// Host reference callback: computes the expected scalar result from one input
// vector of vecSize components.
typedef double (*oneToFloatVerifyFn_double)( double *srcA, size_t vecSize );
|
|
|
|
// Builds and runs a kernel applying the built-in `fnName` to TEST_SIZE double
// vectors, then checks each scalar result against `verifyFn`.
//
//   vecSize  - components per input vector; indexes the size-suffix table and
//              must therefore be in 1..16 (the callers in this file pass 1..4).
//   verifyFn - host reference implementation.
//   ulpLimit - maximum allowed error in ulps.
//
// Returns 0 if every sample validates, -1 (or the test_error result) otherwise.
int test_oneToFloat_kernel_double(cl_command_queue queue, cl_context context, const char *fnName,
                                  size_t vecSize, oneToFloatVerifyFn_double verifyFn, double ulpLimit, MTdata d )
{
    clProgramWrapper program;
    clKernelWrapper kernel;
    clMemWrapper streams[2];
    int error;
    size_t i, threads[1], localThreads[1];
    char kernelSource[10240];
    char *programPtr;
    char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };

    // sizeNames is indexed with vecSize-1 below; reject anything outside it.
    if( vecSize == 0 || vecSize > sizeof( sizeNames ) / sizeof( sizeNames[0] ) )
    {
        log_error( "ERROR: Unsupported vector size %d\n", (int)vecSize );
        return -1;
    }

    BufferOwningPtr<cl_double> A(malloc(sizeof(cl_double) * TEST_SIZE * vecSize));
    BufferOwningPtr<cl_double> B(malloc(sizeof(cl_double) * TEST_SIZE));
    cl_double *inDataA = A;
    cl_double *outData = B;

    // The host buffers are filled right away, so fail cleanly on OOM.
    if( inDataA == NULL || outData == NULL )
    {
        log_error("ERROR: Unable to allocate host buffers!\n");
        return -1;
    }

    /* Create the source (bounded so an unexpectedly long fnName cannot overrun the buffer) */
    snprintf( kernelSource, sizeof( kernelSource ), vecSize == 3 ? oneToFloatKernelPattern_doubleV3 : oneToFloatKernelPattern_double, sizeNames[vecSize-1], fnName );

    /* Create kernels */
    programPtr = kernelSource;
    if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) )
        return -1;

    /* Generate some streams */
    for( i = 0; i < TEST_SIZE * vecSize; i++ )
        inDataA[ i ] = any_double(d);

    fillWithTrickyNumbers_double( inDataA, NULL, vecSize );

    streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), sizeof(cl_double) * vecSize * TEST_SIZE, inDataA, NULL);
    if( streams[0] == NULL )
    {
        log_error("ERROR: Creating input array A failed!\n");
        return -1;
    }
    streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), sizeof(cl_double) * TEST_SIZE, NULL, NULL);
    if( streams[1] == NULL )
    {
        log_error("ERROR: Creating output array failed!\n");
        return -1;
    }

    /* Assign streams and execute */
    error = clSetKernelArg( kernel, 0, sizeof( streams[ 0 ] ), &streams[0] );
    test_error( error, "Unable to set indexed kernel arguments" );
    error = clSetKernelArg( kernel, 1, sizeof( streams[ 1 ] ), &streams[1] );
    test_error( error, "Unable to set indexed kernel arguments" );

    /* Run the kernel */
    threads[0] = TEST_SIZE;

    error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
    test_error( error, "Unable to get work group size to use" );

    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
    test_error( error, "Unable to execute test kernel" );

    /* Now get the results */
    error = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof( cl_double ) * TEST_SIZE, outData, 0, NULL, NULL );
    test_error( error, "Unable to read output array!" );

    /* And verify! */
    for( i = 0; i < TEST_SIZE; i++ )
    {
        cl_double *src = inDataA + i * vecSize;
        double expected = verifyFn( src, vecSize );

        if( (double) expected == outData[ i ] )
            continue;

        double ulps = Ulp_Error_Double( outData[i], expected );
        if( fabs( ulps ) <= ulpLimit )
            continue;

        // We have to special case NAN
        if( isnan( outData[ i ] ) && isnan( expected ) )
            continue;

        // Reaching this point means the ulp error exceeded the limit (or is
        // NaN), so the sample fails. The former `!(fabs(ulps) < ulpLimit)`
        // re-check was always true here and has been dropped.
        log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), source (%a), ulp %f\n",
                   (int)i, (int)vecSize, expected, outData[ i ], src[ 0 ], ulps );
        char vecA[1000];
        vector2string_double( vecA, src, vecSize );
        // Newline added so the following log line does not run on from this one.
        log_error( "\tvector: %s\n", vecA );
        return -1;
    }

    return 0;
}
|
|
|
|
double verifyLength_double( double *srcA, size_t vecSize )
|
|
{
|
|
double total = 0;
|
|
unsigned int i;
|
|
|
|
// We calculate the distance as a double, to try and make up for the fact that
|
|
// the GPU has better precision distance since it's a single op
|
|
for( i = 0; i < vecSize; i++ )
|
|
total += srcA[i] * srcA[i];
|
|
|
|
// Deal with spurious overflow
|
|
if( total == INFINITY )
|
|
{
|
|
total = 0.0;
|
|
for( i = 0; i < vecSize; i++ )
|
|
{
|
|
double f = srcA[i] * MAKE_HEX_DOUBLE(0x1.0p-600, 0x1LL, -600);
|
|
total += f * f;
|
|
}
|
|
|
|
return sqrt( total ) * MAKE_HEX_DOUBLE(0x1.0p600, 0x1LL, 600);
|
|
}
|
|
|
|
// Deal with spurious underflow
|
|
if( total < 4 /*max vector length*/ * DBL_MIN / DBL_EPSILON )
|
|
{
|
|
total = 0.0;
|
|
for( i = 0; i < vecSize; i++ )
|
|
{
|
|
double f = srcA[i] * MAKE_HEX_DOUBLE(0x1.0p700, 0x1LL, 700);
|
|
total += f * f;
|
|
}
|
|
|
|
return sqrt( total ) * MAKE_HEX_DOUBLE(0x1.0p-700, 0x1LL, -700);
|
|
}
|
|
|
|
return sqrt( total );
|
|
}
|
|
|
|
// Runs the length() test for vector widths 1 through 4, logging a pass/fail
// line per width. Returns 0 if every width passes, -1 otherwise.
int test_geom_length_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
{
    // Zero-terminated list of vector widths to exercise.
    const size_t widths[] = { 1, 2, 3, 4, 0 };
    int status = 0;

    for( const size_t *w = widths; *w != 0; ++w )
    {
        const double n = (double) *w;
        const double nMinus1 = (double) ( *w - 1 );

        double maxUlps = 3.0f +               // error in sqrt
                         0.5f *               // effect on e of taking sqrt( x + e )
                         ( 0.5f * n +         // cumulative error for multiplications
                           0.5f * nMinus1 );  // cumulative error for additions

        maxUlps *= 2.0; // our reference code may be in error too

        const bool passed = test_oneToFloat_kernel_double( queue, context, "length", *w, verifyLength_double, maxUlps, d ) == 0;
        if( passed )
        {
            log_info( " length double vector size %d passed\n", (int)*w );
        }
        else
        {
            log_error( " length double vector size %d FAILED\n", (int)*w );
            status = -1;
        }
    }
    return status;
}
|
|
|
|
|
|
// Host reference for fast_length(): the square root of the sum of squares of
// the first vecSize elements of srcA, with no overflow/underflow rescaling.
double verifyFastLength_double( double *srcA, size_t vecSize )
{
    double sumSq = 0;
    const double *p = srcA;

    for( size_t remaining = vecSize; remaining > 0; --remaining, ++p )
        sumSq += (double)( *p ) * (double)( *p );

    return sqrt( sumSq );
}
|
|
|
|
// Placeholder for a fast_length() test. It aborts immediately because there is
// no double precision fast_length; the loop below is never reached.
int test_geom_fast_length_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
{
    // Zero-terminated list of vector widths.
    const size_t widths[] = { 1, 2, 3, 4, 0 };
    int status = 0;

    abort(); //there is no double precision fast_length

    for( const size_t *w = widths; *w != 0; ++w )
    {
        const double n = (double) *w;
        const double nMinus1 = (double) ( *w - 1 );

        double maxUlps = 8192.0f +            // error in half_sqrt
                         0.5f *               // effect on e of taking sqrt( x + e )
                         ( 0.5f * n +         // cumulative error for multiplications
                           0.5f * nMinus1 );  // cumulative error for additions

        const bool passed = test_oneToFloat_kernel_double( queue, context, "fast_length", *w, verifyFastLength_double, maxUlps, d ) == 0;
        if( passed )
        {
            log_info( " fast_length double vector size %d passed\n", (int)*w );
        }
        else
        {
            log_error( " fast_length double vector size %d FAILED\n", (int)*w );
            status = -1;
        }
    }
    return status;
}
|
|
|
|
|
|
// Host reference callback: computes the expected output vector dstA from the
// input vector srcA, both of vecSize components.
typedef void (*oneToOneVerifyFn_double)( double *srcA, double *dstA, size_t vecSize );
|
|
|
|
int test_oneToOne_kernel_double(cl_command_queue queue, cl_context context, const char *fnName,
|
|
size_t vecSize, oneToOneVerifyFn_double verifyFn, double ulpLimit, MTdata d )
|
|
{
|
|
clProgramWrapper program;
|
|
clKernelWrapper kernel;
|
|
clMemWrapper streams[2];
|
|
BufferOwningPtr<cl_double> A(malloc(sizeof(cl_double) * TEST_SIZE * vecSize));
|
|
BufferOwningPtr<cl_double> B(malloc(sizeof(cl_double) * TEST_SIZE * vecSize));
|
|
int error;
|
|
size_t i, j, threads[1], localThreads[1];
|
|
char kernelSource[10240];
|
|
char *programPtr;
|
|
char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
|
|
cl_double *inDataA = A;
|
|
cl_double *outData = B;
|
|
|
|
/* Create the source */
|
|
sprintf( kernelSource, vecSize == 3 ? oneToOneKernelPattern_doubleV3 : oneToOneKernelPattern_double, sizeNames[vecSize-1], sizeNames[vecSize-1], fnName );
|
|
|
|
/* Create kernels */
|
|
programPtr = kernelSource;
|
|
if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) )
|
|
return -1;
|
|
|
|
/* initialize data */
|
|
memset( inDataA, 0, vecSize * sizeof( cl_double ) );
|
|
for( i = vecSize; i < TEST_SIZE * vecSize; i++ )
|
|
inDataA[ i ] = any_double(d);
|
|
|
|
|
|
streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), sizeof(cl_double) * vecSize * TEST_SIZE, inDataA, NULL);
|
|
if( streams[0] == NULL )
|
|
{
|
|
log_error("ERROR: Creating input array A failed!\n");
|
|
return -1;
|
|
}
|
|
streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), sizeof(cl_double) * vecSize * TEST_SIZE, NULL, NULL);
|
|
if( streams[1] == NULL )
|
|
{
|
|
log_error("ERROR: Creating output array failed!\n");
|
|
return -1;
|
|
}
|
|
|
|
/* Assign streams and execute */
|
|
error = clSetKernelArg(kernel, 0, sizeof( streams[0] ), &streams[0] );
|
|
test_error( error, "Unable to set indexed kernel arguments" );
|
|
error = clSetKernelArg(kernel, 1, sizeof( streams[1] ), &streams[1] );
|
|
test_error( error, "Unable to set indexed kernel arguments" );
|
|
|
|
/* Run the kernel */
|
|
threads[0] = TEST_SIZE;
|
|
|
|
error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
|
|
test_error( error, "Unable to get work group size to use" );
|
|
|
|
error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
|
|
test_error( error, "Unable to execute test kernel" );
|
|
|
|
/* Now get the results */
|
|
error = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof( cl_double ) * TEST_SIZE * vecSize, outData, 0, NULL, NULL );
|
|
test_error( error, "Unable to read output array!" );
|
|
|
|
/* And verify! */
|
|
for( i = 0; i < TEST_SIZE; i++ )
|
|
{
|
|
double expected[4];
|
|
verifyFn( inDataA + i * vecSize, expected, vecSize );
|
|
for( j = 0; j < vecSize; j++ )
|
|
{
|
|
// We have to special case NAN
|
|
if( isnan( outData[ i * vecSize + j ] ) && isnan( expected[ j ] ) )
|
|
continue;
|
|
|
|
if( expected[j] != outData[ i *vecSize+j ] )
|
|
{
|
|
double error =
|
|
Ulp_Error_Double( outData[i*vecSize + j ], expected[ j ] );
|
|
if( fabs(error) > ulpLimit )
|
|
{
|
|
log_error( "ERROR: Data sample {%d,%d} at size %d does not validate! Expected %12.24f (%a), got %12.24f (%a), ulp %f\n",
|
|
(int)i, (int)j, (int)vecSize,
|
|
expected[j], expected[j],
|
|
outData[i*vecSize +j],
|
|
outData[i*vecSize +j], error );
|
|
log_error( " Source: " );
|
|
for( size_t q = 0; q < vecSize; q++ )
|
|
log_error( "%g ", inDataA[ i * vecSize + q ] );
|
|
log_error( "\n : " );
|
|
for( size_t q = 0; q < vecSize; q++ )
|
|
log_error( "%a ", inDataA[ i * vecSize + q ] );
|
|
log_error( "\n" );
|
|
log_error( " Result: " );
|
|
for( size_t q = 0; q < vecSize; q++ )
|
|
log_error( "%g ", outData[i * vecSize + q ] );
|
|
log_error( "\n : " );
|
|
for( size_t q = 0; q < vecSize; q++ )
|
|
log_error( "%a ", outData[i * vecSize + q ] );
|
|
log_error( "\n" );
|
|
log_error( " Expected: " );
|
|
for( size_t q = 0; q < vecSize; q++ )
|
|
log_error( "%g ", expected[ q ] );
|
|
log_error( "\n : " );
|
|
for( size_t q = 0; q < vecSize; q++ )
|
|
log_error( "%a ", expected[ q ] );
|
|
log_error( "\n" );
|
|
return -1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
void verifyNormalize_double( double *srcA, double *dst, size_t vecSize )
|
|
{
|
|
double total = 0, value;
|
|
unsigned int i;
|
|
|
|
// We calculate everything as a double, to try and make up for the fact that
|
|
// the GPU has better precision distance since it's a single op
|
|
for( i = 0; i < vecSize; i++ )
|
|
total += (double)srcA[i] * (double)srcA[i];
|
|
|
|
if( total < vecSize * DBL_MIN / DBL_EPSILON )
|
|
{ //we may have incurred denormalization loss -- rescale
|
|
total = 0;
|
|
for( i = 0; i < vecSize; i++ )
|
|
{
|
|
dst[i] = srcA[i] * MAKE_HEX_DOUBLE(0x1.0p700, 0x1LL, 700); //exact
|
|
total += dst[i] * dst[i];
|
|
}
|
|
|
|
//If still zero
|
|
if( total == 0.0 )
|
|
{
|
|
// Special edge case: copy vector over without change
|
|
for( i = 0; i < vecSize; i++ )
|
|
dst[i] = srcA[i];
|
|
return;
|
|
}
|
|
|
|
srcA = dst;
|
|
}
|
|
else if( total == INFINITY )
|
|
{ //we may have incurred spurious overflow
|
|
double scale = MAKE_HEX_DOUBLE(0x1.0p-512, 0x1LL, -512) / vecSize;
|
|
total = 0;
|
|
for( i = 0; i < vecSize; i++ )
|
|
{
|
|
dst[i] = srcA[i] * scale; //exact
|
|
total += dst[i] * dst[i];
|
|
}
|
|
|
|
// If there are infinities here, handle those
|
|
if( total == INFINITY )
|
|
{
|
|
total = 0;
|
|
for( i = 0; i < vecSize; i++ )
|
|
{
|
|
if( isinf(dst[i]) )
|
|
{
|
|
dst[i] = copysign( 1.0, srcA[i] );
|
|
total += 1.0;
|
|
}
|
|
else
|
|
dst[i] = copysign( 0.0, srcA[i] );
|
|
}
|
|
}
|
|
|
|
srcA = dst;
|
|
}
|
|
|
|
value = sqrt( total );
|
|
|
|
for( i = 0; i < vecSize; i++ )
|
|
dst[i] = srcA[i] / value;
|
|
}
|
|
|
|
// Runs the normalize() test for vector widths 1 through 4, logging a pass/fail
// line per width. Returns 0 if every width passes, -1 otherwise.
int test_geom_normalize_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
{
    // Zero-terminated list of vector widths to exercise.
    const size_t widths[] = { 1, 2, 3, 4, 0 };
    int status = 0;

    for( const size_t *w = widths; *w != 0; ++w )
    {
        const double n = (double) *w;
        const double nMinus1 = (double) ( *w - 1 );

        double maxUlps = 2.5f +               // error in rsqrt + error in multiply
                         0.5f *               // effect on e of taking sqrt( x + e )
                         ( 0.5f * n +         // cumulative error for multiplications
                           0.5f * nMinus1 );  // cumulative error for additions

        maxUlps *= 2.0; //our reference code is not infinitely precise and may have error of its own

        const bool passed = test_oneToOne_kernel_double( queue, context, "normalize", *w, verifyNormalize_double, maxUlps, d ) == 0;
        if( passed )
        {
            log_info( " normalize double vector size %d passed\n", (int)*w );
        }
        else
        {
            log_error( " normalize double vector size %d FAILED\n", (int)*w );
            status = -1;
        }
    }
    return status;
}
|
|
|
|
|
|
|
|
|
|
|