Files
OpenCL-CTS/test_conformance/geometrics/test_geometrics_double.cpp
Kevin Petit 53db6e7f9f Synchronise with Khronos-private Gitlab branch
The maintenance of the conformance tests is moving to Github.

This commit contains all the changes that have been done in
Gitlab since the first public release of the conformance tests.

Signed-off-by: Kevin Petit <kevin.petit@arm.com>
2019-03-05 16:24:06 +00:00

942 lines
36 KiB
C++

//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "testBase.h"
#include "../../test_common/harness/typeWrappers.h"
#include "../../test_common/harness/conversions.h"
#include "../../test_common/harness/errorHelpers.h"
const char *crossKernelSource_double =
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"__kernel void sample_test(__global double4 *sourceA, __global double4 *sourceB, __global double4 *destValues)\n"
"{\n"
" int tid = get_global_id(0);\n"
" destValues[tid] = cross( sourceA[tid], sourceB[tid] );\n"
"\n"
"}\n";
const char *crossKernelSource_doubleV3 =
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"__kernel void sample_test(__global double *sourceA, __global double *sourceB, __global double *destValues)\n"
"{\n"
" int tid = get_global_id(0);\n"
" vstore3( cross( vload3( tid, sourceA), vload3( tid, sourceB) ), tid, destValues);\n"
"\n"
"}\n";
const char *twoToFloatKernelPattern_double =
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"__kernel void sample_test(__global double%s *sourceA, __global double%s *sourceB, __global double *destValues)\n"
"{\n"
" int tid = get_global_id(0);\n"
" destValues[tid] = %s( sourceA[tid], sourceB[tid] );\n"
"\n"
"}\n";
const char *twoToFloatKernelPattern_doubleV3 =
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"__kernel void sample_test(__global double%s *sourceA, __global double%s *sourceB, __global double *destValues)\n"
"{\n"
" int tid = get_global_id(0);\n"
" destValues[tid] = %s( vload3( tid, (__global double*) sourceA), vload3( tid, (__global double*) sourceB ) );\n"
"\n"
"}\n";
const char *oneToFloatKernelPattern_double =
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"__kernel void sample_test(__global double%s *sourceA, __global double *destValues)\n"
"{\n"
" int tid = get_global_id(0);\n"
" destValues[tid] = %s( sourceA[tid] );\n"
"\n"
"}\n";
const char *oneToFloatKernelPattern_doubleV3 =
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"__kernel void sample_test(__global double%s *sourceA, __global double *destValues)\n"
"{\n"
" int tid = get_global_id(0);\n"
" destValues[tid] = %s( vload3( tid, (__global double*) sourceA) );\n"
"\n"
"}\n";
const char *oneToOneKernelPattern_double =
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"__kernel void sample_test(__global double%s *sourceA, __global double%s *destValues)\n"
"{\n"
" int tid = get_global_id(0);\n"
" destValues[tid] = %s( sourceA[tid] );\n"
"\n"
"}\n";
const char *oneToOneKernelPattern_doubleV3 =
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"__kernel void sample_test(__global double%s *sourceA, __global double%s *destValues)\n"
"{\n"
" int tid = get_global_id(0);\n"
" vstore3( %s( vload3( tid, (__global double*) sourceA) ), tid, (__global double*) destValues );\n"
"\n"
"}\n";
#define TEST_SIZE (1 << 20)
double verifyLength_double( double *srcA, size_t vecSize );
double verifyDistance_double( double *srcA, double *srcB, size_t vecSize );
void vector2string_double( char *string, double *vector, size_t elements )
{
*string++ = '{';
*string++ = ' ';
string += sprintf( string, "%a", vector[0] );
size_t i;
for( i = 1; i < elements; i++ )
string += sprintf( string, ", %a", vector[i] );
*string++ = ' ';
*string++ = '}';
*string = '\0';
}
void fillWithTrickyNumbers_double( double *aVectors, double *bVectors, size_t vecSize )
{
static const cl_double trickyValues[] = { -FLT_EPSILON, FLT_EPSILON,
MAKE_HEX_DOUBLE(0x1.0p511, 0x1L, 511), MAKE_HEX_DOUBLE(0x1.8p511, 0x18L, 507), MAKE_HEX_DOUBLE(0x1.0p512, 0x1L, 512), MAKE_HEX_DOUBLE(-0x1.0p511, -0x1L, 511), MAKE_HEX_DOUBLE(-0x1.8p-511, -0x18L, -515), MAKE_HEX_DOUBLE(-0x1.0p512, -0x1L, 512),
MAKE_HEX_DOUBLE(0x1.0p-511, 0x1L, -511), MAKE_HEX_DOUBLE(0x1.8p-511, 0x18L, -515), MAKE_HEX_DOUBLE(0x1.0p-512, 0x1L, -512), MAKE_HEX_DOUBLE(-0x1.0p-511, -0x1L, -511), MAKE_HEX_DOUBLE(-0x1.8p-511, -0x18L, -515), MAKE_HEX_DOUBLE(-0x1.0p-512, -0x1L, -512),
DBL_MAX / 2., -DBL_MAX / 2., INFINITY, -INFINITY, 0., -0. };
static const size_t trickyCount = sizeof( trickyValues ) / sizeof( trickyValues[0] );
static const size_t stride[4] = {1, trickyCount, trickyCount*trickyCount, trickyCount*trickyCount*trickyCount };
size_t i, j, k;
for( j = 0; j < vecSize; j++ )
for( k = 0; k < vecSize; k++ )
for( i = 0; i < trickyCount; i++ )
aVectors[ j + stride[j] * (i + k*trickyCount)*vecSize] = trickyValues[i];
if( bVectors )
{
size_t copySize = vecSize * vecSize * trickyCount;
memset( bVectors, 0, sizeof(double) * copySize );
memset( aVectors + copySize, 0, sizeof(double) * copySize );
memcpy( bVectors + copySize, aVectors, sizeof(double) * copySize );
}
}
void cross_product_double( const double *vecA, const double *vecB, double *outVector, double *errorTolerances, double ulpTolerance )
{
outVector[ 0 ] = ( vecA[ 1 ] * vecB[ 2 ] ) - ( vecA[ 2 ] * vecB[ 1 ] );
outVector[ 1 ] = ( vecA[ 2 ] * vecB[ 0 ] ) - ( vecA[ 0 ] * vecB[ 2 ] );
outVector[ 2 ] = ( vecA[ 0 ] * vecB[ 1 ] ) - ( vecA[ 1 ] * vecB[ 0 ] );
outVector[ 3 ] = 0.0f;
errorTolerances[ 0 ] = fmax( fabs( vecA[ 1 ] ), fmax( fabs( vecB[ 2 ] ), fmax( fabs( vecA[ 2 ] ), fabs( vecB[ 1 ] ) ) ) );
errorTolerances[ 1 ] = fmax( fabs( vecA[ 2 ] ), fmax( fabs( vecB[ 0 ] ), fmax( fabs( vecA[ 0 ] ), fabs( vecB[ 2 ] ) ) ) );
errorTolerances[ 2 ] = fmax( fabs( vecA[ 0 ] ), fmax( fabs( vecB[ 1 ] ), fmax( fabs( vecA[ 1 ] ), fabs( vecB[ 0 ] ) ) ) );
errorTolerances[ 0 ] = errorTolerances[ 0 ] * errorTolerances[ 0 ] * ( ulpTolerance * FLT_EPSILON ); // This gives us max squared times ulp tolerance, i.e. the worst-case expected variance we could expect from this result
errorTolerances[ 1 ] = errorTolerances[ 1 ] * errorTolerances[ 1 ] * ( ulpTolerance * FLT_EPSILON );
errorTolerances[ 2 ] = errorTolerances[ 2 ] * errorTolerances[ 2 ] * ( ulpTolerance * FLT_EPSILON );
}
int test_geom_cross_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
{
cl_int error;
cl_ulong maxAllocSize, maxGlobalMemSize;
error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( maxAllocSize ), &maxAllocSize, NULL );
error |= clGetDeviceInfo( deviceID, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof( maxGlobalMemSize ), &maxGlobalMemSize, NULL );
test_error( error, "Unable to get device config" );
log_info("Device supports:\nCL_DEVICE_MAX_MEM_ALLOC_SIZE: %gMB\nCL_DEVICE_GLOBAL_MEM_SIZE: %gMB\n",
maxGlobalMemSize/(1024.0*1024.0), maxAllocSize/(1024.0*1024.0));
if (maxGlobalMemSize > (cl_ulong)SIZE_MAX) {
maxGlobalMemSize = (cl_ulong)SIZE_MAX;
}
unsigned int size;
unsigned int bufSize;
unsigned int adjustment;
int vecsize;
adjustment = 32*1024*1024; /* Try to allocate a bit less than the limits */
for(vecsize = 3; vecsize <= 4; ++vecsize)
{
/* Make sure we adhere to the maximum individual allocation size and global memory size limits. */
size = TEST_SIZE;
bufSize = sizeof(cl_double) * TEST_SIZE * vecsize;
while ((bufSize > (maxAllocSize - adjustment)) || (3*bufSize > (maxGlobalMemSize - adjustment))) {
size /= 2;
bufSize = sizeof(cl_double) * size * vecsize;
}
/* Perform the test */
clProgramWrapper program;
clKernelWrapper kernel;
clMemWrapper streams[3];
cl_double testVector[4];
int error, i;
size_t threads[1], localThreads[1];
BufferOwningPtr<cl_double> A(malloc(bufSize));
BufferOwningPtr<cl_double> B(malloc(bufSize));
BufferOwningPtr<cl_double> C(malloc(bufSize));
cl_double *inDataA = A;
cl_double *inDataB = B;
cl_double *outData = C;
/* Create kernels */
if( create_single_kernel_helper( context, &program, &kernel, 1, vecsize == 3 ? &crossKernelSource_doubleV3 : &crossKernelSource_double, "sample_test" ) )
return -1;
/* Generate some streams. Note: deliberately do some random data in w to verify that it gets ignored */
for( i = 0; i < size * vecsize; i++ )
{
inDataA[ i ] = get_random_double( -512.f, 512.f, d );
inDataB[ i ] = get_random_double( -512.f, 512.f, d );
}
fillWithTrickyNumbers_double( inDataA, inDataB, vecsize );
streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), bufSize, inDataA, NULL);
if( streams[0] == NULL )
{
log_error("ERROR: Creating input array A failed!\n");
return -1;
}
streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), bufSize, inDataB, NULL);
if( streams[1] == NULL )
{
log_error("ERROR: Creating input array B failed!\n");
return -1;
}
streams[2] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), bufSize, NULL, NULL);
if( streams[2] == NULL )
{
log_error("ERROR: Creating output array failed!\n");
return -1;
}
/* Assign streams and execute */
for( i = 0; i < 3; i++ )
{
error = clSetKernelArg(kernel, i, sizeof( streams[i] ), &streams[i]);
test_error( error, "Unable to set indexed kernel arguments" );
}
/* Run the kernel */
threads[0] = size;
error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
test_error( error, "Unable to get work group size to use" );
error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
test_error( error, "Unable to execute test kernel" );
/* Now get the results */
error = clEnqueueReadBuffer( queue, streams[2], true, 0, bufSize, outData, 0, NULL, NULL );
test_error( error, "Unable to read output array!" );
/* And verify! */
for( i = 0; i < size; i++ )
{
double errorTolerances[ 4 ];
// On an embedded device w/ round-to-zero, 3 ulps is the worst-case tolerance for cross product
cross_product_double( inDataA + i * vecsize, inDataB + i * vecsize, testVector, errorTolerances, 3.f );
double errs[] = { fabs( testVector[ 0 ] - outData[ i * vecsize + 0 ] ),
fabs( testVector[ 1 ] - outData[ i * vecsize + 1 ] ),
fabs( testVector[ 2 ] - outData[ i * vecsize + 2 ] ) };
if( errs[ 0 ] > errorTolerances[ 0 ] || errs[ 1 ] > errorTolerances[ 1 ] || errs[ 2 ] > errorTolerances[ 2 ] )
{
log_error( "ERROR: Data sample %d does not validate! Expected (%a,%a,%a,%a), got (%a,%a,%a,%a)\n",
i, testVector[0], testVector[1], testVector[2], testVector[3],
outData[i*vecsize], outData[i*vecsize+1], outData[i*vecsize+2], outData[i*vecsize+3] );
log_error( " Input: (%a %a %a) and (%a %a %a)\n",
inDataA[ i * vecsize + 0 ], inDataA[ i * vecsize + 1 ], inDataA[ i * vecsize + 2 ],
inDataB[ i * vecsize + 0 ], inDataB[ i * vecsize + 1 ], inDataB[ i * vecsize + 2 ] );
log_error( " Errors: (%a out of %a), (%a out of %a), (%a out of %a)\n",
errs[ 0 ], errorTolerances[ 0 ], errs[ 1 ], errorTolerances[ 1 ], errs[ 2 ], errorTolerances[ 2 ] );
log_error(" ulp %g\n", Ulp_Error_Double( outData[ i * vecsize + 1 ], testVector[ 1 ] ) );
return -1;
}
}
}
return 0;
}
double getMaxValue_double( double vecA[], double vecB[], size_t vecSize )
{
double a = fmax( fabs( vecA[ 0 ] ), fabs( vecB[ 0 ] ) );
for( size_t i = 1; i < vecSize; i++ )
a = fmax( fabs( vecA[ i ] ), fmax( fabs( vecB[ i ] ), a ) );
return a;
}
typedef double (*twoToFloatVerifyFn_double)( double *srcA, double *srcB, size_t vecSize );
int test_twoToFloat_kernel_double(cl_command_queue queue, cl_context context, const char *fnName,
size_t vecSize, twoToFloatVerifyFn_double verifyFn, double ulpLimit, MTdata d )
{
clProgramWrapper program;
clKernelWrapper kernel;
clMemWrapper streams[3];
int error;
size_t i, threads[1], localThreads[1];
char kernelSource[10240];
char *programPtr;
char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
BufferOwningPtr<cl_double> A(malloc(sizeof(cl_double) * TEST_SIZE * vecSize));
BufferOwningPtr<cl_double> B(malloc(sizeof(cl_double) * TEST_SIZE * vecSize));
BufferOwningPtr<cl_double> C(malloc(sizeof(cl_double) * TEST_SIZE));
cl_double *inDataA = A;
cl_double *inDataB = B;
cl_double *outData = C;
/* Create the source */
sprintf( kernelSource, vecSize == 3 ? twoToFloatKernelPattern_doubleV3 : twoToFloatKernelPattern_double, sizeNames[vecSize-1], sizeNames[vecSize-1], fnName );
/* Create kernels */
programPtr = kernelSource;
if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) )
return -1;
/* Generate some streams */
for( i = 0; i < TEST_SIZE * vecSize; i++ )
{
inDataA[ i ] = any_double(d);
inDataB[ i ] = any_double(d);
}
fillWithTrickyNumbers_double( inDataA, inDataB, vecSize );
streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), sizeof(cl_double) * vecSize * TEST_SIZE, inDataA, NULL);
if( streams[0] == NULL )
{
log_error("ERROR: Creating input array A failed!\n");
return -1;
}
streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), sizeof(cl_double) * vecSize * TEST_SIZE, inDataB, NULL);
if( streams[1] == NULL )
{
log_error("ERROR: Creating input array B failed!\n");
return -1;
}
streams[2] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), sizeof(cl_double) * TEST_SIZE, NULL, NULL);
if( streams[2] == NULL )
{
log_error("ERROR: Creating output array failed!\n");
return -1;
}
/* Assign streams and execute */
for( i = 0; i < 3; i++ )
{
error = clSetKernelArg(kernel, (int)i, sizeof( streams[i] ), &streams[i]);
test_error( error, "Unable to set indexed kernel arguments" );
}
/* Run the kernel */
threads[0] = TEST_SIZE;
error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
test_error( error, "Unable to get work group size to use" );
error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
test_error( error, "Unable to execute test kernel" );
/* Now get the results */
error = clEnqueueReadBuffer( queue, streams[2], true, 0, sizeof( cl_double ) * TEST_SIZE, outData, 0, NULL, NULL );
test_error( error, "Unable to read output array!" );
/* And verify! */
for( i = 0; i < TEST_SIZE; i++ )
{
double expected = verifyFn( inDataA + i * vecSize, inDataB + i * vecSize, vecSize );
if( (double) expected != outData[ i ] )
{
if( isnan(expected) && isnan( outData[i] ) )
continue;
if( ulpLimit < 0 )
{
// Limit below zero means we need to test via a computed error (like cross product does)
double maxValue =
getMaxValue_double( inDataA + i * vecSize, inDataB + i * vecSize, vecSize );
// In this case (dot is the only one that gets here), the ulp is 2*vecSize - 1 (n + n-1 max # of errors)
double errorTolerance = maxValue * maxValue * ( 2.f * (double)vecSize - 1.f ) * FLT_EPSILON;
// Limit below zero means test via epsilon instead
double error = fabs( (double)expected - (double)outData[ i ] );
if( error > errorTolerance )
{
log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), sources (%a and %a) error of %g against tolerance %g\n",
(int)i, (int)vecSize, expected,
outData[ i ],
inDataA[i*vecSize],
inDataB[i*vecSize],
(double)error,
(double)errorTolerance );
char vecA[1000], vecB[1000];
vector2string_double( vecA, inDataA + i * vecSize, vecSize );
vector2string_double( vecB, inDataB + i * vecSize, vecSize );
log_error( "\tvector A: %s\n\tvector B: %s\n", vecA, vecB );
return -1;
}
}
else
{
double error = Ulp_Error_Double( outData[ i ],
expected );
if( fabs(error) > ulpLimit )
{
log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), sources (%a and %a) ulp of %f\n",
(int)i, (int)vecSize, expected,
outData[ i ],
inDataA[i*vecSize],
inDataB[i*vecSize],
error );
char vecA[1000], vecB[1000];
vector2string_double( vecA, inDataA + i * vecSize, vecSize );
vector2string_double( vecB, inDataB + i * vecSize, vecSize );
log_error( "\tvector A: %s\n\tvector B: %s\n", vecA, vecB );
return -1;
}
}
}
}
return 0;
}
double verifyDot_double( double *srcA, double *srcB, size_t vecSize )
{
double total = 0.f;
for( unsigned int i = 0; i < vecSize; i++ )
total += (double)srcA[ i ] * (double)srcB[ i ];
return total;
}
int test_geom_dot_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
{
size_t sizes[] = { 1, 2, 3, 4, 0 };
unsigned int size;
int retVal = 0;
for( size = 0; sizes[ size ] != 0 ; size++ )
{
if( test_twoToFloat_kernel_double( queue, context, "dot", sizes[ size ], verifyDot_double, -1.0f /*magic value*/, d ) != 0 )
{
log_error( " dot double vector size %d FAILED\n", (int)sizes[ size ] );
retVal = -1;
}
}
return retVal;
}
int test_geom_fast_distance_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
{
size_t sizes[] = { 1, 2, 3, 4, 0 };
unsigned int size;
int retVal = 0;
abort(); //there is no double precision fast_distance
for( size = 0; sizes[ size ] != 0 ; size++ )
{
double maxUlps = 8192.0f + // error in sqrt
0.5f * // effect on e of taking sqrt( x + e )
( 1.5f * (double) sizes[size] + // cumulative error for multiplications (a-b+0.5ulp)**2 = (a-b)**2 + a*0.5ulp + b*0.5 ulp + 0.5 ulp for multiplication
0.5f * (double) (sizes[size]-1)); // cumulative error for additions
if( test_twoToFloat_kernel_double( queue, context, "fast_distance", sizes[ size ], verifyDistance_double, maxUlps, d ) != 0 )
{
log_error( " fast_distance double vector size %d FAILED\n", (int)sizes[ size ] );
retVal = -1;
}
else
{
log_info( " fast_distance double vector size %d passed\n", (int)sizes[ size ] );
}
}
return retVal;
}
double verifyDistance_double( double *srcA, double *srcB, size_t vecSize )
{
unsigned int i;
double diff[4];
for( i = 0; i < vecSize; i++ )
diff[i] = srcA[i] - srcB[i];
return verifyLength_double( diff, vecSize );
}
int test_geom_distance_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
{
size_t sizes[] = { 1, 2, 3, 4, 0 };
unsigned int size;
int retVal = 0;
for( size = 0; sizes[ size ] != 0 ; size++ )
{
double maxUlps = 3.0f + // error in sqrt
0.5f * // effect on e of taking sqrt( x + e )
( 1.5f * (double) sizes[size] + // cumulative error for multiplications (a-b+0.5ulp)**2 = (a-b)**2 + a*0.5ulp + b*0.5 ulp + 0.5 ulp for multiplication
0.5f * (double) (sizes[size]-1)); // cumulative error for additions
maxUlps *= 2.0; // our reference code may be in error too
if( test_twoToFloat_kernel_double( queue, context, "distance", sizes[ size ], verifyDistance_double, maxUlps, d ) != 0 )
{
log_error( " distance double vector size %d FAILED\n", (int)sizes[ size ] );
retVal = -1;
}
else
{
log_info( " distance double vector size %d passed\n", (int)sizes[ size ] );
}
}
return retVal;
}
typedef double (*oneToFloatVerifyFn_double)( double *srcA, size_t vecSize );
int test_oneToFloat_kernel_double(cl_command_queue queue, cl_context context, const char *fnName,
size_t vecSize, oneToFloatVerifyFn_double verifyFn, double ulpLimit, MTdata d )
{
clProgramWrapper program;
clKernelWrapper kernel;
clMemWrapper streams[2];
BufferOwningPtr<cl_double> A(malloc(sizeof(cl_double) * TEST_SIZE * vecSize));
BufferOwningPtr<cl_double> B(malloc(sizeof(cl_double) * TEST_SIZE));
int error;
size_t i, threads[1], localThreads[1];
char kernelSource[10240];
char *programPtr;
char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
cl_double *inDataA = A;
cl_double *outData = B;
/* Create the source */
sprintf( kernelSource, vecSize == 3 ? oneToFloatKernelPattern_doubleV3 : oneToFloatKernelPattern_double, sizeNames[vecSize-1], fnName );
/* Create kernels */
programPtr = kernelSource;
if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) )
return -1;
/* Generate some streams */
for( i = 0; i < TEST_SIZE * vecSize; i++ )
inDataA[ i ] = any_double(d);
fillWithTrickyNumbers_double( inDataA, NULL, vecSize );
streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), sizeof(cl_double) * vecSize * TEST_SIZE, inDataA, NULL);
if( streams[0] == NULL )
{
log_error("ERROR: Creating input array A failed!\n");
return -1;
}
streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), sizeof(cl_double) * TEST_SIZE, NULL, NULL);
if( streams[1] == NULL )
{
log_error("ERROR: Creating output array failed!\n");
return -1;
}
/* Assign streams and execute */
error = clSetKernelArg( kernel, 0, sizeof( streams[ 0 ] ), &streams[0] );
test_error( error, "Unable to set indexed kernel arguments" );
error = clSetKernelArg( kernel, 1, sizeof( streams[ 1 ] ), &streams[1] );
test_error( error, "Unable to set indexed kernel arguments" );
/* Run the kernel */
threads[0] = TEST_SIZE;
error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
test_error( error, "Unable to get work group size to use" );
error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
test_error( error, "Unable to execute test kernel" );
/* Now get the results */
error = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof( cl_double ) * TEST_SIZE, outData, 0, NULL, NULL );
test_error( error, "Unable to read output array!" );
/* And verify! */
for( i = 0; i < TEST_SIZE; i++ )
{
double expected = verifyFn( inDataA + i * vecSize, vecSize );
if( (double) expected != outData[ i ] )
{
double ulps = Ulp_Error_Double( outData[i], expected );
if( fabs( ulps ) <= ulpLimit )
continue;
// We have to special case NAN
if( isnan( outData[ i ] ) && isnan( expected ) )
continue;
if(! (fabs(ulps) < ulpLimit) )
{
log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), source (%a), ulp %f\n",
(int)i, (int)vecSize, expected, outData[ i ], inDataA[i*vecSize], ulps );
char vecA[1000];
vector2string_double( vecA, inDataA + i * vecSize, vecSize );
log_error( "\tvector: %s", vecA );
return -1;
}
}
}
return 0;
}
double verifyLength_double( double *srcA, size_t vecSize )
{
double total = 0;
unsigned int i;
// We calculate the distance as a double, to try and make up for the fact that
// the GPU has better precision distance since it's a single op
for( i = 0; i < vecSize; i++ )
total += srcA[i] * srcA[i];
// Deal with spurious overflow
if( total == INFINITY )
{
total = 0.0;
for( i = 0; i < vecSize; i++ )
{
double f = srcA[i] * MAKE_HEX_DOUBLE(0x1.0p-600, 0x1LL, -600);
total += f * f;
}
return sqrt( total ) * MAKE_HEX_DOUBLE(0x1.0p600, 0x1LL, 600);
}
// Deal with spurious underflow
if( total < 4 /*max vector length*/ * DBL_MIN / DBL_EPSILON )
{
total = 0.0;
for( i = 0; i < vecSize; i++ )
{
double f = srcA[i] * MAKE_HEX_DOUBLE(0x1.0p700, 0x1LL, 700);
total += f * f;
}
return sqrt( total ) * MAKE_HEX_DOUBLE(0x1.0p-700, 0x1LL, -700);
}
return sqrt( total );
}
int test_geom_length_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
{
size_t sizes[] = { 1, 2, 3, 4, 0 };
unsigned int size;
int retVal = 0;
for( size = 0; sizes[ size ] != 0 ; size++ )
{
double maxUlps = 3.0f + // error in sqrt
0.5f * // effect on e of taking sqrt( x + e )
( 0.5f * (double) sizes[size] + // cumulative error for multiplications
0.5f * (double) (sizes[size]-1)); // cumulative error for additions
maxUlps *= 2.0; // our reference code may be in error too
if( test_oneToFloat_kernel_double( queue, context, "length", sizes[ size ], verifyLength_double, maxUlps, d ) != 0 )
{
log_error( " length double vector size %d FAILED\n", (int)sizes[ size ] );
retVal = -1;
}
else
{
log_info( " length double vector size %d passed\n", (int)sizes[ size ] );
}
}
return retVal;
}
double verifyFastLength_double( double *srcA, size_t vecSize )
{
double total = 0;
unsigned int i;
// We calculate the distance as a double, to try and make up for the fact that
// the GPU has better precision distance since it's a single op
for( i = 0; i < vecSize; i++ )
{
total += (double)srcA[i] * (double)srcA[i];
}
return sqrt( total );
}
int test_geom_fast_length_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
{
size_t sizes[] = { 1, 2, 3, 4, 0 };
unsigned int size;
int retVal = 0;
abort(); //there is no double precision fast_length
for( size = 0; sizes[ size ] != 0 ; size++ )
{
double maxUlps = 8192.0f + // error in half_sqrt
0.5f * // effect on e of taking sqrt( x + e )
( 0.5f * (double) sizes[size] + // cumulative error for multiplications
0.5f * (double) (sizes[size]-1)); // cumulative error for additions
if( test_oneToFloat_kernel_double( queue, context, "fast_length", sizes[ size ], verifyFastLength_double, maxUlps, d ) != 0 )
{
log_error( " fast_length double vector size %d FAILED\n", (int)sizes[ size ] );
retVal = -1;
}
else
{
log_info( " fast_length double vector size %d passed\n", (int)sizes[ size ] );
}
}
return retVal;
}
typedef void (*oneToOneVerifyFn_double)( double *srcA, double *dstA, size_t vecSize );
int test_oneToOne_kernel_double(cl_command_queue queue, cl_context context, const char *fnName,
size_t vecSize, oneToOneVerifyFn_double verifyFn, double ulpLimit, MTdata d )
{
clProgramWrapper program;
clKernelWrapper kernel;
clMemWrapper streams[2];
BufferOwningPtr<cl_double> A(malloc(sizeof(cl_double) * TEST_SIZE * vecSize));
BufferOwningPtr<cl_double> B(malloc(sizeof(cl_double) * TEST_SIZE * vecSize));
int error;
size_t i, j, threads[1], localThreads[1];
char kernelSource[10240];
char *programPtr;
char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
cl_double *inDataA = A;
cl_double *outData = B;
/* Create the source */
sprintf( kernelSource, vecSize == 3 ? oneToOneKernelPattern_doubleV3 : oneToOneKernelPattern_double, sizeNames[vecSize-1], sizeNames[vecSize-1], fnName );
/* Create kernels */
programPtr = kernelSource;
if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) )
return -1;
/* initialize data */
memset( inDataA, 0, vecSize * sizeof( cl_double ) );
for( i = vecSize; i < TEST_SIZE * vecSize; i++ )
inDataA[ i ] = any_double(d);
streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), sizeof(cl_double) * vecSize * TEST_SIZE, inDataA, NULL);
if( streams[0] == NULL )
{
log_error("ERROR: Creating input array A failed!\n");
return -1;
}
streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), sizeof(cl_double) * vecSize * TEST_SIZE, NULL, NULL);
if( streams[1] == NULL )
{
log_error("ERROR: Creating output array failed!\n");
return -1;
}
/* Assign streams and execute */
error = clSetKernelArg(kernel, 0, sizeof( streams[0] ), &streams[0] );
test_error( error, "Unable to set indexed kernel arguments" );
error = clSetKernelArg(kernel, 1, sizeof( streams[1] ), &streams[1] );
test_error( error, "Unable to set indexed kernel arguments" );
/* Run the kernel */
threads[0] = TEST_SIZE;
error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
test_error( error, "Unable to get work group size to use" );
error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
test_error( error, "Unable to execute test kernel" );
/* Now get the results */
error = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof( cl_double ) * TEST_SIZE * vecSize, outData, 0, NULL, NULL );
test_error( error, "Unable to read output array!" );
/* And verify! */
for( i = 0; i < TEST_SIZE; i++ )
{
double expected[4];
verifyFn( inDataA + i * vecSize, expected, vecSize );
for( j = 0; j < vecSize; j++ )
{
// We have to special case NAN
if( isnan( outData[ i * vecSize + j ] ) && isnan( expected[ j ] ) )
continue;
if( expected[j] != outData[ i *vecSize+j ] )
{
double error =
Ulp_Error_Double( outData[i*vecSize + j ], expected[ j ] );
if( fabs(error) > ulpLimit )
{
log_error( "ERROR: Data sample {%d,%d} at size %d does not validate! Expected %12.24f (%a), got %12.24f (%a), ulp %f\n",
(int)i, (int)j, (int)vecSize,
expected[j], expected[j],
outData[i*vecSize +j],
outData[i*vecSize +j], error );
log_error( " Source: " );
for( size_t q = 0; q < vecSize; q++ )
log_error( "%g ", inDataA[ i * vecSize + q ] );
log_error( "\n : " );
for( size_t q = 0; q < vecSize; q++ )
log_error( "%a ", inDataA[ i * vecSize + q ] );
log_error( "\n" );
log_error( " Result: " );
for( size_t q = 0; q < vecSize; q++ )
log_error( "%g ", outData[i * vecSize + q ] );
log_error( "\n : " );
for( size_t q = 0; q < vecSize; q++ )
log_error( "%a ", outData[i * vecSize + q ] );
log_error( "\n" );
log_error( " Expected: " );
for( size_t q = 0; q < vecSize; q++ )
log_error( "%g ", expected[ q ] );
log_error( "\n : " );
for( size_t q = 0; q < vecSize; q++ )
log_error( "%a ", expected[ q ] );
log_error( "\n" );
return -1;
}
}
}
}
return 0;
}
void verifyNormalize_double( double *srcA, double *dst, size_t vecSize )
{
double total = 0, value;
unsigned int i;
// We calculate everything as a double, to try and make up for the fact that
// the GPU has better precision distance since it's a single op
for( i = 0; i < vecSize; i++ )
total += (double)srcA[i] * (double)srcA[i];
if( total < vecSize * DBL_MIN / DBL_EPSILON )
{ //we may have incurred denormalization loss -- rescale
total = 0;
for( i = 0; i < vecSize; i++ )
{
dst[i] = srcA[i] * MAKE_HEX_DOUBLE(0x1.0p700, 0x1LL, 700); //exact
total += dst[i] * dst[i];
}
//If still zero
if( total == 0.0 )
{
// Special edge case: copy vector over without change
for( i = 0; i < vecSize; i++ )
dst[i] = srcA[i];
return;
}
srcA = dst;
}
else if( total == INFINITY )
{ //we may have incurred spurious overflow
double scale = MAKE_HEX_DOUBLE(0x1.0p-512, 0x1LL, -512) / vecSize;
total = 0;
for( i = 0; i < vecSize; i++ )
{
dst[i] = srcA[i] * scale; //exact
total += dst[i] * dst[i];
}
// If there are infinities here, handle those
if( total == INFINITY )
{
total = 0;
for( i = 0; i < vecSize; i++ )
{
if( isinf(dst[i]) )
{
dst[i] = copysign( 1.0, srcA[i] );
total += 1.0;
}
else
dst[i] = copysign( 0.0, srcA[i] );
}
}
srcA = dst;
}
value = sqrt( total );
for( i = 0; i < vecSize; i++ )
dst[i] = srcA[i] / value;
}
int test_geom_normalize_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
{
size_t sizes[] = { 1, 2, 3, 4, 0 };
unsigned int size;
int retVal = 0;
for( size = 0; sizes[ size ] != 0 ; size++ )
{
double maxUlps = 2.5f + // error in rsqrt + error in multiply
0.5f * // effect on e of taking sqrt( x + e )
( 0.5f * (double) sizes[size] + // cumulative error for multiplications
0.5f * (double) (sizes[size]-1)); // cumulative error for additions
maxUlps *= 2.0; //our reference code is not infinitely precise and may have error of its own
if( test_oneToOne_kernel_double( queue, context, "normalize", sizes[ size ], verifyNormalize_double, maxUlps, d ) != 0 )
{
log_error( " normalize double vector size %d FAILED\n", (int)sizes[ size ] );
retVal = -1;
}
else
{
log_info( " normalize double vector size %d passed\n", (int)sizes[ size ] );
}
}
return retVal;
}