Files
OpenCL-CTS/test_conformance/basic/test_vloadstore.c
Kevin Petit d8733efc0f Synchronise with Khronos-private Gitlab branch
The maintenance of the conformance tests is moving to Github.

This commit contains all the changes that have been done in
Gitlab since the first public release of the conformance tests.

Signed-off-by: Kevin Petit <kevin.petit@arm.com>
2019-03-05 16:23:49 +00:00

996 lines
42 KiB
C

//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#if !defined(_WIN32)
#include <stdint.h>
#endif
#include <string.h>
#if !defined(_WIN32)
#include <stdbool.h>
#endif
#include <limits.h>
#include <sys/types.h>
#include <sys/stat.h>
#include "procs.h"
#include "../../test_common/harness/conversions.h"
#include "../../test_common/harness/typeWrappers.h"
#include "../../test_common/harness/errorHelpers.h"
// Outputs debug information for stores
#define DEBUG 0
// Forces stores/loads to be done with offsets = tid
#define LINEAR_OFFSETS 0
#define NUM_LOADS 512
static const char *doubleExtensionPragma = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
#pragma mark -------------------- vload harness --------------------------
typedef void (*create_vload_program_fn)( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize );
// Generic vloadN test harness.
//
// Builds a kernel from createFn, fills an input buffer with random data,
// executes one vloadN per work-item at a random element offset plus a random
// sub-vector "alignment" offset (to exercise unaligned loads), then compares
// each loaded vector against a host-side memcpy of the same bytes.
//
// Returns 0 on success, 1 on the first mismatching sample (logged).
// Kernels generated with a __local staging buffer are detected by their
// argument count (5 args instead of 4) and get the local size set via a
// NULL clSetKernelArg.
int test_vload( cl_device_id device, cl_context context, cl_command_queue queue, ExplicitType type, unsigned int vecSize,
create_vload_program_fn createFn, size_t bufferSize, MTdata d )
{
int error;
clProgramWrapper program;
clKernelWrapper kernel;
clMemWrapper streams[ 4 ];
// Debug builds shrink the workload so the dump below stays readable.
const size_t numLoads = (DEBUG) ? 16 : NUM_LOADS;
if (DEBUG) bufferSize = (bufferSize < 128) ? bufferSize : 128;
size_t threads[ 1 ], localThreads[ 1 ];
clProtectedArray inBuffer( bufferSize );
char programSrc[ 10240 ];
// offsets[i] = element offset for load i; alignmentOffsets[i] = extra scalar
// offset in [0, vecSize-1] applied to the load base pointer.
cl_uint offsets[ numLoads ], alignmentOffsets[ numLoads ];
size_t numElements, typeSize, i;
unsigned int outVectorSize;
typeSize = get_explicit_type_size( type );
numElements = bufferSize / ( typeSize * vecSize );
bufferSize = numElements * typeSize * vecSize; // To account for rounding
if (DEBUG) log_info("Testing: numLoads: %d, typeSize: %d, vecSize: %d, numElements: %d, bufferSize: %d\n", (int)numLoads, (int)typeSize, vecSize, (int)numElements, (int)bufferSize);
// Create some random input data and random offsets to load from
generate_random_data( type, numElements * vecSize, d, (void *)inBuffer );
for( i = 0; i < numLoads; i++ )
{
offsets[ i ] = (cl_uint)random_in_range( 0, (int)numElements - 1, d );
// Only apply a misalignment when the load is far enough from the end of
// the buffer that the misaligned vload cannot read out of bounds.
if( offsets[ i ] < numElements - 2 )
alignmentOffsets[ i ] = (cl_uint)random_in_range( 0, (int)vecSize - 1, d );
else
alignmentOffsets[ i ] = 0;
if (LINEAR_OFFSETS) offsets[i] = (cl_uint)i;
}
if (LINEAR_OFFSETS) log_info("Offsets set to thread IDs to simplify output.\n");
// 32-bit fixup
outVectorSize = vecSize;
// Declare output buffers now
// (MSVC has no VLAs, so fall back to _malloca there.)
#if !(defined(_WIN32) && defined(_MSC_VER))
char outBuffer[ numLoads * typeSize * outVectorSize ];
char referenceBuffer[ numLoads * typeSize * vecSize ];
#else
char* outBuffer = (char*)_malloca(numLoads * typeSize * outVectorSize * sizeof(cl_char));
char* referenceBuffer = (char*)_malloca(numLoads * typeSize * vecSize * sizeof(cl_char));
#endif
// Create the program
createFn( programSrc, numElements, type, vecSize, outVectorSize);
// Create our kernel
const char *ptr = programSrc;
error = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "test_fn" );
test_error( error, "Unable to create testing kernel" );
if (DEBUG) log_info("Kernel: \n%s\n", programSrc);
// Get the number of args to differentiate the kernels with local storage. (They have 5)
cl_uint numArgs;
error = clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(numArgs), &numArgs, NULL);
test_error( error, "clGetKernelInfo failed");
// Set up parameters
// streams: 0 = input data, 1 = element offsets, 2 = alignment offsets,
// 3 = per-thread results (initial contents are don't-care).
streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, bufferSize, (void *)inBuffer, &error );
test_error( error, "Unable to create kernel stream" );
streams[ 1 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*sizeof(offsets[0]), offsets, &error );
test_error( error, "Unable to create kernel stream" );
streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*sizeof(alignmentOffsets[0]), alignmentOffsets, &error );
test_error( error, "Unable to create kernel stream" );
streams[ 3 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*typeSize*outVectorSize, (void *)outBuffer, &error );
test_error( error, "Unable to create kernel stream" );
// Set parameters and run
if (numArgs == 5) {
// We need to set the size of the local storage
error = clSetKernelArg(kernel, 0, bufferSize, NULL);
test_error( error, "clSetKernelArg for buffer failed");
// Buffers shift by one to make room for the leading __local argument.
for( i = 0; i < 4; i++ )
{
error = clSetKernelArg( kernel, (int)i+1, sizeof( streams[ i ] ), &streams[ i ] );
test_error( error, "Unable to set kernel argument" );
}
} else {
// No local storage
for( i = 0; i < 4; i++ )
{
error = clSetKernelArg( kernel, (int)i, sizeof( streams[ i ] ), &streams[ i ] );
test_error( error, "Unable to set kernel argument" );
}
}
// One work-item per load.
threads[ 0 ] = numLoads;
error = get_max_common_work_group_size( context, kernel, threads[ 0 ], &localThreads[ 0 ] );
test_error( error, "Unable to get local thread size" );
error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
test_error( error, "Unable to exec kernel" );
// Get the results
error = clEnqueueReadBuffer( queue, streams[ 3 ], CL_TRUE, 0, numLoads * typeSize * outVectorSize * sizeof(cl_char), (void *)outBuffer, 0, NULL, NULL );
test_error( error, "Unable to read results" );
// Create the reference results
// Reference for load i is simply the bytes at
// (offsets[i] * vecSize + alignmentOffsets[i]) scalar elements into the input.
memset( referenceBuffer, 0, numLoads * typeSize * vecSize * sizeof(cl_char));
for( i = 0; i < numLoads; i++ )
{
memcpy( referenceBuffer + i * typeSize * vecSize, ( (char *)(void *)inBuffer ) + ( ( offsets[ i ] * vecSize ) + alignmentOffsets[ i ] ) * typeSize,
typeSize * vecSize );
}
// Validate the results now
char *expected = referenceBuffer;
char *actual = outBuffer;
char *in = (char *)(void *)inBuffer;
if (DEBUG) {
// Dump input / expected / actual side by side for every element.
log_info("Memory contents:\n");
for (i=0; i<numElements; i++) {
char inString[1024];
char expectedString[ 1024 ], actualString[ 1024 ];
if (i < numLoads) {
log_info("buffer %3d: input: %s expected: %s got: %s (load offset %3d, alignment offset %3d)", (int)i, GetDataVectorString( &(in[i*typeSize*vecSize]), typeSize, vecSize, inString ),
GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
GetDataVectorString( &(actual[i*typeSize*outVectorSize]), typeSize, vecSize, actualString ),
offsets[i], alignmentOffsets[i]);
if (memcmp(&(expected[i*typeSize*vecSize]), &(actual[i*typeSize*outVectorSize]), typeSize * vecSize) != 0)
log_error(" << ERROR\n");
else
log_info("\n");
} else {
log_info("buffer %3d: input: %s expected: %s got: %s\n", (int)i, GetDataVectorString( &(in[i*typeSize*vecSize]), typeSize, vecSize, inString ),
GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
GetDataVectorString( &(actual[i*typeSize*outVectorSize]), typeSize, vecSize, actualString ));
}
}
}
// Walk the two buffers in lockstep; expected advances by vecSize elements,
// actual by outVectorSize elements (equal here, see "32-bit fixup" above).
for( i = 0; i < numLoads; i++ )
{
if( memcmp( expected, actual, typeSize * vecSize ) != 0 )
{
char expectedString[ 1024 ], actualString[ 1024 ];
log_error( "ERROR: Data sample %d for vload of %s%d did not validate (expected {%s}, got {%s}, loaded from offset %d)\n",
(int)i, get_explicit_type_name( type ), vecSize, GetDataVectorString( expected, typeSize, vecSize, expectedString ),
GetDataVectorString( actual, typeSize, vecSize, actualString ), (int)offsets[ i ] );
return 1;
}
expected += typeSize * vecSize;
actual += typeSize * outVectorSize;
}
return 0;
}
// Runs test_vload over every explicit scalar type crossed with every vector
// width (2, 3, 4, 8, 16).
//
// Skips double when the device lacks cl_khr_fp64, and (u)long when 64-bit
// integers are unsupported (gHasLong). On the first failure for a given type,
// the remaining widths for that type are skipped.
//
// Returns the accumulated error count (0 on full success).
int test_vloadset(cl_device_id device, cl_context context, cl_command_queue queue, create_vload_program_fn createFn, size_t bufferSize )
{
    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes };
    unsigned int vecSizes[] = { 2, 3, 4, 8, 16, 0 };
    const char *size_names[] = { "2", "3", "4", "8", "16"};
    unsigned int typeIdx, sizeIdx;
    int error = 0;
    MTdata mtData = init_genrand( gRandomSeed );

    log_info("Testing with buffer size of %d.\n", (int)bufferSize);
    for( typeIdx = 0; vecType[ typeIdx ] != kNumExplicitTypes; typeIdx++ )
    {
        // Skip types the device cannot support.
        if( vecType[ typeIdx ] == kDouble && !is_extension_available( device, "cl_khr_fp64" ) )
            continue;
        if(( vecType[ typeIdx ] == kLong || vecType[ typeIdx ] == kULong ) && !gHasLong )
            continue;
        for( sizeIdx = 0; vecSizes[ sizeIdx ] != 0; sizeIdx++ )
        {
            log_info("Testing %s%s...\n", get_explicit_type_name(vecType[typeIdx]), size_names[sizeIdx]);
            int error_this_type = test_vload( device, context, queue, vecType[ typeIdx ], vecSizes[ sizeIdx ], createFn, bufferSize, mtData );
            if (error_this_type) {
                error += error_this_type;
                // Fix: message previously lacked its trailing newline
                // (the matching message in test_vstoreset has one).
                log_error("Failure; skipping further sizes for this type.\n");
                break;
            }
        }
    }

    free_mtdata(mtData);
    return error;
}
#pragma mark -------------------- vload test cases --------------------------
// Emits into destBuffer the source of a kernel that vloadN's from __global
// memory: each work-item loads one vector at offsets[tid] with an extra
// scalar misalignment of alignmentOffsets[tid]. The 3-wide case writes its
// result component-by-component since a __global type3* result array would
// have vec4 spacing. inBufferSize is unused for the global variant.
void create_global_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
{
    const char *pattern =
    "%s%s"
    "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
    "{\n"
    " int tid = get_global_id( 0 );\n"
    " %s%d tmp = vload%d( offsets[ tid ], ( (__global %s *) src ) + alignmentOffsets[ tid ] );\n"
    " results[ tid ] = tmp;\n"
    "}\n";
    const char *patternV3 =
    "%s%s"
    "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
    "{\n"
    " int tid = get_global_id( 0 );\n"
    " %s3 tmp = vload3( offsets[ tid ], ( (__global %s *) src ) + alignmentOffsets[ tid ] );\n"
    " results[ 3*tid ] = tmp.s0;\n"
    " results[ 3*tid+1 ] = tmp.s1;\n"
    " results[ 3*tid+2 ] = tmp.s2;\n"
    "}\n";

    const char *tName = get_explicit_type_name( type );
    // Doubles need the fp64 pragma prepended to compile.
    const char *pragmaStr = ( type == kDouble ) ? doubleExtensionPragma : "";

    if( inVectorSize != 3 )
    {
        sprintf( destBuffer, pattern, pragmaStr, "",
                 tName, tName, (int)outVectorSize,
                 tName, (int)inVectorSize, (int)inVectorSize, tName );
    }
    else
    {
        sprintf( destBuffer, patternV3, pragmaStr, "",
                 tName, tName, tName, tName );
    }
}
// Entry point for vload-from-__global: run the full type/size matrix against
// a fixed 10 KB data buffer.
int test_vload_global(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
{
    const size_t kGlobalBufferSize = 10240;
    return test_vloadset( device, context, queue, create_global_load_code, kGlobalBufferSize );
}
// Emits into destBuffer the source of a kernel that vloadN's from __local
// memory. The kernel takes a __local staging buffer as its first argument
// (hence 5 arguments total, which the harness detects to size it via
// clSetKernelArg with a NULL value); thread 0 of each work-group copies
// inBufferSize vectors from __global src into the staging buffer, all
// threads barrier, then each performs its misaligned vload from __local.
// The 3-wide case copies and stores component-by-component.
void create_local_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
{
const char *pattern =
"%s%s"
//" __local %s%d sSharedStorage[ %d ];\n"
"__kernel void test_fn(__local %s%d *sSharedStorage, __global %s%d *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
"{\n"
" int tid = get_global_id( 0 );\n"
" int lid = get_local_id( 0 );\n"
"\n"
" if( lid == 0 )\n"
" {\n"
" for( int i = 0; i < %d; i++ )\n"
" sSharedStorage[ i ] = src[ i ];\n"
" }\n"
// Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all
// threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be
// updated on all threads at that point
" barrier( CLK_LOCAL_MEM_FENCE );\n"
"\n"
" %s%d tmp = vload%d( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n"
" results[ tid ] = tmp;\n"
"}\n";
const char *patternV3 =
"%s%s"
//" __local %s%d sSharedStorage[ %d ];\n"
"__kernel void test_fn(__local %s *sSharedStorage, __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
"{\n"
" int tid = get_global_id( 0 );\n"
" int lid = get_local_id( 0 );\n"
"\n"
" if( lid == 0 )\n"
" {\n"
" for( int i = 0; i < %d; i++ ) {\n"
" sSharedStorage[ 3*i ] = src[ 3*i ];\n"
" sSharedStorage[ 3*i +1] = src[ 3*i +1];\n"
" sSharedStorage[ 3*i +2] = src[ 3*i +2];\n"
" }\n"
" }\n"
// Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all
// threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be
// updated on all threads at that point
" barrier( CLK_LOCAL_MEM_FENCE );\n"
"\n"
" %s3 tmp = vload3( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n"
" results[ 3*tid ] = tmp.s0;\n"
" results[ 3*tid +1] = tmp.s1;\n"
" results[ 3*tid +2] = tmp.s2;\n"
"}\n";
const char *typeName = get_explicit_type_name(type);
// The two sprintf calls below must keep their argument lists exactly in
// step with the %s/%d sequence of the chosen pattern.
if(inVectorSize == 3) {
sprintf( destBuffer, patternV3,
type == kDouble ? doubleExtensionPragma : "",
"",
typeName, /*(int)inBufferSize,*/
typeName, typeName,
(int)inBufferSize,
typeName, typeName );
} else {
sprintf( destBuffer, pattern,
type == kDouble ? doubleExtensionPragma : "",
"",
typeName, (int)inVectorSize, /*(int)inBufferSize,*/
typeName, (int)inVectorSize, typeName, (int)outVectorSize,
(int)inBufferSize,
typeName, (int)inVectorSize, (int)inVectorSize, typeName );
}
}
// Entry point for vload-from-__local: size the test buffer from the device's
// local memory capacity, leaving headroom so the kernel's other local usage
// still fits.
int test_vload_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
{
    cl_ulong localSize;
    int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL );
    test_error( error, "Unable to get max size of local memory buffer" );

    // Cap at 10 KB, then reserve headroom: subtract 2 KB when there is plenty
    // of room, otherwise just use half of what is available.
    if( localSize > 10240 )
        localSize = 10240;
    localSize = ( localSize > 4096 ) ? ( localSize - 2048 ) : ( localSize / 2 );

    return test_vloadset( device, context, queue, create_local_load_code, (size_t)localSize );
}
// Emits into destBuffer the source of a kernel that vloadN's from __constant
// memory; structurally identical to the __global variant but with the source
// pointer in the constant address space. inBufferSize is unused here.
void create_constant_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
{
    const char *pattern =
    "%s%s"
    "__kernel void test_fn( __constant %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
    "{\n"
    " int tid = get_global_id( 0 );\n"
    " %s%d tmp = vload%d( offsets[ tid ], ( (__constant %s *) src ) + alignmentOffsets[ tid ] );\n"
    " results[ tid ] = tmp;\n"
    "}\n";
    const char *patternV3 =
    "%s%s"
    "__kernel void test_fn( __constant %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
    "{\n"
    " int tid = get_global_id( 0 );\n"
    " %s3 tmp = vload3( offsets[ tid ], ( (__constant %s *) src ) + alignmentOffsets[ tid ] );\n"
    " results[ 3*tid ] = tmp.s0;\n"
    " results[ 3*tid+1 ] = tmp.s1;\n"
    " results[ 3*tid+2 ] = tmp.s2;\n"
    "}\n";

    const char *tName = get_explicit_type_name( type );
    // Doubles need the fp64 pragma prepended to compile.
    const char *pragmaStr = ( type == kDouble ) ? doubleExtensionPragma : "";

    if( inVectorSize != 3 )
    {
        sprintf( destBuffer, pattern, pragmaStr, "",
                 tName, tName, (int)outVectorSize,
                 tName, (int)inVectorSize, (int)inVectorSize, tName );
    }
    else
    {
        sprintf( destBuffer, patternV3, pragmaStr, "",
                 tName, tName, tName, tName );
    }
}
// Entry point for vload-from-__constant: size the test buffer from the
// device's constant buffer capacity, leaving headroom.
int test_vload_constant(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
{
    cl_ulong maxSize;
    int error = clGetDeviceInfo( device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, NULL );
    test_error( error, "Unable to get max size of constant memory buffer" );

    // Cap at 10 KB, then reserve headroom: subtract 2 KB when there is plenty
    // of room, otherwise just use half of what is available.
    if( maxSize > 10240 )
        maxSize = 10240;
    maxSize = ( maxSize > 4096 ) ? ( maxSize - 2048 ) : ( maxSize / 2 );

    return test_vloadset( device, context, queue, create_constant_load_code, (size_t)maxSize );
}
// Emits into destBuffer the source of a kernel that vloadN's from __private
// memory. Unlike the local variant, private storage is per-thread, so EVERY
// thread copies the whole test buffer into its own private array before
// performing its misaligned vload (no barrier needed).
// NOTE(review): in the non-V3 pattern the copy-loop bound is a separate %d
// that receives the same inBufferSize value as PRIV_SIZE; the V3 pattern
// reuses the PRIV_SIZE macro instead. Behavior is identical either way.
void create_private_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
{
const char *pattern =
"%s%s"
// Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
// for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
"#define PRIV_TYPE %s%d\n"
"#define PRIV_SIZE %d\n"
"__kernel void test_fn( __global %s%d *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
"{\n"
" __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n"
" int tid = get_global_id( 0 );\n"
"\n"
" for( int i = 0; i < %d; i++ )\n"
" sPrivateStorage[ i ] = src[ i ];\n"
// Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for
// anybody else to sync up
"\n"
" %s%d tmp = vload%d( offsets[ tid ], ( (__private %s *) sPrivateStorage ) + alignmentOffsets[ tid ] );\n"
" results[ tid ] = tmp;\n"
"}\n";
const char *patternV3 =
"%s%s"
// Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
// for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
"#define PRIV_TYPE %s\n"
"#define PRIV_SIZE %d\n"
"__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
"{\n"
" __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n"
" int tid = get_global_id( 0 );\n"
"\n"
" for( int i = 0; i < PRIV_SIZE; i++ )\n"
" {\n"
" sPrivateStorage[ i ] = src[ i ];\n"
" }\n"
// Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for
// anybody else to sync up
"\n"
" %s3 tmp = vload3( offsets[ tid ], ( sPrivateStorage ) + alignmentOffsets[ tid ] );\n"
" results[ 3*tid ] = tmp.s0;\n"
" results[ 3*tid+1 ] = tmp.s1;\n"
" results[ 3*tid+2 ] = tmp.s2;\n"
"}\n";
const char *typeName = get_explicit_type_name(type);
// For the 3-wide case PRIV_TYPE is the scalar type, so the private array
// needs 3x the element count to hold the same number of vectors.
if(inVectorSize ==3) {
sprintf( destBuffer, patternV3,
type == kDouble ? doubleExtensionPragma : "",
"",
typeName, 3*((int)inBufferSize),
typeName, typeName,
typeName );
// log_info("Src is \"\n%s\n\"\n", destBuffer);
} else {
sprintf( destBuffer, pattern,
type == kDouble ? doubleExtensionPragma : "",
"",
typeName, (int)inVectorSize, (int)inBufferSize,
typeName, (int)inVectorSize, typeName, (int)outVectorSize,
(int)inBufferSize,
typeName, (int)inVectorSize, (int)inVectorSize, typeName );
}
}
// Entry point for vload-from-__private. Actual private storage capacity is
// unknowable from the API, so a conservative fixed size is used: room for
// two 16-element vectors of the widest (8-byte) type, i.e. 2 * 16 * 8 = 256
// bytes.
int test_vload_private(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
{
    const size_t kPrivateBufferSize = 256;
    return test_vloadset( device, context, queue, create_private_load_code, kPrivateBufferSize );
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#pragma mark -------------------- vstore harness --------------------------
typedef void (*create_vstore_program_fn)( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize );
// Generic vstoreN test harness.
//
// Builds a kernel from createFn, then runs it once per address offset in
// [0, vecSize): each work-item vstoreN's one random vector at a unique
// element offset (plus the current scalar address offset). The entire output
// buffer is then compared against a host-built reference; untouched elements
// must remain zero ("canary" values), which catches stray writes.
//
// Returns 0 on success, 1 on the first mismatching element (logged).
int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue, ExplicitType type, unsigned int vecSize,
create_vstore_program_fn createFn, size_t bufferSize, MTdata d )
{
int error;
clProgramWrapper program;
clKernelWrapper kernel;
clMemWrapper streams[ 3 ];
size_t threads[ 1 ], localThreads[ 1 ];
size_t numElements, typeSize, numStores = (DEBUG) ? 16 : NUM_LOADS;
// Debug builds shrink the workload so the dump below stays readable.
if (DEBUG)
bufferSize = (bufferSize < 128) ? bufferSize : 128;
typeSize = get_explicit_type_size( type );
numElements = bufferSize / ( typeSize * vecSize );
bufferSize = numElements * typeSize * vecSize; // To account for rounding
if( numStores > numElements * 2 / 3 )
{
// Note: unlike load, we have to restrict the # of stores here, since all offsets must be unique for our test
// (Plus, we leave some room for extra values to make sure didn't get written)
numStores = numElements * 2 / 3;
if( numStores < 1 )
numStores = 1;
}
if (DEBUG)
log_info("Testing: numStores: %d, typeSize: %d, vecSize: %d, numElements: %d, bufferSize: %d\n", (int)numStores, (int)typeSize, vecSize, (int)numElements, (int)bufferSize);
// (MSVC has no VLAs, so fall back to _malloca there.)
#if !(defined(_WIN32) && defined(_MSC_VER))
cl_uint offsets[ numStores ];
#else
cl_uint* offsets = (cl_uint*)_malloca(numStores * sizeof(cl_uint));
#endif
char programSrc[ 10240 ];
size_t i;
#if !(defined(_WIN32) && defined(_MSC_VER))
char inBuffer[ numStores * typeSize * vecSize ];
#else
char* inBuffer = (char*)_malloca( numStores * typeSize * vecSize * sizeof(cl_char));
#endif
clProtectedArray outBuffer( numElements * typeSize * vecSize );
#if !(defined(_WIN32) && defined(_MSC_VER))
char referenceBuffer[ numElements * typeSize * vecSize ];
#else
char* referenceBuffer = (char*)_malloca(numElements * typeSize * vecSize * sizeof(cl_char));
#endif
// Create some random input data and random offsets to load from
generate_random_data( type, numStores * vecSize, d, (void *)inBuffer );
// Note: make sure no two offsets are the same, otherwise the output would depend on
// the order that threads ran in, and that would be next to impossible to verify
// flags[] marks which element offsets have already been handed out.
#if !(defined(_WIN32) && defined(_MSC_VER))
char flags[ numElements ];
#else
char* flags = (char*)_malloca( numElements * sizeof(char));
#endif
memset( flags, 0, numElements * sizeof(char) );
for( i = 0; i < numStores; i++ )
{
do
{
offsets[ i ] = (cl_uint)random_in_range( 0, (int)numElements - 2, d ); // Note: keep it one vec below the end for offset testing
} while( flags[ offsets[ i ] ] != 0 );
flags[ offsets[ i ] ] = -1;
if (LINEAR_OFFSETS)
offsets[i] = (int)i;
}
if (LINEAR_OFFSETS)
log_info("Offsets set to thread IDs to simplify output.\n");
createFn( programSrc, numElements, type, vecSize );
// Create our kernel
const char *ptr = programSrc;
error = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "test_fn" );
test_error( error, "Unable to create testing kernel" );
if (DEBUG) log_info("Kernel: \n%s\n", programSrc);
// Get the number of args to differentiate the kernels with local storage. (They have 5)
cl_uint numArgs;
error = clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(numArgs), &numArgs, NULL);
test_error( error, "clGetKernelInfo failed");
// Set up parameters
// streams: 0 = per-thread source vectors, 1 = element offsets,
// 2 = destination buffer (rewritten before each kernel run below).
streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numStores * typeSize * vecSize * sizeof(cl_char), (void *)inBuffer, &error );
test_error( error, "Unable to create kernel stream" );
streams[ 1 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numStores * sizeof(cl_uint), offsets, &error );
test_error( error, "Unable to create kernel stream" );
streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numElements * typeSize * vecSize, (void *)outBuffer, &error );
test_error( error, "Unable to create kernel stream" );
// Set parameters and run
if (numArgs == 5)
{
// We need to set the size of the local storage
error = clSetKernelArg(kernel, 0, bufferSize, NULL);
test_error( error, "clSetKernelArg for buffer failed");
// Buffers shift by one to make room for the leading __local argument.
for( i = 0; i < 3; i++ )
{
error = clSetKernelArg( kernel, (int)i+1, sizeof( streams[ i ] ), &streams[ i ] );
test_error( error, "Unable to set kernel argument" );
}
}
else
{
// No local storage
for( i = 0; i < 3; i++ )
{
error = clSetKernelArg( kernel, (int)i, sizeof( streams[ i ] ), &streams[ i ] );
if (error)
log_info("%s\n", programSrc);
test_error( error, "Unable to set kernel argument" );
}
}
// One work-item per store.
threads[ 0 ] = numStores;
error = get_max_common_work_group_size( context, kernel, threads[ 0 ], &localThreads[ 0 ] );
test_error( error, "Unable to get local thread size" );
// Run in a loop, changing the address offset from 0 to ( vecSize - 1 ) each time, since
// otherwise stores might overlap each other, and it'd be a nightmare to test!
for( cl_uint addressOffset = 0; addressOffset < vecSize; addressOffset++ )
{
if (DEBUG)
log_info("\tstore addressOffset is %d, executing with threads %d\n", addressOffset, (int)threads[0]);
// Clear the results first
memset( outBuffer, 0, numElements * typeSize * vecSize );
error = clEnqueueWriteBuffer( queue, streams[ 2 ], CL_TRUE, 0, numElements * typeSize * vecSize, (void *)outBuffer, 0, NULL, NULL );
test_error( error, "Unable to erase result stream" );
// Set up the new offset and run
// (the alignmentOffset argument is last: index 4 with local storage, 3 without)
if (numArgs == 5)
error = clSetKernelArg( kernel, 3+1, sizeof( cl_uint ), &addressOffset );
else
error = clSetKernelArg( kernel, 3, sizeof( cl_uint ), &addressOffset );
test_error( error, "Unable to set address offset argument" );
error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
test_error( error, "Unable to exec kernel" );
// Get the results
error = clEnqueueReadBuffer( queue, streams[ 2 ], CL_TRUE, 0, numElements * typeSize * vecSize, (void *)outBuffer, 0, NULL, NULL );
test_error( error, "Unable to read results" );
// Create the reference results
// Zero everything, then place each stored vector at
// (offsets[i] * vecSize + addressOffset) scalar elements in; elements never
// stored to must therefore read back as zero.
memset( referenceBuffer, 0, numElements * typeSize * vecSize * sizeof(cl_char) );
for( i = 0; i < numStores; i++ )
{
memcpy( referenceBuffer + ( ( offsets[ i ] * vecSize ) + addressOffset ) * typeSize, inBuffer + i * typeSize * vecSize, typeSize * vecSize );
}
// Validate the results now
char *expected = referenceBuffer;
char *actual = (char *)(void *)outBuffer;
if (DEBUG)
{
// Dump input / expected / actual side by side for every element.
log_info("Memory contents:\n");
for (i=0; i<numElements; i++)
{
char inString[1024];
char expectedString[ 1024 ], actualString[ 1024 ];
if (i < numStores)
{
log_info("buffer %3d: input: %s expected: %s got: %s (store offset %3d)", (int)i, GetDataVectorString( &(inBuffer[i*typeSize*vecSize]), typeSize, vecSize, inString ),
GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
GetDataVectorString( &(actual[i*typeSize*vecSize]), typeSize, vecSize, actualString ),
offsets[i]);
if (memcmp(&(expected[i*typeSize*vecSize]), &(actual[i*typeSize*vecSize]), typeSize * vecSize) != 0)
log_error(" << ERROR\n");
else
log_info("\n");
}
else
{
log_info("buffer %3d: input: %s expected: %s got: %s\n", (int)i, GetDataVectorString( &(inBuffer[i*typeSize*vecSize]), typeSize, vecSize, inString ),
GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
GetDataVectorString( &(actual[i*typeSize*vecSize]), typeSize, vecSize, actualString ));
}
}
}
// Compare EVERY element of the output, not just stored ones, so that stray
// writes into "canary" (never-stored) elements are caught too.
for( i = 0; i < numElements; i++ )
{
if( memcmp( expected, actual, typeSize * vecSize ) != 0 )
{
char expectedString[ 1024 ], actualString[ 1024 ];
log_error( "ERROR: Data sample %d for vstore of %s%d did not validate (expected {%s}, got {%s}",
(int)i, get_explicit_type_name( type ), vecSize, GetDataVectorString( expected, typeSize, vecSize, expectedString ),
GetDataVectorString( actual, typeSize, vecSize, actualString ) );
size_t j;
// Identify which store (if any) targeted this element for the log.
for( j = 0; j < numStores; j++ )
{
if( offsets[ j ] == (cl_uint)i )
{
log_error( ", stored from store #%d (of %d, offset = %d) with address offset of %d", (int)j, (int)numStores, offsets[j], (int)addressOffset );
break;
}
}
if( j == numStores )
log_error( ", supposed to be canary value" );
log_error( ")\n" );
return 1;
}
expected += typeSize * vecSize;
actual += typeSize * vecSize;
}
}
return 0;
}
// Runs test_vstore over every explicit scalar type crossed with every vector
// width (2, 3, 4, 8, 16), skipping double without cl_khr_fp64 and (u)long
// without 64-bit integer support. The first failure for a type skips the
// remaining widths for that type. Returns the accumulated error count.
int test_vstoreset(cl_device_id device, cl_context context, cl_command_queue queue, create_vstore_program_fn createFn, size_t bufferSize )
{
    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes };
    unsigned int vecSizes[] = { 2, 3, 4, 8, 16, 0 };
    const char *size_names[] = { "2", "3", "4", "8", "16"};
    unsigned int tIdx, sIdx;
    int failures = 0;
    MTdata rng = init_genrand( gRandomSeed );

    log_info("Testing with buffer size of %d.\n", (int)bufferSize);
    for( tIdx = 0; vecType[ tIdx ] != kNumExplicitTypes; tIdx++ )
    {
        // Skip types the device cannot support.
        const ExplicitType curType = vecType[ tIdx ];
        if( curType == kDouble && !is_extension_available( device, "cl_khr_fp64" ) )
            continue;
        if( !gHasLong && ( curType == kLong || curType == kULong ) )
            continue;

        for( sIdx = 0; vecSizes[ sIdx ] != 0; sIdx++ )
        {
            log_info("Testing %s%s...\n", get_explicit_type_name(curType), size_names[sIdx]);
            int result = test_vstore( device, context, queue, curType, vecSizes[ sIdx ], createFn, bufferSize, rng );
            if( result == 0 )
                continue;
            log_error("Failure; skipping further sizes for this type.\n");
            failures += result;
            break;
        }
    }

    free_mtdata(rng);
    return failures;
}
#pragma mark -------------------- vstore test cases --------------------------
// Emits into destBuffer the source of a kernel that vstoreN's to __global
// memory: each work-item stores one source vector at offsets[tid], with the
// whole destination pointer shifted by a scalar alignmentOffset. The 3-wide
// case reads its source either as a vector (when tid is a multiple of 4) or
// via vload3, to exercise both access paths.
void create_global_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize )
{
    const char *pattern =
    "%s"
    "__kernel void test_fn( __global %s%d *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
    "{\n"
    " int tid = get_global_id( 0 );\n"
    " vstore%d( srcValues[ tid ], offsets[ tid ], destBuffer + alignmentOffset );\n"
    "}\n";
    const char *patternV3 =
    "%s"
    "__kernel void test_fn( __global %s3 *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
    "{\n"
    " int tid = get_global_id( 0 );\n"
    " if((tid&3) == 0) { // if \"tid\" is a multiple of 4 \n"
    " vstore3( srcValues[ 3*(tid>>2) ], offsets[ tid ], destBuffer + alignmentOffset );\n"
    " } else {\n"
    " vstore3( vload3(tid, (__global %s *)srcValues), offsets[ tid ], destBuffer + alignmentOffset );\n"
    " }\n"
    "}\n";

    const char *tName = get_explicit_type_name( type );
    // Doubles need the fp64 pragma prepended to compile.
    const char *pragmaStr = ( type == kDouble ) ? doubleExtensionPragma : "";

    if( inVectorSize != 3 )
    {
        sprintf( destBuffer, pattern, pragmaStr,
                 tName, (int)inVectorSize, tName, (int)inVectorSize );
    }
    else
    {
        sprintf( destBuffer, patternV3, pragmaStr,
                 tName, tName, tName );
    }
}
// Entry point for vstore-to-__global: run the full type/size matrix against
// a fixed 10 KB destination buffer.
int test_vstore_global(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
{
    const size_t kGlobalBufferSize = 10240;
    return test_vstoreset( device, context, queue, create_global_store_code, kGlobalBufferSize );
}
// Emits into destBuffer the source of a kernel that vstoreN's through __local
// memory. The kernel takes a __local staging buffer as its first argument
// (5 args total, detected by the harness). Each thread first zeroes the
// local elements its own store will cover (untouched locations would
// otherwise hold garbage that the final copy-out would propagate), barriers,
// vstoreN's into the staging buffer, barriers again, then copies only its
// own vector's bytes out to the global destination so threads don't clobber
// each other's results.
// NOTE(review): the V3 zero-init writes six consecutive scalars starting at
// 3*offsets[tid] — presumably to cover the store's possible alignmentOffset
// spill into the next vector; verify against the harness's offset scheme.
void create_local_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize )
{
const char *pattern =
"%s"
"\n"
"__kernel void test_fn(__local %s%d *sSharedStorage, __global %s%d *srcValues, __global uint *offsets, __global %s%d *destBuffer, uint alignmentOffset )\n"
"{\n"
" int tid = get_global_id( 0 );\n"
// We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
" sSharedStorage[ offsets[tid] ] = (%s%d)(%s)0;\n"
" sSharedStorage[ offsets[tid] +1 ] = sSharedStorage[ offsets[tid] ];\n"
" barrier( CLK_LOCAL_MEM_FENCE );\n"
"\n"
" vstore%d( srcValues[ tid ], offsets[ tid ], ( (__local %s *)sSharedStorage ) + alignmentOffset );\n"
"\n"
// Note: Once all threads are done vstore'ing into our shared storage, we then copy into the global output
// buffer, but we have to make sure ALL threads are done vstore'ing before we do the copy
" barrier( CLK_LOCAL_MEM_FENCE );\n"
"\n"
// Note: we only copy the relevant portion of our local storage over to the dest buffer, because
// otherwise, local threads would be overwriting results from other local threads
" int i;\n"
" __local %s *sp = (__local %s*) (sSharedStorage + offsets[tid]) + alignmentOffset;\n"
" __global %s *dp = (__global %s*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
" for( i = 0; (size_t)i < sizeof( sSharedStorage[0]) / sizeof( *sp ); i++ ) \n"
" dp[i] = sp[i];\n"
"}\n";
const char *patternV3 =
"%s"
"\n"
"__kernel void test_fn(__local %s *sSharedStorage, __global %s *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
"{\n"
" int tid = get_global_id( 0 );\n"
// We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
" sSharedStorage[ 3*offsets[tid] ] = (%s)0;\n"
" sSharedStorage[ 3*offsets[tid] +1 ] = \n"
" sSharedStorage[ 3*offsets[tid] ];\n"
" sSharedStorage[ 3*offsets[tid] +2 ] = \n"
" sSharedStorage[ 3*offsets[tid]];\n"
" sSharedStorage[ 3*offsets[tid] +3 ] = \n"
" sSharedStorage[ 3*offsets[tid]];\n"
" sSharedStorage[ 3*offsets[tid] +4 ] = \n"
" sSharedStorage[ 3*offsets[tid] ];\n"
" sSharedStorage[ 3*offsets[tid] +5 ] = \n"
" sSharedStorage[ 3*offsets[tid]];\n"
" barrier( CLK_LOCAL_MEM_FENCE );\n"
"\n"
" vstore3( vload3(tid,srcValues), offsets[ tid ], sSharedStorage + alignmentOffset );\n"
"\n"
// Note: Once all threads are done vstore'ing into our shared storage, we then copy into the global output
// buffer, but we have to make sure ALL threads are done vstore'ing before we do the copy
" barrier( CLK_LOCAL_MEM_FENCE );\n"
"\n"
// Note: we only copy the relevant portion of our local storage over to the dest buffer, because
// otherwise, local threads would be overwriting results from other local threads
" int i;\n"
" __local %s *sp = (sSharedStorage + 3*offsets[tid]) + alignmentOffset;\n"
" __global %s *dp = (destBuffer + 3*offsets[tid]) + alignmentOffset;\n"
" for( i = 0; i < 3; i++ ) \n"
" dp[i] = sp[i];\n"
"}\n";
const char *typeName = get_explicit_type_name(type);
// The two sprintf calls below must keep their argument lists exactly in
// step with the %s/%d sequence of the chosen pattern.
if(inVectorSize == 3) {
sprintf( destBuffer, patternV3,
type == kDouble ? doubleExtensionPragma : "",
typeName,
typeName,
typeName, typeName,
typeName, typeName, typeName );
} else {
sprintf( destBuffer, pattern,
type == kDouble ? doubleExtensionPragma : "",
typeName, (int)inVectorSize,
typeName, (int)inVectorSize, typeName, (int)inVectorSize,
typeName, (int)inVectorSize, typeName,
(int)inVectorSize, typeName, typeName,
typeName, typeName, typeName );
}
// log_info(destBuffer);
}
int test_vstore_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
{
    // Size the test to the device's reported __local memory, capped at 10 KB.
    cl_ulong localSize;
    int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL );
    test_error( error, "Unable to get max size of local memory buffer" );

    if( localSize > 10240 )
        localSize = 10240;
    // Leave headroom below the reported limit: the implementation itself may
    // consume some local memory for kernel arguments or bookkeeping.
    localSize = ( localSize > 4096 ) ? ( localSize - 2048 ) : ( localSize / 2 );

    return test_vstoreset( device, context, queue, create_local_store_code, (size_t)localSize );
}
void create_private_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize )
{
    // Generates the kernel source for test_vstore_private. Unlike __local
    // storage (shared per work-group), __private storage is per-work-item,
    // so each work-item gets its own scratch array sized to hold the whole
    // test buffer (inBufferSize elements) to make the test effective.
    const char *pattern =
    "%s"
    "\n"
    "__kernel void test_fn( __global %s%d *srcValues, __global uint *offsets, __global %s%d *destBuffer, uint alignmentOffset )\n"
    "{\n"
    " __private %s%d sPrivateStorage[ %d ];\n"
    " int tid = get_global_id( 0 );\n"
    // Zero this work-item's slot; untouched private memory would otherwise
    // contain garbage and break validation.
    " sPrivateStorage[tid] = (%s%d)(%s)0;\n"
    "\n"
    " vstore%d( srcValues[ tid ], offsets[ tid ], ( (__private %s *)sPrivateStorage ) + alignmentOffset );\n"
    "\n"
    // Copy only this work-item's slice of private storage to the dest
    // buffer, so results from other work-items are not overwritten.
    " uint i;\n"
    " __private %s *sp = (__private %s*) (sPrivateStorage + offsets[tid]) + alignmentOffset;\n"
    " __global %s *dp = (__global %s*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
    " for( i = 0; i < sizeof( sPrivateStorage[0]) / sizeof( *sp ); i++ ) \n"
    " dp[i] = sp[i];\n"
    "}\n";
    const char *patternV3 =
    "%s"
    "\n"
    "__kernel void test_fn( __global %s *srcValues, __global uint *offsets, __global %s3 *destBuffer, uint alignmentOffset )\n"
    "{\n"
    " __private %s3 sPrivateStorage[ %d ];\n" // sized by inBufferSize below
    " int tid = get_global_id( 0 );\n"
    // Zero this work-item's slot before the vstore3.
    " sPrivateStorage[tid] = (%s3)(%s)0;\n"
    "\n"
    " vstore3( vload3(tid,srcValues), offsets[ tid ], ( (__private %s *)sPrivateStorage ) + alignmentOffset );\n"
    "\n"
    // Copy only this work-item's three scalars to the dest buffer.
    " uint i;\n"
    " __private %s *sp = ((__private %s*) sPrivateStorage) + 3*offsets[tid] + alignmentOffset;\n"
    " __global %s *dp = ((__global %s*) destBuffer) + 3*offsets[tid] + alignmentOffset;\n"
    " for( i = 0; i < 3; i++ ) \n"
    " dp[i] = sp[i];\n"
    "}\n";

    const char *typeName = get_explicit_type_name(type);
    // Doubles require the fp64 extension pragma at the top of the source.
    const char *pragmaStr = ( type == kDouble ) ? doubleExtensionPragma : "";

    if( inVectorSize == 3 )
    {
        sprintf( destBuffer, patternV3, pragmaStr,
                 typeName, typeName,
                 typeName, (int)inBufferSize,
                 typeName, typeName,
                 typeName, typeName, typeName, typeName, typeName );
        return;
    }

    sprintf( destBuffer, pattern, pragmaStr,
             typeName, (int)inVectorSize, typeName, (int)inVectorSize,
             typeName, (int)inVectorSize, (int)inBufferSize,
             typeName, (int)inVectorSize, typeName,
             (int)inVectorSize, typeName, typeName, typeName, typeName, typeName );
}
int test_vstore_private(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
{
    // The amount of usable __private storage is not queryable, so pick a
    // conservative size: room for at least two 16-wide vectors of the widest
    // 8-byte element type (2 * 16 * 8 = 256 bytes).
    const size_t kPrivateBufferSize = 256;
    return test_vstoreset( device, context, queue, create_private_store_code, kPrivateBufferSize );
}