Files
OpenCL-CTS/test_conformance/atomics/test_atomics.cpp
Sven van Haastregt 9bf6486352 [NFC] clang-format test_atomics (#1516)
Add some clang-format off/on comments to keep lists and kernel code
readable.

Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>

Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>
2022-09-27 09:32:23 -07:00

1438 lines
49 KiB
C++

//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "testBase.h"
#include "harness/conversions.h"
#ifndef _WIN32
#include <unistd.h>
#endif
#define INT_TEST_VALUE 402258822
#define LONG_TEST_VALUE 515154531254381446LL
// clang-format off
// Prologue/epilogue pair for the "global memory" kernels.  A test's core
// source is spliced between the two strings; the %s placeholders are filled
// with the OpenCL type name of the data type under test.
const char *atomic_global_pattern[] = {
    "__kernel void test_atomic_fn(volatile __global %s *destMemory, __global %s *oldValues)\n"
    "{\n"
    " int tid = get_global_id(0);\n"
    "\n"
    ,
    "\n"
    "}\n" };
// Prologue/epilogue pair for the "local memory" kernels.  The prologue copies
// the initial destination values into local memory (every work-item writes
// the same values) and barriers; the epilogue barriers again and copies the
// local results back out to finalDest.
const char *atomic_local_pattern[] = {
    "__kernel void test_atomic_fn(__global %s *finalDest, __global %s *oldValues, volatile __local %s *destMemory, int numDestItems )\n"
    "{\n"
    " int tid = get_global_id(0);\n"
    " int dstItemIdx;\n"
    "\n"
    " // Everybody does the following line(s), but it all has the same result. We still need to ensure we sync before the atomic op, though\n"
    " for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n"
    " destMemory[ dstItemIdx ] = finalDest[ dstItemIdx ];\n"
    " barrier( CLK_LOCAL_MEM_FENCE );\n"
    "\n"
    ,
    " barrier( CLK_LOCAL_MEM_FENCE );\n"
    " // Finally, write out the last value. Again, we're synced, so everyone will be writing the same value\n"
    " for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n"
    " finalDest[ dstItemIdx ] = destMemory[ dstItemIdx ];\n"
    "}\n" };
// clang-format on
// Number of work-items launched per test.  Parenthesized so the macro expands
// safely inside larger expressions (e.g. `x / TEST_COUNT`).
#define TEST_COUNT (128 * 1024)
// Per-test callback bundle consumed by test_atomic_function().  For each data
// type a test supplies either an "expected value" function (the final
// destination value is computed host-side and compared) or a "verify refs"
// function (the per-thread 'old' values are validated), plus an optional
// reference-value generator.  Unused slots are NULL.
struct TestFns
{
    cl_int mIntStartValue; // initial destination value for 32-bit tests
    cl_long mLongStartValue; // initial destination value for 64-bit tests

    // Number of destination items the kernel produces; NULL means 1.
    size_t (*NumResultsFn)(size_t threadSize, ExplicitType dataType);

    // Integer versions
    cl_int (*ExpectedValueIntFn)(size_t size, cl_int *startRefValues,
                                 size_t whichDestValue);
    void (*GenerateRefsIntFn)(size_t size, cl_int *startRefValues, MTdata d);
    bool (*VerifyRefsIntFn)(size_t size, cl_int *refValues, cl_int finalValue);

    // Long versions
    cl_long (*ExpectedValueLongFn)(size_t size, cl_long *startRefValues,
                                   size_t whichDestValue);
    void (*GenerateRefsLongFn)(size_t size, cl_long *startRefValues, MTdata d);
    bool (*VerifyRefsLongFn)(size_t size, cl_long *refValues,
                             cl_long finalValue);

    // Float versions
    // NOTE(review): the float path in test_atomic_function is only entered
    // when VerifyRefsIntFn is also non-NULL — float-using tests must set both.
    cl_float (*ExpectedValueFloatFn)(size_t size, cl_float *startRefValues,
                                     size_t whichDestValue);
    void (*GenerateRefsFloatFn)(size_t size, cl_float *startRefValues,
                                MTdata d);
    bool (*VerifyRefsFloatFn)(size_t size, cl_float *refValues,
                              cl_float finalValue);
};
// Returns whether 'device' supports the requested atomic flavor, either as
// core functionality (OpenCL >= 1.1 for 32-bit int and float) or via the
// matching cl_khr_* extension.
bool check_atomic_support(cl_device_id device, bool extended, bool isLocal,
                          ExplicitType dataType)
{
    // Extension-name table, indexed by: +1 for extended, +2 for local,
    // +4 for 64-bit types (64-bit atomics have no local/global split).
    // clang-format off
    const char *extensionNames[8] = {
        "cl_khr_global_int32_base_atomics", "cl_khr_global_int32_extended_atomics",
        "cl_khr_local_int32_base_atomics", "cl_khr_local_int32_extended_atomics",
        "cl_khr_int64_base_atomics", "cl_khr_int64_extended_atomics",
        "cl_khr_int64_base_atomics", "cl_khr_int64_extended_atomics" // this line intended to be the same as the last one
    };
    // clang-format on

    size_t tableIndex = (extended ? 1 : 0) + (isLocal ? 2 : 0);
    Version version = get_device_cl_version(device);

    switch (dataType)
    {
        case kInt:
        case kUInt:
            // 32-bit integer atomics are core as of OpenCL 1.1.
            if (version >= Version(1, 1)) return true;
            break;
        case kLong:
        case kULong: tableIndex += 4; break;
        case kFloat:
            // Float has to stay separate since the float atomics aren't in
            // the 1.0 extensions; they are core as of 1.1.
            return version >= Version(1, 1);
        default:
            log_error(
                "ERROR: Unsupported data type (%d) in check_atomic_support\n",
                dataType);
            return false;
    }
    return is_extension_available(device, extensionNames[tableIndex]);
}
/*
 * Core harness for a single atomic-function test.
 *
 * Builds a kernel from { extension pragma, global/local prologue,
 * programCore, epilogue }, runs it over num_elements work-items (possibly
 * clamped to a single work-group), then validates the destination value(s)
 * and/or the per-thread "old" values via the callbacks in testFns.  Finally
 * reruns the kernel with a single work-item to check that the atomic returned
 * the original destination value.
 *
 * Returns 0 on pass (or on unsupported configurations, which implicitly
 * pass), -1 on failure.
 */
int test_atomic_function(cl_device_id deviceID, cl_context context,
                         cl_command_queue queue, int num_elements,
                         const char *programCore, TestFns testFns,
                         bool extended, bool isLocal, ExplicitType dataType,
                         bool matchGroupSize)
{
    clProgramWrapper program;
    clKernelWrapper kernel;
    int error;
    size_t threads[1];
    clMemWrapper streams[2];
    // NOTE(review): refValues, startRefValues and destItems are raw
    // allocations that leak on the early `return -1` error paths below.
    void *refValues, *startRefValues;
    size_t threadSize, groupSize;
    const char *programLines[4];
    char pragma[512];
    char programHeader[512];
    MTdata d;
    size_t typeSize = get_explicit_type_size(dataType);

    // Verify we can run first
    bool isUnsigned = (dataType == kULong) || (dataType == kUInt);
    if (!check_atomic_support(deviceID, extended, isLocal, dataType))
    {
        // Only print for the signed (unsigned comes right after, and if signed
        // isn't supported, unsigned isn't either)
        if (dataType == kFloat)
            log_info("\t%s float not supported\n",
                     isLocal ? "Local" : "Global");
        else if (!isUnsigned)
            log_info("\t%s %sint%d not supported\n",
                     isLocal ? "Local" : "Global", isUnsigned ? "u" : "",
                     (int)typeSize * 8);
        // Since we don't support the operation, they implicitly pass
        return 0;
    }
    else
    {
        if (dataType == kFloat)
            log_info("\t%s float%s...", isLocal ? "local" : "global",
                     isLocal ? " " : "");
        else
            log_info("\t%s %sint%d%s%s...", isLocal ? "local" : "global",
                     isUnsigned ? "u" : "", (int)typeSize * 8,
                     isUnsigned ? "" : " ", isLocal ? " " : "");
    }

    //// Set up the kernel code

    // Create the pragma line for this kernel: 64-bit types enable the
    // cl_khr_int64_* extensions, 32-bit types the per-address-space
    // cl_khr_{global,local}_int32_* extensions.
    bool isLong = (dataType == kLong || dataType == kULong);
    sprintf(pragma,
            "#pragma OPENCL EXTENSION cl_khr%s_int%s_%s_atomics : enable\n",
            isLong ? "" : (isLocal ? "_local" : "_global"),
            isLong ? "64" : "32", extended ? "extended" : "base");

    // Now create the program header by filling the type name into the
    // appropriate prologue pattern.
    const char *typeName = get_explicit_type_name(dataType);
    if (isLocal)
        sprintf(programHeader, atomic_local_pattern[0], typeName, typeName,
                typeName);
    else
        sprintf(programHeader, atomic_global_pattern[0], typeName, typeName);

    // Set up our entire program now
    programLines[0] = pragma;
    programLines[1] = programHeader;
    programLines[2] = programCore;
    programLines[3] =
        (isLocal) ? atomic_local_pattern[1] : atomic_global_pattern[1];

    if (create_single_kernel_helper(context, &program, &kernel, 4, programLines,
                                    "test_atomic_fn"))
    {
        return -1;
    }

    //// Set up to actually run
    threadSize = num_elements;

    error =
        get_max_common_work_group_size(context, kernel, threadSize, &groupSize);
    test_error(error, "Unable to get thread group max size");

    if (matchGroupSize)
        // HACK because xchg and cmpxchg apparently are limited by hardware
        threadSize = groupSize;

    if (isLocal)
    {
        // Local tests run exactly one work-group; clamp the work size to what
        // the device and kernel allow.
        size_t maxSizes[3] = { 0, 0, 0 };
        error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
                                3 * sizeof(size_t), maxSizes, 0);
        test_error(error,
                   "Unable to obtain max work item sizes for the device");

        size_t workSize;
        error = clGetKernelWorkGroupInfo(kernel, deviceID,
                                         CL_KERNEL_WORK_GROUP_SIZE,
                                         sizeof(workSize), &workSize, NULL);
        test_error(
            error,
            "Unable to obtain max work group size for device and kernel combo");

        // Limit workSize to avoid extremely large local buffer size and slow
        // run.
        if (workSize > 65536) workSize = 65536;

        // "workSize" is limited to that of the first dimension as only a
        // 1DRange is executed.
        if (maxSizes[0] < workSize)
        {
            workSize = maxSizes[0];
        }

        threadSize = groupSize = workSize;
    }

    log_info("\t(thread count %d, group size %d)\n", (int)threadSize,
             (int)groupSize);

    refValues = (cl_int *)malloc(typeSize * threadSize);

    if (testFns.GenerateRefsIntFn != NULL)
    {
        // We have a ref generator provided
        d = init_genrand(gRandomSeed);
        startRefValues = malloc(typeSize * threadSize);
        // typeSize selects between the 32-bit and 64-bit generators.
        if (typeSize == 4)
            testFns.GenerateRefsIntFn(threadSize, (cl_int *)startRefValues, d);
        else
            testFns.GenerateRefsLongFn(threadSize, (cl_long *)startRefValues,
                                       d);
        free_mtdata(d);
        d = NULL;
    }
    else
        startRefValues = NULL;

    // If we're given a num_results function, we need to determine how many
    // result objects we need. If we don't have it, we assume it's just 1
    size_t numDestItems = (testFns.NumResultsFn != NULL)
        ? testFns.NumResultsFn(threadSize, dataType)
        : 1;

    // NOTE(review): operator new[] throws on failure rather than returning
    // NULL, so this check is vestigial.
    char *destItems = new char[typeSize * numDestItems];
    if (destItems == NULL)
    {
        log_error("ERROR: Unable to allocate memory!\n");
        return -1;
    }
    // Fill every destination item with the type-appropriate start value.
    void *startValue = (typeSize == 4) ? (void *)&testFns.mIntStartValue
                                       : (void *)&testFns.mLongStartValue;
    for (size_t i = 0; i < numDestItems; i++)
        memcpy(destItems + i * typeSize, startValue, typeSize);

    // streams[0]: destination value(s); streams[1]: per-thread 'old' values,
    // pre-seeded with the generated reference values when a generator ran.
    streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
                                typeSize * numDestItems, destItems, NULL);
    if (!streams[0])
    {
        log_error("ERROR: Creating output array failed!\n");
        return -1;
    }
    streams[1] = clCreateBuffer(
        context,
        ((startRefValues != NULL ? CL_MEM_COPY_HOST_PTR : CL_MEM_READ_WRITE)),
        typeSize * threadSize, startRefValues, NULL);
    if (!streams[1])
    {
        log_error("ERROR: Creating reference array failed!\n");
        return -1;
    }

    /* Set the arguments */
    error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
    test_error(error, "Unable to set indexed kernel arguments");
    error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]);
    test_error(error, "Unable to set indexed kernel arguments");

    if (isLocal)
    {
        // Local kernels additionally take the __local scratch buffer and its
        // item count.
        error = clSetKernelArg(kernel, 2, typeSize * numDestItems, NULL);
        test_error(error, "Unable to set indexed local kernel argument");

        cl_int numDestItemsInt = (cl_int)numDestItems;
        error = clSetKernelArg(kernel, 3, sizeof(cl_int), &numDestItemsInt);
        test_error(error, "Unable to set indexed kernel argument");
    }

    /* Run the kernel */
    threads[0] = threadSize;
    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, &groupSize,
                                   0, NULL, NULL);
    test_error(error, "Unable to execute test kernel");

    // Blocking reads: final destination value(s) and per-thread old values.
    error =
        clEnqueueReadBuffer(queue, streams[0], true, 0, typeSize * numDestItems,
                            destItems, 0, NULL, NULL);
    test_error(error, "Unable to read result value!");

    error =
        clEnqueueReadBuffer(queue, streams[1], true, 0, typeSize * threadSize,
                            refValues, 0, NULL, NULL);
    test_error(error, "Unable to read reference values!");

    // If we have an expectedFn, then we need to generate a final value to
    // compare against. If we don't have one, it's because we're comparing ref
    // values only
    if (testFns.ExpectedValueIntFn != NULL)
    {
        for (size_t i = 0; i < numDestItems; i++)
        {
            char expected[8];
            cl_int intVal;
            cl_long longVal;
            if (typeSize == 4)
            {
                // Int version
                intVal = testFns.ExpectedValueIntFn(
                    threadSize, (cl_int *)startRefValues, i);
                memcpy(expected, &intVal, sizeof(intVal));
            }
            else
            {
                // Long version
                longVal = testFns.ExpectedValueLongFn(
                    threadSize, (cl_long *)startRefValues, i);
                memcpy(expected, &longVal, sizeof(longVal));
            }

            if (memcmp(expected, destItems + i * typeSize, typeSize) != 0)
            {
                // Mismatch: dump every thread's start/old value for debugging,
                // then fail.  (The dump loops reuse 'i', which is fine only
                // because we return immediately afterwards.)
                if (typeSize == 4)
                {
                    cl_int *outValue = (cl_int *)(destItems + i * typeSize);
                    log_error("ERROR: Result %ld from kernel does not "
                              "validate! (should be %d, was %d)\n",
                              i, intVal, *outValue);
                    cl_int *startRefs = (cl_int *)startRefValues;
                    cl_int *refs = (cl_int *)refValues;
                    for (i = 0; i < threadSize; i++)
                    {
                        if (startRefs != NULL)
                            log_info(" --- %ld - %d --- %d\n", i, startRefs[i],
                                     refs[i]);
                        else
                            log_info(" --- %ld --- %d\n", i, refs[i]);
                    }
                }
                else
                {
                    cl_long *outValue = (cl_long *)(destItems + i * typeSize);
                    log_error("ERROR: Result %ld from kernel does not "
                              "validate! (should be %lld, was %lld)\n",
                              i, longVal, *outValue);
                    cl_long *startRefs = (cl_long *)startRefValues;
                    cl_long *refs = (cl_long *)refValues;
                    for (i = 0; i < threadSize; i++)
                    {
                        if (startRefs != NULL)
                            log_info(" --- %ld - %lld --- %lld\n", i,
                                     startRefs[i], refs[i]);
                        else
                            log_info(" --- %ld --- %lld\n", i, refs[i]);
                    }
                }
                return -1;
            }
        }
    }

    // NOTE(review): this gate means float verification only runs when
    // VerifyRefsIntFn is also set (see TestFns).
    if (testFns.VerifyRefsIntFn != NULL)
    {
        /* Use the verify function to also check the results */
        if (dataType == kFloat)
        {
            cl_float *outValue = (cl_float *)destItems;
            // NOTE(review): `!fn(...) != 0` parses as `(!fn(...)) != 0`, i.e.
            // simply `!fn(...)`; the trailing `!= 0` is redundant but the
            // condition does fire when verification fails, as intended.
            if (!testFns.VerifyRefsFloatFn(threadSize, (cl_float *)refValues,
                                           *outValue)
                != 0)
            {
                log_error("ERROR: Reference values did not validate!\n");
                return -1;
            }
        }
        else if (typeSize == 4)
        {
            cl_int *outValue = (cl_int *)destItems;
            if (!testFns.VerifyRefsIntFn(threadSize, (cl_int *)refValues,
                                         *outValue)
                != 0)
            {
                log_error("ERROR: Reference values did not validate!\n");
                return -1;
            }
        }
        else
        {
            cl_long *outValue = (cl_long *)destItems;
            if (!testFns.VerifyRefsLongFn(threadSize, (cl_long *)refValues,
                                          *outValue)
                != 0)
            {
                log_error("ERROR: Reference values did not validate!\n");
                return -1;
            }
        }
    }
    else if (testFns.ExpectedValueIntFn == NULL)
    {
        log_error("ERROR: Test doesn't check total or refs; no values are "
                  "verified!\n");
        return -1;
    }


    /* Re-write the starting value */
    for (size_t i = 0; i < numDestItems; i++)
        memcpy(destItems + i * typeSize, startValue, typeSize);
    error =
        clEnqueueWriteBuffer(queue, streams[0], true, 0,
                             typeSize * numDestItems, destItems, 0, NULL, NULL);
    test_error(error, "Unable to write starting values!");

    /* Run the kernel once for a single thread, so we can verify that the
     * returned value is the original one */
    threads[0] = 1;
    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, threads, 0,
                                   NULL, NULL);
    test_error(error, "Unable to execute test kernel");

    error = clEnqueueReadBuffer(queue, streams[1], true, 0, typeSize, refValues,
                                0, NULL, NULL);
    test_error(error, "Unable to read reference values!");

    // The single thread's 'old' value must equal the (re-written) start value.
    if (memcmp(refValues, destItems, typeSize) != 0)
    {
        if (typeSize == 4)
        {
            cl_int *s = (cl_int *)destItems;
            cl_int *r = (cl_int *)refValues;
            log_error("ERROR: atomic function operated correctly but did NOT "
                      "return correct 'old' value "
                      " (should have been %d, returned %d)!\n",
                      *s, *r);
        }
        else
        {
            cl_long *s = (cl_long *)destItems;
            cl_long *r = (cl_long *)refValues;
            log_error("ERROR: atomic function operated correctly but did NOT "
                      "return correct 'old' value "
                      " (should have been %lld, returned %lld)!\n",
                      *s, *r);
        }
        return -1;
    }

    delete[] destItems;
    free(refValues);
    if (startRefValues != NULL) free(startRefValues);

    return 0;
}
// Runs test_atomic_function for every applicable configuration of the given
// kernel core: global and local variants of int/uint always, and of
// long/ulong only for the legacy "atom_" spelling.  Returns the OR of all
// individual results (0 on success).
int test_atomic_function_set(cl_device_id deviceID, cl_context context,
                             cl_command_queue queue, int num_elements,
                             const char *programCore, TestFns testFns,
                             bool extended, bool matchGroupSize,
                             bool usingAtomicPrefix)
{
    log_info(" Testing %s functions...\n",
             usingAtomicPrefix ? "atomic_" : "atom_");

    int failures = 0;
    const bool localFlags[2] = { false, true };

    // 32-bit signed and unsigned, global then local.
    for (size_t l = 0; l < 2; l++)
    {
        failures |= test_atomic_function(deviceID, context, queue,
                                         num_elements, programCore, testFns,
                                         extended, localFlags[l], kInt,
                                         matchGroupSize);
        failures |= test_atomic_function(deviceID, context, queue,
                                         num_elements, programCore, testFns,
                                         extended, localFlags[l], kUInt,
                                         matchGroupSize);
    }

    // Only the 32 bit atomic functions use the "atomic" prefix in 1.1, the 64
    // bit functions still use the "atom" prefix. The argument usingAtomicPrefix
    // is set to true if programCore was generated with the "atomic" prefix.
    if (!usingAtomicPrefix)
    {
        for (size_t l = 0; l < 2; l++)
        {
            failures |= test_atomic_function(deviceID, context, queue,
                                             num_elements, programCore,
                                             testFns, extended, localFlags[l],
                                             kLong, matchGroupSize);
            failures |= test_atomic_function(deviceID, context, queue,
                                             num_elements, programCore,
                                             testFns, extended, localFlags[l],
                                             kULong, matchGroupSize);
        }
    }

    return failures;
}
#pragma mark ---- add

// Kernel cores for the add test: the first add's return value is recorded as
// this thread's "old" value, then three more adds follow, so each thread
// contributes 4 * (tid + 3) to destMemory[0] in total.
const char atom_add_core[] =
    " oldValues[tid] = atom_add( &destMemory[0], tid + 3 );\n"
    " atom_add( &destMemory[0], tid + 3 );\n"
    " atom_add( &destMemory[0], tid + 3 );\n"
    " atom_add( &destMemory[0], tid + 3 );\n";

// Same test using the OpenCL 1.1 "atomic_" spelling.
const char atomic_add_core[] =
    " oldValues[tid] = atomic_add( &destMemory[0], tid + 3 );\n"
    " atomic_add( &destMemory[0], tid + 3 );\n"
    " atomic_add( &destMemory[0], tid + 3 );\n"
    " atomic_add( &destMemory[0], tid + 3 );\n";
// Expected destination value for the 32-bit add test: every thread adds
// (tid + 3) four times, starting from 0.
cl_int test_atomic_add_result_int(size_t size, cl_int *startRefValues,
                                  size_t whichDestValue)
{
    cl_int sum = 0;
    for (size_t tid = 0; tid < size; tid++) sum += 4 * ((cl_int)tid + 3);
    return sum;
}
// Expected destination value for the 64-bit add test (same formula as the
// 32-bit version, accumulated in a cl_long).
cl_long test_atomic_add_result_long(size_t size, cl_long *startRefValues,
                                    size_t whichDestValue)
{
    cl_long sum = 0;
    for (size_t tid = 0; tid < size; tid++) sum += 4 * (tid + 3);
    return sum;
}
// Entry point for the add test: runs the legacy atom_ spelling, then the 1.1
// atomic_ spelling, stopping at the first failing set.
int test_atomic_add(cl_device_id deviceID, cl_context context,
                    cl_command_queue queue, int num_elements)
{
    TestFns set = { 0,
                    0LL,
                    NULL,
                    test_atomic_add_result_int,
                    NULL,
                    NULL,
                    test_atomic_add_result_long,
                    NULL,
                    NULL };

    const char *cores[2] = { atom_add_core, atomic_add_core };
    for (int v = 0; v < 2; v++)
    {
        if (test_atomic_function_set(deviceID, context, queue, num_elements,
                                     cores[v], set, false,
                                     /*matchGroupSize*/ false,
                                     /*usingAtomicPrefix*/ v == 1)
            != 0)
            return -1;
    }
    return 0;
}
#pragma mark ---- sub

// Kernel cores for the sub test: each thread subtracts (tid + 3) once and
// records the previous value.
const char atom_sub_core[] =
    " oldValues[tid] = atom_sub( &destMemory[0], tid + 3 );\n";

const char atomic_sub_core[] =
    " oldValues[tid] = atomic_sub( &destMemory[0], tid + 3 );\n";
// Expected destination value for the 32-bit sub test: every thread subtracts
// (tid + 3) from the INT_TEST_VALUE start value.
cl_int test_atomic_sub_result_int(size_t size, cl_int *startRefValues,
                                  size_t whichDestValue)
{
    cl_int remaining = INT_TEST_VALUE;
    for (size_t tid = 0; tid < size; tid++) remaining -= (cl_int)tid + 3;
    return remaining;
}
// Expected destination value for the 64-bit sub test, starting from
// LONG_TEST_VALUE.
cl_long test_atomic_sub_result_long(size_t size, cl_long *startRefValues,
                                    size_t whichDestValue)
{
    cl_long remaining = LONG_TEST_VALUE;
    for (size_t tid = 0; tid < size; tid++) remaining -= tid + 3;
    return remaining;
}
// Entry point for the sub test: atom_ spelling first, then atomic_, stopping
// at the first failing set.
int test_atomic_sub(cl_device_id deviceID, cl_context context,
                    cl_command_queue queue, int num_elements)
{
    TestFns set = { INT_TEST_VALUE,
                    LONG_TEST_VALUE,
                    NULL,
                    test_atomic_sub_result_int,
                    NULL,
                    NULL,
                    test_atomic_sub_result_long,
                    NULL,
                    NULL };

    const char *cores[2] = { atom_sub_core, atomic_sub_core };
    for (int v = 0; v < 2; v++)
    {
        if (test_atomic_function_set(deviceID, context, queue, num_elements,
                                     cores[v], set, false,
                                     /*matchGroupSize*/ false,
                                     /*usingAtomicPrefix*/ v == 1)
            != 0)
            return -1;
    }
    return 0;
}
#pragma mark ---- xchg

// Kernel cores for the exchange test: each thread swaps its tid into
// destMemory[0] and records the previous value; verification checks that
// every tid (plus the start value) was observed exactly once.
const char atom_xchg_core[] =
    " oldValues[tid] = atom_xchg( &destMemory[0], tid );\n";

const char atomic_xchg_core[] =
    " oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n";

// Same body as atomic_xchg_core: with a float destination buffer the int tid
// is implicitly converted, exercising the float overload of atomic_xchg.
const char atomic_xchg_float_core[] =
    " oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n";
bool test_atomic_xchg_verify_int(size_t size, cl_int *refValues,
cl_int finalValue)
{
/* For xchg, each value from 0 to size - 1 should have an entry in the ref
* array, and ONLY one entry */
char *valids;
size_t i;
char originalValidCount = 0;
valids = (char *)malloc(sizeof(char) * size);
memset(valids, 0, sizeof(char) * size);
for (i = 0; i < size; i++)
{
if (refValues[i] == INT_TEST_VALUE)
{
// Special initial value
originalValidCount++;
continue;
}
if (refValues[i] < 0 || (size_t)refValues[i] >= size)
{
log_error(
"ERROR: Reference value %ld outside of valid range! (%d)\n", i,
refValues[i]);
return false;
}
valids[refValues[i]]++;
}
/* Note: ONE entry will have zero count. It'll be the last one that
executed, because that value should be the final value outputted */
if (valids[finalValue] > 0)
{
log_error("ERROR: Final value %d was also in ref list!\n", finalValue);
return false;
}
else
valids[finalValue] = 1; // So the following loop will be okay
/* Now check that every entry has one and only one count */
if (originalValidCount != 1)
{
log_error("ERROR: Starting reference value %d did not occur "
"once-and-only-once (occurred %d)\n",
65191, originalValidCount);
return false;
}
for (i = 0; i < size; i++)
{
if (valids[i] != 1)
{
log_error("ERROR: Reference value %ld did not occur "
"once-and-only-once (occurred %d)\n",
i, valids[i]);
for (size_t j = 0; j < size; j++)
log_info("%d: %d\n", (int)j, (int)valids[j]);
return false;
}
}
free(valids);
return true;
}
bool test_atomic_xchg_verify_long(size_t size, cl_long *refValues,
cl_long finalValue)
{
/* For xchg, each value from 0 to size - 1 should have an entry in the ref
* array, and ONLY one entry */
char *valids;
size_t i;
char originalValidCount = 0;
valids = (char *)malloc(sizeof(char) * size);
memset(valids, 0, sizeof(char) * size);
for (i = 0; i < size; i++)
{
if (refValues[i] == LONG_TEST_VALUE)
{
// Special initial value
originalValidCount++;
continue;
}
if (refValues[i] < 0 || (size_t)refValues[i] >= size)
{
log_error(
"ERROR: Reference value %ld outside of valid range! (%lld)\n",
i, refValues[i]);
return false;
}
valids[refValues[i]]++;
}
/* Note: ONE entry will have zero count. It'll be the last one that
executed, because that value should be the final value outputted */
if (valids[finalValue] > 0)
{
log_error("ERROR: Final value %lld was also in ref list!\n",
finalValue);
return false;
}
else
valids[finalValue] = 1; // So the following loop will be okay
/* Now check that every entry has one and only one count */
if (originalValidCount != 1)
{
log_error("ERROR: Starting reference value %d did not occur "
"once-and-only-once (occurred %d)\n",
65191, originalValidCount);
return false;
}
for (i = 0; i < size; i++)
{
if (valids[i] != 1)
{
log_error("ERROR: Reference value %ld did not occur "
"once-and-only-once (occurred %d)\n",
i, valids[i]);
for (size_t j = 0; j < size; j++)
log_info("%d: %d\n", (int)j, (int)valids[j]);
return false;
}
}
free(valids);
return true;
}
bool test_atomic_xchg_verify_float(size_t size, cl_float *refValues,
cl_float finalValue)
{
/* For xchg, each value from 0 to size - 1 should have an entry in the ref
* array, and ONLY one entry */
char *valids;
size_t i;
char originalValidCount = 0;
valids = (char *)malloc(sizeof(char) * size);
memset(valids, 0, sizeof(char) * size);
for (i = 0; i < size; i++)
{
cl_int *intRefValue = (cl_int *)(&refValues[i]);
if (*intRefValue == INT_TEST_VALUE)
{
// Special initial value
originalValidCount++;
continue;
}
if (refValues[i] < 0 || (size_t)refValues[i] >= size)
{
log_error(
"ERROR: Reference value %ld outside of valid range! (%a)\n", i,
refValues[i]);
return false;
}
valids[(int)refValues[i]]++;
}
/* Note: ONE entry will have zero count. It'll be the last one that
executed, because that value should be the final value outputted */
if (valids[(int)finalValue] > 0)
{
log_error("ERROR: Final value %a was also in ref list!\n", finalValue);
return false;
}
else
valids[(int)finalValue] = 1; // So the following loop will be okay
/* Now check that every entry has one and only one count */
if (originalValidCount != 1)
{
log_error("ERROR: Starting reference value %d did not occur "
"once-and-only-once (occurred %d)\n",
65191, originalValidCount);
return false;
}
for (i = 0; i < size; i++)
{
if (valids[i] != 1)
{
log_error("ERROR: Reference value %ld did not occur "
"once-and-only-once (occurred %d)\n",
i, valids[i]);
for (size_t j = 0; j < size; j++)
log_info("%d: %d\n", (int)j, (int)valids[j]);
return false;
}
}
free(valids);
return true;
}
// Entry point for the exchange test: integer sets with both spellings, then
// the float variant (global and local).  xchg is limited to one work-group,
// hence matchGroupSize=true everywhere.
int test_atomic_xchg(cl_device_id deviceID, cl_context context,
                     cl_command_queue queue, int num_elements)
{
    TestFns set = { INT_TEST_VALUE,
                    LONG_TEST_VALUE,
                    NULL,
                    NULL,
                    NULL,
                    test_atomic_xchg_verify_int,
                    NULL,
                    NULL,
                    test_atomic_xchg_verify_long,
                    NULL,
                    NULL,
                    test_atomic_xchg_verify_float };

    int failures = 0;
    failures |= test_atomic_function_set(deviceID, context, queue,
                                         num_elements, atom_xchg_core, set,
                                         false, true,
                                         /*usingAtomicPrefix*/ false);
    failures |= test_atomic_function_set(deviceID, context, queue,
                                         num_elements, atomic_xchg_core, set,
                                         false, true,
                                         /*usingAtomicPrefix*/ true);

    const bool localFlags[2] = { false, true };
    for (size_t l = 0; l < 2; l++)
        failures |= test_atomic_function(deviceID, context, queue,
                                         num_elements, atomic_xchg_float_core,
                                         set, false, localFlags[l], kFloat,
                                         true);

    return failures;
}
#pragma mark ---- min

// Kernel cores for the min test: each thread folds its generated reference
// value into destMemory[0] and records the previous minimum.
const char atom_min_core[] =
    " oldValues[tid] = atom_min( &destMemory[0], oldValues[tid] );\n";

const char atomic_min_core[] =
    " oldValues[tid] = atomic_min( &destMemory[0], oldValues[tid] );\n";
// Expected destination value for the 32-bit min test: the smallest generated
// reference value (the 0x7fffffff start value is the identity for min).
cl_int test_atomic_min_result_int(size_t size, cl_int *startRefValues,
                                  size_t whichDestValue)
{
    cl_int smallest = 0x7fffffffL;
    for (size_t i = 0; i < size; i++)
        smallest = (startRefValues[i] < smallest) ? startRefValues[i]
                                                  : smallest;
    return smallest;
}
// Generates 32-bit reference values in [0x3fffffff, 0x7ffffffd]: large
// positive values, so the minimum lies below the start value but is never
// negative.
void test_atomic_min_gen_int(size_t size, cl_int *startRefValues, MTdata d)
{
    for (size_t i = 0; i < size; i++)
    {
        cl_int base = (cl_int)(genrand_int32(d) % 0x3fffffff);
        startRefValues[i] = base + 0x3fffffff;
    }
}
// Expected destination value for the 64-bit min test: the smallest generated
// reference value (the start value 0x7fffffffffffffff is the identity).
cl_long test_atomic_min_result_long(size_t size, cl_long *startRefValues,
                                    size_t whichDestValue)
{
    cl_long smallest = 0x7fffffffffffffffLL;
    for (size_t i = 0; i < size; i++)
        smallest = (startRefValues[i] < smallest) ? startRefValues[i]
                                                  : smallest;
    return smallest;
}
// Generates positive 64-bit reference values: a random 32-bit value OR'd with
// up to 31 more random bits shifted left by 16.
void test_atomic_min_gen_long(size_t size, cl_long *startRefValues, MTdata d)
{
    for (size_t i = 0; i < size; i++)
    {
        startRefValues[i] = (cl_long)(
            genrand_int32(d)
            | (((cl_long)genrand_int32(d) & 0x7fffffffL) << 16));
    }
}
// Entry point for the min test (an "extended" atomic): atom_ spelling first,
// then atomic_, stopping at the first failing set.
int test_atomic_min(cl_device_id deviceID, cl_context context,
                    cl_command_queue queue, int num_elements)
{
    TestFns set = { 0x7fffffffL,
                    0x7fffffffffffffffLL,
                    NULL,
                    test_atomic_min_result_int,
                    test_atomic_min_gen_int,
                    NULL,
                    test_atomic_min_result_long,
                    test_atomic_min_gen_long,
                    NULL };

    const char *cores[2] = { atom_min_core, atomic_min_core };
    for (int v = 0; v < 2; v++)
    {
        if (test_atomic_function_set(deviceID, context, queue, num_elements,
                                     cores[v], set, true,
                                     /*matchGroupSize*/ false,
                                     /*usingAtomicPrefix*/ v == 1)
            != 0)
            return -1;
    }
    return 0;
}
#pragma mark ---- max

// Kernel cores for the max test: each thread folds its generated reference
// value into destMemory[0] and records the previous maximum.
const char atom_max_core[] =
    " oldValues[tid] = atom_max( &destMemory[0], oldValues[tid] );\n";

const char atomic_max_core[] =
    " oldValues[tid] = atomic_max( &destMemory[0], oldValues[tid] );\n";
// Expected destination value for the 32-bit max test: the largest generated
// reference value (the 0 start value is the identity, values are positive).
cl_int test_atomic_max_result_int(size_t size, cl_int *startRefValues,
                                  size_t whichDestValue)
{
    cl_int largest = 0;
    for (size_t i = 0; i < size; i++)
        largest = (startRefValues[i] > largest) ? startRefValues[i] : largest;
    return largest;
}
// Generates 32-bit reference values in [0x3fffffff, 0x7ffffffd] (same scheme
// as the min test).
void test_atomic_max_gen_int(size_t size, cl_int *startRefValues, MTdata d)
{
    for (size_t i = 0; i < size; i++)
    {
        cl_int base = (cl_int)(genrand_int32(d) % 0x3fffffff);
        startRefValues[i] = base + 0x3fffffff;
    }
}
// Expected destination value for the 64-bit max test: the largest generated
// reference value (the 0 start value is the identity, values are positive).
cl_long test_atomic_max_result_long(size_t size, cl_long *startRefValues,
                                    size_t whichDestValue)
{
    cl_long largest = 0;
    for (size_t i = 0; i < size; i++)
        largest = (startRefValues[i] > largest) ? startRefValues[i] : largest;
    return largest;
}
// Generates positive 64-bit reference values (same scheme as the min test).
void test_atomic_max_gen_long(size_t size, cl_long *startRefValues, MTdata d)
{
    for (size_t i = 0; i < size; i++)
    {
        startRefValues[i] = (cl_long)(
            genrand_int32(d)
            | (((cl_long)genrand_int32(d) & 0x7fffffffL) << 16));
    }
}
// Entry point for the max test (an "extended" atomic): atom_ spelling first,
// then atomic_, stopping at the first failing set.
int test_atomic_max(cl_device_id deviceID, cl_context context,
                    cl_command_queue queue, int num_elements)
{
    TestFns set = { 0,
                    0,
                    NULL,
                    test_atomic_max_result_int,
                    test_atomic_max_gen_int,
                    NULL,
                    test_atomic_max_result_long,
                    test_atomic_max_gen_long,
                    NULL };

    const char *cores[2] = { atom_max_core, atomic_max_core };
    for (int v = 0; v < 2; v++)
    {
        if (test_atomic_function_set(deviceID, context, queue, num_elements,
                                     cores[v], set, true,
                                     /*matchGroupSize*/ false,
                                     /*usingAtomicPrefix*/ v == 1)
            != 0)
            return -1;
    }
    return 0;
}
#pragma mark ---- inc

// Kernel cores for the increment test: each thread increments destMemory[0]
// once and records the previous value.
const char atom_inc_core[] =
    " oldValues[tid] = atom_inc( &destMemory[0] );\n";

const char atomic_inc_core[] =
    " oldValues[tid] = atomic_inc( &destMemory[0] );\n";
// Expected 32-bit result: each of the 'size' threads increments once.
cl_int test_atomic_inc_result_int(size_t size, cl_int *startRefValues,
                                  size_t whichDestValue)
{
    return (cl_int)size + INT_TEST_VALUE;
}
// Expected 64-bit result: each of the 'size' threads increments once.
cl_long test_atomic_inc_result_long(size_t size, cl_long *startRefValues,
                                    size_t whichDestValue)
{
    return size + LONG_TEST_VALUE;
}
// Entry point for the increment test: atom_ spelling first, then atomic_,
// stopping at the first failing set.
int test_atomic_inc(cl_device_id deviceID, cl_context context,
                    cl_command_queue queue, int num_elements)
{
    TestFns set = { INT_TEST_VALUE,
                    LONG_TEST_VALUE,
                    NULL,
                    test_atomic_inc_result_int,
                    NULL,
                    NULL,
                    test_atomic_inc_result_long,
                    NULL,
                    NULL };

    const char *cores[2] = { atom_inc_core, atomic_inc_core };
    for (int v = 0; v < 2; v++)
    {
        if (test_atomic_function_set(deviceID, context, queue, num_elements,
                                     cores[v], set, false,
                                     /*matchGroupSize*/ false,
                                     /*usingAtomicPrefix*/ v == 1)
            != 0)
            return -1;
    }
    return 0;
}
#pragma mark ---- dec

// Kernel cores for the decrement test: each thread decrements destMemory[0]
// once and records the previous value.
const char atom_dec_core[] =
    " oldValues[tid] = atom_dec( &destMemory[0] );\n";

const char atomic_dec_core[] =
    " oldValues[tid] = atomic_dec( &destMemory[0] );\n";
// Expected 32-bit result: each of the 'size' threads decrements once.
cl_int test_atomic_dec_result_int(size_t size, cl_int *startRefValues,
                                  size_t whichDestValue)
{
    return INT_TEST_VALUE - (cl_int)size;
}
// Expected 64-bit result: each of the 'size' threads decrements once.
cl_long test_atomic_dec_result_long(size_t size, cl_long *startRefValues,
                                    size_t whichDestValue)
{
    return LONG_TEST_VALUE - size;
}
// Entry point for the decrement test: atom_ spelling first, then atomic_,
// stopping at the first failing set.
int test_atomic_dec(cl_device_id deviceID, cl_context context,
                    cl_command_queue queue, int num_elements)
{
    TestFns set = { INT_TEST_VALUE,
                    LONG_TEST_VALUE,
                    NULL,
                    test_atomic_dec_result_int,
                    NULL,
                    NULL,
                    test_atomic_dec_result_long,
                    NULL,
                    NULL };

    const char *cores[2] = { atom_dec_core, atomic_dec_core };
    for (int v = 0; v < 2; v++)
    {
        if (test_atomic_function_set(deviceID, context, queue, num_elements,
                                     cores[v], set, false,
                                     /*matchGroupSize*/ false,
                                     /*usingAtomicPrefix*/ v == 1)
            != 0)
            return -1;
    }
    return 0;
}
#pragma mark ---- cmpxchg

/* We test cmpxchg by implementing (the long way) atom_add */
// Each thread retries a compare-exchange loop until it succeeds in adding
// (tid + 2) to destMemory[0]; the value observed by the successful exchange
// is recorded as this thread's "old" value.
// clang-format off
const char atom_cmpxchg_core[] =
    " int oldValue, origValue, newValue;\n"
    " do { \n"
    " origValue = destMemory[0];\n"
    " newValue = origValue + tid + 2;\n"
    " oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n"
    " } while( oldValue != origValue );\n"
    " oldValues[tid] = oldValue;\n";

// 64-bit variant of the same loop (long temporaries).
const char atom_cmpxchg64_core[] =
    " long oldValue, origValue, newValue;\n"
    " do { \n"
    " origValue = destMemory[0];\n"
    " newValue = origValue + tid + 2;\n"
    " oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n"
    " } while( oldValue != origValue );\n"
    " oldValues[tid] = oldValue;\n";

// 32-bit variant using the OpenCL 1.1 "atomic_" spelling.
const char atomic_cmpxchg_core[] =
    " int oldValue, origValue, newValue;\n"
    " do { \n"
    " origValue = destMemory[0];\n"
    " newValue = origValue + tid + 2;\n"
    " oldValue = atomic_cmpxchg( &destMemory[0], origValue, newValue );\n"
    " } while( oldValue != origValue );\n"
    " oldValues[tid] = oldValue;\n";
// clang-format on
// Expected destination value for the 32-bit cmpxchg test: the loop emulates
// adding (tid + 2) per thread to the INT_TEST_VALUE start value.
cl_int test_atomic_cmpxchg_result_int(size_t size, cl_int *startRefValues,
                                      size_t whichDestValue)
{
    cl_int expected = INT_TEST_VALUE;
    for (size_t tid = 0; tid < size; tid++) expected += (cl_int)tid + 2;
    return expected;
}
// Expected destination value for the 64-bit cmpxchg test, starting from
// LONG_TEST_VALUE.
cl_long test_atomic_cmpxchg_result_long(size_t size, cl_long *startRefValues,
                                        size_t whichDestValue)
{
    cl_long expected = LONG_TEST_VALUE;
    for (size_t tid = 0; tid < size; tid++) expected += tid + 2;
    return expected;
}
// Entry point for the cmpxchg test.  Runs the cases directly (rather than via
// test_atomic_function_set) because the 64-bit kernels need a different core.
// cmpxchg is limited to one work-group, hence matchGroupSize=true everywhere.
int test_atomic_cmpxchg(cl_device_id deviceID, cl_context context,
                        cl_command_queue queue, int num_elements)
{
    TestFns set = { INT_TEST_VALUE,
                    LONG_TEST_VALUE,
                    NULL,
                    test_atomic_cmpxchg_result_int,
                    NULL,
                    NULL,
                    test_atomic_cmpxchg_result_long,
                    NULL,
                    NULL };

    int failures = 0;
    const bool localFlags[2] = { false, true };

    log_info(" Testing atom_ functions...\n");
    // 32-bit atom_cmpxchg: global then local, signed then unsigned.
    for (size_t l = 0; l < 2; l++)
    {
        failures |= test_atomic_function(deviceID, context, queue,
                                         num_elements, atom_cmpxchg_core, set,
                                         false, localFlags[l], kInt, true);
        failures |= test_atomic_function(deviceID, context, queue,
                                         num_elements, atom_cmpxchg_core, set,
                                         false, localFlags[l], kUInt, true);
    }
    // 64-bit atom_cmpxchg.
    for (size_t l = 0; l < 2; l++)
    {
        failures |= test_atomic_function(deviceID, context, queue,
                                         num_elements, atom_cmpxchg64_core,
                                         set, false, localFlags[l], kLong,
                                         true);
        failures |= test_atomic_function(deviceID, context, queue,
                                         num_elements, atom_cmpxchg64_core,
                                         set, false, localFlags[l], kULong,
                                         true);
    }

    log_info(" Testing atomic_ functions...\n");
    // 32-bit atomic_cmpxchg (no 64-bit atomic_ spelling in OpenCL 1.1).
    for (size_t l = 0; l < 2; l++)
    {
        failures |= test_atomic_function(deviceID, context, queue,
                                         num_elements, atomic_cmpxchg_core,
                                         set, false, localFlags[l], kInt,
                                         true);
        failures |= test_atomic_function(deviceID, context, queue,
                                         num_elements, atomic_cmpxchg_core,
                                         set, false, localFlags[l], kUInt,
                                         true);
    }

    if (failures) return -1;

    return 0;
}
#pragma mark -------- Bitwise functions
/* Number of destination values needed when each work-item owns one bit of a
   destination element: ceil(threadCount / bitsPerValue). */
size_t test_bitwise_num_results(size_t threadCount, ExplicitType dataType)
{
    const size_t bitsPerValue = 8 * get_explicit_type_size(dataType);
    size_t numValues = threadCount / bitsPerValue;
    if (threadCount % bitsPerValue) numValues++;
    return numValues;
}
#pragma mark ---- and
// clang-format off
// Kernel core for the legacy atom_and test: work-item tid clears bit
// (tid % numBits) of destMemory[tid / numBits] and saves the prior value.
const char atom_and_core[] =
    " size_t numBits = sizeof( destMemory[0] ) * 8;\n"
    " int whichResult = tid / numBits;\n"
    " int bitIndex = tid - ( whichResult * numBits );\n"
    "\n"
    " oldValues[tid] = atom_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n";
// Same bit-clearing kernel using the OpenCL 1.1 atomic_and spelling.
const char atomic_and_core[] =
    " size_t numBits = sizeof( destMemory[0] ) * 8;\n"
    " int whichResult = tid / numBits;\n"
    " int bitIndex = tid - ( whichResult * numBits );\n"
    "\n"
    " oldValues[tid] = atomic_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n";
// clang-format on
/* Predicts destination value `whichResult` after `size` work-items each
   and-cleared their bit (32 work-items per destination int). */
cl_int test_atomic_and_result_int(size_t size, cl_int *startRefValues,
                                  size_t whichResult)
{
    const size_t threadsPerValue = 32;
    size_t numDestValues = (size + threadsPerValue - 1) / threadsPerValue;

    // Every fully-covered destination int had all 32 bits cleared.
    if (whichResult < numDestValues - 1) return 0;

    // The last int only has workers for its low bits; the untouched upper
    // bits keep their initial 1s.
    size_t coveredBits = size - whichResult * threadsPerValue;
    cl_int expected = (cl_int)0xffffffffL;
    size_t bit = 0;
    while (bit < coveredBits) expected &= ~(1 << bit++);
    return expected;
}
/* 64-bit counterpart of the and-reference: 64 work-items per destination. */
cl_long test_atomic_and_result_long(size_t size, cl_long *startRefValues,
                                    size_t whichResult)
{
    const size_t threadsPerValue = 64;
    size_t numDestValues = (size + threadsPerValue - 1) / threadsPerValue;

    // Every fully-covered destination long had all 64 bits cleared.
    if (whichResult < numDestValues - 1) return 0;

    // The last long only has workers for its low bits; the untouched upper
    // bits keep their initial 1s.
    size_t coveredBits = size - whichResult * threadsPerValue;
    cl_long expected = (cl_long)0xffffffffffffffffLL;
    size_t bit = 0;
    while (bit < coveredBits) expected &= ~(1LL << bit++);
    return expected;
}
/* Runs the and-kernel function set with both the legacy atom_and and the
   OpenCL 1.1 atomic_and spellings; fails fast on the first broken variant. */
int test_atomic_and(cl_device_id deviceID, cl_context context,
                    cl_command_queue queue, int num_elements)
{
    TestFns set = { 0xffffffff,
                    0xffffffffffffffffLL,
                    test_bitwise_num_results,
                    test_atomic_and_result_int,
                    NULL,
                    NULL,
                    test_atomic_and_result_long,
                    NULL,
                    NULL };

    const char *cores[] = { atom_and_core, atomic_and_core };
    for (int variant = 0; variant < 2; variant++)
    {
        const bool usesAtomicPrefix = (variant == 1);
        if (test_atomic_function_set(deviceID, context, queue, num_elements,
                                     cores[variant], set, true,
                                     /*matchGroupSize*/ false,
                                     usesAtomicPrefix)
            != 0)
            return -1;
    }
    return 0;
}
#pragma mark ---- or
// clang-format off
// Kernel core for the legacy atom_or test: work-item tid sets bit
// (tid % numBits) of destMemory[tid / numBits] and saves the prior value.
const char atom_or_core[] =
    " size_t numBits = sizeof( destMemory[0] ) * 8;\n"
    " int whichResult = tid / numBits;\n"
    " int bitIndex = tid - ( whichResult * numBits );\n"
    "\n"
    " oldValues[tid] = atom_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n";
// Same bit-setting kernel using the OpenCL 1.1 atomic_or spelling.
const char atomic_or_core[] =
    " size_t numBits = sizeof( destMemory[0] ) * 8;\n"
    " int whichResult = tid / numBits;\n"
    " int bitIndex = tid - ( whichResult * numBits );\n"
    "\n"
    " oldValues[tid] = atomic_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n";
// clang-format on
/* Predicts destination value `whichResult` after `size` work-items each
   or-set their bit (32 work-items per destination int). */
cl_int test_atomic_or_result_int(size_t size, cl_int *startRefValues,
                                 size_t whichResult)
{
    const size_t threadsPerValue = 32;
    size_t numDestValues = (size + threadsPerValue - 1) / threadsPerValue;

    // Every fully-covered destination int had all 32 bits or'ed to 1.
    if (whichResult < numDestValues - 1) return 0xffffffff;

    // Only the low bits of the final int were set by the remaining threads.
    size_t coveredBits = size - whichResult * threadsPerValue;
    cl_int expected = 0;
    size_t bit = 0;
    while (bit < coveredBits) expected |= (1 << bit++);
    return expected;
}
/* 64-bit counterpart of the or-reference: 64 work-items per destination. */
cl_long test_atomic_or_result_long(size_t size, cl_long *startRefValues,
                                   size_t whichResult)
{
    const size_t threadsPerValue = 64;
    size_t numDestValues = (size + threadsPerValue - 1) / threadsPerValue;

    // Every fully-covered destination long had all 64 bits or'ed to 1.
    if (whichResult < numDestValues - 1) return 0xffffffffffffffffLL;

    // Only the low bits of the final long were set by the remaining threads.
    size_t coveredBits = size - whichResult * threadsPerValue;
    cl_long expected = 0;
    size_t bit = 0;
    while (bit < coveredBits) expected |= (1LL << bit++);
    return expected;
}
/* Runs the or-kernel function set with both the legacy atom_or and the
   OpenCL 1.1 atomic_or spellings; fails fast on the first broken variant. */
int test_atomic_or(cl_device_id deviceID, cl_context context,
                   cl_command_queue queue, int num_elements)
{
    TestFns set = { 0,
                    0LL,
                    test_bitwise_num_results,
                    test_atomic_or_result_int,
                    NULL,
                    NULL,
                    test_atomic_or_result_long,
                    NULL,
                    NULL };

    const char *cores[] = { atom_or_core, atomic_or_core };
    for (int variant = 0; variant < 2; variant++)
    {
        const bool usesAtomicPrefix = (variant == 1);
        if (test_atomic_function_set(deviceID, context, queue, num_elements,
                                     cores[variant], set, true,
                                     /*matchGroupSize*/ false,
                                     usesAtomicPrefix)
            != 0)
            return -1;
    }
    return 0;
}
#pragma mark ---- xor
// clang-format off
// Kernel core for the legacy atom_xor test: every work-item toggles bit
// (tid % numBits) of the single destination value and saves the prior value.
const char atom_xor_core[] =
    " size_t numBits = sizeof( destMemory[0] ) * 8;\n"
    " int bitIndex = tid & ( numBits - 1 );\n"
    "\n"
    " oldValues[tid] = atom_xor( &destMemory[0], 1L << bitIndex );\n";
// clang-format on
// clang-format off
// Same bit-toggling kernel using the OpenCL 1.1 atomic_xor spelling.
const char atomic_xor_core[] =
    " size_t numBits = sizeof( destMemory[0] ) * 8;\n"
    " int bitIndex = tid & ( numBits - 1 );\n"
    "\n"
    " oldValues[tid] = atomic_xor( &destMemory[0], 1L << bitIndex );\n";
// clang-format on
/* Predicts the single 32-bit destination value: the seed with each
   work-item's bit (tid % 32) toggled once. */
cl_int test_atomic_xor_result_int(size_t size, cl_int *startRefValues,
                                  size_t whichResult)
{
    cl_int expected = 0x2f08ab41;
    size_t tid = 0;
    while (tid < size)
    {
        expected ^= 1 << (tid & 31);
        ++tid;
    }
    return expected;
}
/* 64-bit counterpart of the xor-reference: toggles bit (tid % 64). */
cl_long test_atomic_xor_result_long(size_t size, cl_long *startRefValues,
                                    size_t whichResult)
{
    cl_long expected = 0x2f08ab418ba0541LL;
    size_t tid = 0;
    while (tid < size)
    {
        expected ^= 1LL << (tid & 63);
        ++tid;
    }
    return expected;
}
/* Runs the xor-kernel function set with both the legacy atom_xor and the
   OpenCL 1.1 atomic_xor spellings; fails fast on the first broken variant. */
int test_atomic_xor(cl_device_id deviceID, cl_context context,
                    cl_command_queue queue, int num_elements)
{
    TestFns set = { 0x2f08ab41,
                    0x2f08ab418ba0541LL,
                    NULL,
                    test_atomic_xor_result_int,
                    NULL,
                    NULL,
                    test_atomic_xor_result_long,
                    NULL,
                    NULL };

    const char *cores[] = { atom_xor_core, atomic_xor_core };
    for (int variant = 0; variant < 2; variant++)
    {
        const bool usesAtomicPrefix = (variant == 1);
        if (test_atomic_function_set(deviceID, context, queue, num_elements,
                                     cores[variant], set, true,
                                     /*matchGroupSize*/ false,
                                     usesAtomicPrefix)
            != 0)
            return -1;
    }
    return 0;
}