mirror of
https://github.com/KhronosGroup/OpenCL-CTS.git
synced 2026-03-19 06:09:01 +00:00
The maintenance of the conformance tests is moving to Github. This commit contains all the changes that have been done in Gitlab since the first public release of the conformance tests. Signed-off-by: Kevin Petit <kevin.petit@arm.com>
1125 lines
42 KiB
C++
1125 lines
42 KiB
C++
//
|
|
// Copyright (c) 2017 The Khronos Group Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
#include "testBase.h"
|
|
#include "../../test_common/harness/conversions.h"
|
|
#ifndef _WIN32
|
|
#include <unistd.h>
|
|
#endif
|
|
|
|
#define INT_TEST_VALUE 402258822
|
|
#define LONG_TEST_VALUE 515154531254381446LL
|
|
|
|
|
|
extern cl_uint gRandomSeed;
|
|
|
|
// Kernel-source scaffolding for atomic tests on __global memory.
// [0] is the prologue (signature + tid); the per-test operation line(s) are
// spliced in between, and [1] closes the kernel. Both %s placeholders in the
// prologue are substituted with the OpenCL type name under test.
const char *atomic_global_pattern[] = {
    "__kernel void test_atomic_fn(volatile __global %s *destMemory, __global %s *oldValues)\n"
    "{\n"
    " int tid = get_global_id(0);\n"
    "\n"
    ,
    "\n"
    "}\n" };
|
|
|
|
// Kernel-source scaffolding for atomic tests on __local memory.
// The prologue copies the seeded values from the global buffer into local
// memory (every work-item writes the same data, then barriers); the epilogue
// barriers again and copies the final local values back out so the host can
// inspect them. The three %s placeholders take the OpenCL type name.
const char *atomic_local_pattern[] = {
    "__kernel void test_atomic_fn(__global %s *finalDest, __global %s *oldValues, volatile __local %s *destMemory, int numDestItems )\n"
    "{\n"
    " int tid = get_global_id(0);\n"
    " int dstItemIdx;\n"
    "\n"
    " // Everybody does the following line(s), but it all has the same result. We still need to ensure we sync before the atomic op, though\n"
    " for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n"
    " destMemory[ dstItemIdx ] = finalDest[ dstItemIdx ];\n"
    " barrier( CLK_LOCAL_MEM_FENCE );\n"
    "\n"
    ,
    " barrier( CLK_LOCAL_MEM_FENCE );\n"
    " // Finally, write out the last value. Again, we're synced, so everyone will be writing the same value\n"
    " for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n"
    " finalDest[ dstItemIdx ] = destMemory[ dstItemIdx ];\n"
    "}\n" };
|
|
|
|
|
|
#define TEST_COUNT 128 * 1024
|
|
|
|
|
|
// Host-side description of one atomic operation under test: the seed values
// written to the destination buffer(s) before the kernel runs, plus optional
// callbacks used to validate the results. For each width (int/long/float)
// either an "expected final value" callback or a "verify the per-thread old
// values" callback is provided; test_atomic_function treats NULL entries as
// "not applicable" and dispatches on the data type's size.
struct TestFns
{
    // Start value stored into each destination item (32-bit tests)
    cl_int mIntStartValue;
    // Start value stored into each destination item (64-bit tests)
    cl_long mLongStartValue;

    // Optional: number of destination items for a given thread count and
    // type; when NULL the harness assumes a single destination item.
    size_t (*NumResultsFn)( size_t threadSize, ExplicitType dataType );

    // Integer versions
    // Expected final value of destination item `whichDestValue`
    cl_int (*ExpectedValueIntFn)( size_t size, cl_int *startRefValues, size_t whichDestValue );
    // Optional generator for per-thread starting reference values
    void (*GenerateRefsIntFn)( size_t size, cl_int *startRefValues, MTdata d );
    // Validates the per-thread "old" values returned by the atomic op
    bool (*VerifyRefsIntFn)( size_t size, cl_int *refValues, cl_int finalValue );

    // Long versions
    cl_long (*ExpectedValueLongFn)( size_t size, cl_long *startRefValues, size_t whichDestValue );
    void (*GenerateRefsLongFn)( size_t size, cl_long *startRefValues, MTdata d );
    bool (*VerifyRefsLongFn)( size_t size, cl_long *refValues, cl_long finalValue );

    // Float versions
    cl_float (*ExpectedValueFloatFn)( size_t size, cl_float *startRefValues, size_t whichDestValue );
    void (*GenerateRefsFloatFn)( size_t size, cl_float *startRefValues, MTdata d );
    bool (*VerifyRefsFloatFn)( size_t size, cl_float *refValues, cl_float finalValue );
};
|
|
|
|
// Returns true when the device can run the requested atomic test.
// 32-bit int atomics are core from OpenCL 1.1 onward; before that (and for
// all 64-bit types) the matching cl_khr_* extension must be present. Float
// atomic_xchg only exists as the core 1.1 built-in.
//
// extensionNames is indexed as: +1 for extended ops, +2 for local memory,
// +4 for 64-bit types. The 64-bit extensions do not distinguish local from
// global, hence the duplicated final pair.
bool check_atomic_support( cl_device_id device, bool extended, bool isLocal, ExplicitType dataType )
{
    const char *extensionNames[8] = {
        "cl_khr_global_int32_base_atomics", "cl_khr_global_int32_extended_atomics",
        "cl_khr_local_int32_base_atomics", "cl_khr_local_int32_extended_atomics",
        "cl_khr_int64_base_atomics", "cl_khr_int64_extended_atomics",
        "cl_khr_int64_base_atomics", "cl_khr_int64_extended_atomics" // this line intended to be the same as the last one
    };

    size_t index = 0;
    if( extended )
        index += 1;
    if( isLocal )
        index += 2;

    size_t major, minor;

    int error = get_device_version(device, &major, &minor);
    test_error( error, "get_device_version" );

    switch (dataType)
    {
        case kInt:
        case kUInt:
            // Core behavior from 1.1; otherwise fall through to the
            // 32-bit extension query below.
            if( major * 10 + minor >= 11 )
                return true;
            break;
        case kLong:
        case kULong:
            index += 4;
            break;
        case kFloat: // this has to stay separate since the float atomics aren't in the 1.0 extensions
            return major * 10 + minor >= 11;
        default:
            log_error( "ERROR: Unsupported data type (%d) in check_atomic_support\n", dataType );
            return false;
    }

    return is_extension_available( device, extensionNames[index] );
}
|
|
|
|
// Runs one atomic-operation kernel and validates its results.
//
// Builds a kernel from (pragma + prologue pattern + programCore + epilogue
// pattern), seeds the destination buffer with the TestFns start value, runs
// it over `threadSize` work-items, then validates either the expected final
// value(s) (ExpectedValue*Fn) or the per-thread returned "old" values
// (VerifyRefs*Fn). Finally re-runs with a single work-item to confirm the
// atomic returns the pre-op value. Returns 0 on pass (or unsupported
// configuration), -1 on failure.
int test_atomic_function(cl_device_id deviceID, cl_context context, cl_command_queue queue, const char *programCore,
                         TestFns testFns,
                         bool extended, bool isLocal, ExplicitType dataType, bool matchGroupSize )
{
    clProgramWrapper program;
    clKernelWrapper kernel;
    int error;
    size_t threads[1];
    clMemWrapper streams[2];           // [0] destination value(s), [1] per-thread old values
    void *refValues, *startRefValues;
    size_t threadSize, groupSize;
    const char *programLines[4];       // pragma, header, per-test core, epilogue
    char pragma[ 512 ];
    char programHeader[ 512 ];
    MTdata d;
    size_t typeSize = get_explicit_type_size( dataType );


    // Verify we can run first
    bool isUnsigned = ( dataType == kULong ) || ( dataType == kUInt );
    if( !check_atomic_support( deviceID, extended, isLocal, dataType ) )
    {
        // Only print for the signed (unsigned comes right after, and if signed isn't supported, unsigned isn't either)
        if( dataType == kFloat )
            log_info( "\t%s float not supported\n", isLocal ? "Local" : "Global" );
        else if( !isUnsigned )
            log_info( "\t%s %sint%d not supported\n", isLocal ? "Local" : "Global", isUnsigned ? "u" : "", (int)typeSize * 8 );
        // Since we don't support the operation, they implicitly pass
        return 0;
    }
    else
    {
        if( dataType == kFloat )
            log_info( "\t%s float%s...", isLocal ? "local" : "global", isLocal ? " " : "" );
        else
            log_info( "\t%s %sint%d%s%s...", isLocal ? "local" : "global", isUnsigned ? "u" : "",
                     (int)typeSize * 8, isUnsigned ? "" : " ", isLocal ? " " : "" );
    }

    //// Set up the kernel code

    // Create the pragma line for this kernel
    bool isLong = ( dataType == kLong || dataType == kULong );
    sprintf( pragma, "#pragma OPENCL EXTENSION cl_khr%s_int%s_%s_atomics : enable\n",
            isLong ? "" : (isLocal ? "_local" : "_global"), isLong ? "64" : "32",
            extended ? "extended" : "base" );

    // Now create the program header
    const char *typeName = get_explicit_type_name( dataType );
    if( isLocal )
        sprintf( programHeader, atomic_local_pattern[ 0 ], typeName, typeName, typeName );
    else
        sprintf( programHeader, atomic_global_pattern[ 0 ], typeName, typeName );

    // Set up our entire program now
    programLines[ 0 ] = pragma;
    programLines[ 1 ] = programHeader;
    programLines[ 2 ] = programCore;
    programLines[ 3 ] = ( isLocal ) ? atomic_local_pattern[ 1 ] : atomic_global_pattern[ 1 ];

    if( create_single_kernel_helper( context, &program, &kernel, 4, programLines, "test_atomic_fn" ) )
    {
        return -1;
    }

    //// Set up to actually run
    // NOTE(review): threadSize is unconditionally 1024, so the "== 0"
    // fallback to TEST_COUNT just below is dead code.
    threadSize = 1024;
    if( threadSize == 0 )
        threadSize = TEST_COUNT;

    error = get_max_common_work_group_size( context, kernel, threadSize, &groupSize );
    test_error( error, "Unable to get thread group max size" );

    if( matchGroupSize )
        // HACK because xchg and cmpxchg apparently are limited by hardware
        threadSize = groupSize;

    if( isLocal )
    {
        // Local-memory variant runs as a single work-group
        size_t workSize;
        error = clGetKernelWorkGroupInfo( kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof( workSize ), &workSize, NULL );
        test_error( error, "Unable to obtain max work group size for device and kernel combo" );

        threadSize = groupSize = workSize;
    }


    log_info( "\t(thread count %d, group size %d)\n", (int)threadSize, (int)groupSize );

    // NOTE(review): the early "return -1" / test_error paths below leak
    // refValues, startRefValues and destItems — only the success path frees
    // them. Consider goto-based cleanup.
    refValues = (cl_int *)malloc( typeSize * threadSize );

    if( testFns.GenerateRefsIntFn != NULL )
    {
        // We have a ref generator provided
        d = init_genrand( gRandomSeed );
        startRefValues = malloc( typeSize * threadSize );
        if( typeSize == 4 )
            testFns.GenerateRefsIntFn( threadSize, (cl_int *)startRefValues, d );
        else
            testFns.GenerateRefsLongFn( threadSize, (cl_long *)startRefValues, d );
        free_mtdata(d);
        d = NULL;
    }
    else
        startRefValues = NULL;

    // If we're given a num_results function, we need to determine how many result objects we need. If
    // we don't have it, we assume it's just 1
    size_t numDestItems = ( testFns.NumResultsFn != NULL ) ? testFns.NumResultsFn( threadSize, dataType ) : 1;

    char * destItems = new char[ typeSize * numDestItems ];
    if( destItems == NULL )
    {
        log_error( "ERROR: Unable to allocate memory!\n" );
        return -1;
    }
    // Seed every destination item with the width-appropriate start value
    void * startValue = ( typeSize == 4 ) ? (void *)&testFns.mIntStartValue : (void *)&testFns.mLongStartValue;
    for( size_t i = 0; i < numDestItems; i++ )
        memcpy( destItems + i * typeSize, startValue, typeSize );

    streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), typeSize * numDestItems, destItems, NULL);
    if (!streams[0])
    {
        log_error("ERROR: Creating output array failed!\n");
        return -1;
    }
    // When no start refs were generated the buffer is merely allocated and
    // filled by the kernel's returned old values.
    streams[1] = clCreateBuffer(context, (cl_mem_flags)(( startRefValues != NULL ? CL_MEM_COPY_HOST_PTR : CL_MEM_READ_WRITE )), typeSize * threadSize, startRefValues, NULL);
    if (!streams[1])
    {
        log_error("ERROR: Creating reference array failed!\n");
        return -1;
    }

    /* Set the arguments */
    error = clSetKernelArg( kernel, 0, sizeof( streams[0] ), &streams[0] );
    test_error( error, "Unable to set indexed kernel arguments" );
    error = clSetKernelArg( kernel, 1, sizeof( streams[1] ), &streams[1] );
    test_error( error, "Unable to set indexed kernel arguments" );

    if( isLocal )
    {
        // Local variant takes an extra __local scratch buffer and its size
        error = clSetKernelArg( kernel, 2, typeSize * numDestItems, NULL );
        test_error( error, "Unable to set indexed local kernel argument" );

        cl_int numDestItemsInt = (cl_int)numDestItems;
        error = clSetKernelArg( kernel, 3, sizeof( cl_int ), &numDestItemsInt );
        test_error( error, "Unable to set indexed kernel argument" );
    }

    /* Run the kernel */
    threads[0] = threadSize;
    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, &groupSize, 0, NULL, NULL );
    test_error( error, "Unable to execute test kernel" );

    error = clEnqueueReadBuffer( queue, streams[0], true, 0, typeSize * numDestItems, destItems, 0, NULL, NULL );
    test_error( error, "Unable to read result value!" );

    error = clEnqueueReadBuffer( queue, streams[1], true, 0, typeSize * threadSize, refValues, 0, NULL, NULL );
    test_error( error, "Unable to read reference values!" );

    // If we have an expectedFn, then we need to generate a final value to compare against. If we don't
    // have one, it's because we're comparing ref values only
    if( testFns.ExpectedValueIntFn != NULL )
    {
        for( size_t i = 0; i < numDestItems; i++ )
        {
            char expected[ 8 ];
            cl_int intVal;
            cl_long longVal;
            if( typeSize == 4 )
            {
                // Int version
                intVal = testFns.ExpectedValueIntFn( threadSize, (cl_int *)startRefValues, i );
                memcpy( expected, &intVal, sizeof( intVal ) );
            }
            else
            {
                // Long version
                longVal = testFns.ExpectedValueLongFn( threadSize, (cl_long *)startRefValues, i );
                memcpy( expected, &longVal, sizeof( longVal ) );
            }

            if( memcmp( expected, destItems + i * typeSize, typeSize ) != 0 )
            {
                // Mismatch: dump every thread's start ref and returned old
                // value to aid debugging, then fail.
                if( typeSize == 4 )
                {
                    cl_int *outValue = (cl_int *)( destItems + i * typeSize );
                    log_error( "ERROR: Result %ld from kernel does not validate! (should be %d, was %d)\n", i, intVal, *outValue );
                    cl_int *startRefs = (cl_int *)startRefValues;
                    cl_int *refs = (cl_int *)refValues;
                    for( i = 0; i < threadSize; i++ )
                    {
                        if( startRefs != NULL )
                            log_info( " --- %ld - %d --- %d\n", i, startRefs[i], refs[i] );
                        else
                            log_info( " --- %ld --- %d\n", i, refs[i] );
                    }
                }
                else
                {
                    cl_long *outValue = (cl_long *)( destItems + i * typeSize );
                    log_error( "ERROR: Result %ld from kernel does not validate! (should be %lld, was %lld)\n", i, longVal, *outValue );
                    cl_long *startRefs = (cl_long *)startRefValues;
                    cl_long *refs = (cl_long *)refValues;
                    for( i = 0; i < threadSize; i++ )
                    {
                        if( startRefs != NULL )
                            log_info( " --- %ld - %lld --- %lld\n", i, startRefs[i], refs[i] );
                        else
                            log_info( " --- %ld --- %lld\n", i, refs[i] );
                    }
                }
                return -1;
            }
        }
    }

    if( testFns.VerifyRefsIntFn != NULL )
    {
        /* Use the verify function to also check the results */
        // NOTE(review): "!fn(...) != 0" parses as "(!fn(...)) != 0", i.e.
        // "verification returned false" — correct but easy to misread.
        if( dataType == kFloat )
        {
            cl_float *outValue = (cl_float *)destItems;
            if( !testFns.VerifyRefsFloatFn( threadSize, (cl_float *)refValues, *outValue ) != 0 )
            {
                log_error( "ERROR: Reference values did not validate!\n" );
                return -1;
            }
        }
        else if( typeSize == 4 )
        {
            cl_int *outValue = (cl_int *)destItems;
            if( !testFns.VerifyRefsIntFn( threadSize, (cl_int *)refValues, *outValue ) != 0 )
            {
                log_error( "ERROR: Reference values did not validate!\n" );
                return -1;
            }
        }
        else
        {
            cl_long *outValue = (cl_long *)destItems;
            if( !testFns.VerifyRefsLongFn( threadSize, (cl_long *)refValues, *outValue ) != 0 )
            {
                log_error( "ERROR: Reference values did not validate!\n" );
                return -1;
            }
        }
    }
    else if( testFns.ExpectedValueIntFn == NULL )
    {
        log_error( "ERROR: Test doesn't check total or refs; no values are verified!\n" );
        return -1;
    }


    /* Re-write the starting value */
    for( size_t i = 0; i < numDestItems; i++ )
        memcpy( destItems + i * typeSize, startValue, typeSize );
    error = clEnqueueWriteBuffer( queue, streams[0], true, 0, typeSize * numDestItems, destItems, 0, NULL, NULL );
    test_error( error, "Unable to write starting values!" );

    /* Run the kernel once for a single thread, so we can verify that the returned value is the original one */
    threads[0] = 1;
    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, threads, 0, NULL, NULL );
    test_error( error, "Unable to execute test kernel" );

    error = clEnqueueReadBuffer( queue, streams[1], true, 0, typeSize, refValues, 0, NULL, NULL );
    test_error( error, "Unable to read reference values!" );

    if( memcmp( refValues, destItems, typeSize ) != 0 )
    {
        if( typeSize == 4 )
        {
            cl_int *s = (cl_int *)destItems;
            cl_int *r = (cl_int *)refValues;
            log_error( "ERROR: atomic function operated correctly but did NOT return correct 'old' value "
                      " (should have been %d, returned %d)!\n", *s, *r );
        }
        else
        {
            cl_long *s = (cl_long *)destItems;
            cl_long *r = (cl_long *)refValues;
            log_error( "ERROR: atomic function operated correctly but did NOT return correct 'old' value "
                      " (should have been %lld, returned %lld)!\n", *s, *r );
        }
        return -1;
    }

    delete [] destItems;
    free( refValues );
    if( startRefValues != NULL )
        free( startRefValues );

    return 0;
}
|
|
|
|
// Runs one atomic-operation kernel across all applicable type/memory-space
// combinations: int and uint against global then local memory, and (for the
// 1.0 "atom_" spelling only) long and ulong likewise. Returns non-zero if
// any sub-test failed.
int test_atomic_function_set(cl_device_id deviceID, cl_context context, cl_command_queue queue, const char *programCore,
                             TestFns testFns,
                             bool extended, bool matchGroupSize, bool usingAtomicPrefix )
{
    log_info(" Testing %s functions...\n", usingAtomicPrefix ? "atomic_" : "atom_");

    int failures = 0;

    // 32-bit variants: (global int, global uint, local int, local uint)
    const ExplicitType intTypes[2] = { kInt, kUInt };
    for( int localPass = 0; localPass < 2; localPass++ )
    {
        for( int t = 0; t < 2; t++ )
            failures |= test_atomic_function( deviceID, context, queue, programCore, testFns, extended,
                                              localPass != 0, intTypes[t], matchGroupSize );
    }

    // Only the 32 bit atomic functions use the "atomic" prefix in 1.1, the 64 bit functions still use the "atom" prefix.
    // The argument usingAtomicPrefix is set to true if programCore was generated with the "atomic" prefix.
    if (!usingAtomicPrefix)
    {
        const ExplicitType longTypes[2] = { kLong, kULong };
        for( int localPass = 0; localPass < 2; localPass++ )
        {
            for( int t = 0; t < 2; t++ )
                failures |= test_atomic_function( deviceID, context, queue, programCore, testFns, extended,
                                                  localPass != 0, longTypes[t], matchGroupSize );
        }
    }

    return failures;
}
|
|
|
|
#pragma mark ---- add
|
|
|
|
// Kernel core for add: each work-item performs four atomic adds of (tid + 3);
// only the first call's returned "old" value is recorded for verification.
const char atom_add_core[] =
    " oldValues[tid] = atom_add( &destMemory[0], tid + 3 );\n"
    " atom_add( &destMemory[0], tid + 3 );\n"
    " atom_add( &destMemory[0], tid + 3 );\n"
    " atom_add( &destMemory[0], tid + 3 );\n";

// Same operation using the OpenCL 1.1 "atomic_" spelling.
const char atomic_add_core[] =
    " oldValues[tid] = atomic_add( &destMemory[0], tid + 3 );\n"
    " atomic_add( &destMemory[0], tid + 3 );\n"
    " atomic_add( &destMemory[0], tid + 3 );\n"
    " atomic_add( &destMemory[0], tid + 3 );\n";
|
|
|
|
// Host reference for atom(ic)_add (int): each of `size` work-items adds
// (tid + 3) four times, so the final value is the sum of 4*(i + 3).
cl_int test_atomic_add_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
    cl_int sum = 0;
    for( size_t idx = 0; idx < size; idx++ )
    {
        sum += 4 * ( (cl_int)idx + 3 );
    }
    return sum;
}
|
|
|
|
// Host reference for atom_add (long): sum of 4*(i + 3) over all work-items.
cl_long test_atomic_add_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
    cl_long sum = 0;
    for( size_t idx = 0; idx < size; idx++ )
    {
        sum += ( ( idx + 3 ) * 4 );
    }
    return sum;
}
|
|
|
|
// Entry point for the add test: exercises both the 1.0 "atom_" and the
// 1.1 "atomic_" spellings, validating the expected final total.
int test_atomic_add(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    TestFns set = { 0, 0LL, NULL, test_atomic_add_result_int, NULL, NULL, test_atomic_add_result_long, NULL, NULL };

    const int atomResult = test_atomic_function_set( deviceID, context, queue, atom_add_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false );
    if( atomResult != 0 )
        return -1;

    const int atomicResult = test_atomic_function_set( deviceID, context, queue, atomic_add_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true );
    if( atomicResult != 0 )
        return -1;

    return 0;
}
|
|
|
|
#pragma mark ---- sub
|
|
|
|
// Kernel cores for sub: each work-item subtracts (tid + 3) once and records
// the returned "old" value. Both API spellings are exercised.
const char atom_sub_core[] = " oldValues[tid] = atom_sub( &destMemory[0], tid + 3 );\n";

const char atomic_sub_core[] = " oldValues[tid] = atomic_sub( &destMemory[0], tid + 3 );\n";
|
|
|
|
// Host reference for atom(ic)_sub (int): start value minus the sum of
// (i + 3) over all work-items.
cl_int test_atomic_sub_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
    cl_int result = INT_TEST_VALUE;
    for( size_t k = 0; k < size; k++ )
    {
        result -= (cl_int)k + 3;
    }
    return result;
}
|
|
|
|
// Host reference for atom_sub (long): start value minus the sum of (i + 3).
cl_long test_atomic_sub_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
    cl_long result = LONG_TEST_VALUE;
    for( size_t k = 0; k < size; k++ )
    {
        result -= k + 3;
    }
    return result;
}
|
|
|
|
// Entry point for the sub test: both API spellings, expected-value checked.
int test_atomic_sub(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_sub_result_int, NULL, NULL, test_atomic_sub_result_long, NULL, NULL };

    const int atomResult = test_atomic_function_set( deviceID, context, queue, atom_sub_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false );
    if( atomResult != 0 )
        return -1;

    const int atomicResult = test_atomic_function_set( deviceID, context, queue, atomic_sub_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true );
    if( atomicResult != 0 )
        return -1;

    return 0;
}
|
|
|
|
#pragma mark ---- xchg
|
|
|
|
// Kernel cores for xchg: each work-item swaps its tid into the destination
// and records the previous value. The float variant uses the same source —
// the destination's type comes from the %s substitution in the scaffolding.
const char atom_xchg_core[] = " oldValues[tid] = atom_xchg( &destMemory[0], tid );\n";

const char atomic_xchg_core[] = " oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n";
const char atomic_xchg_float_core[] = " oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n";
|
|
|
|
bool test_atomic_xchg_verify_int( size_t size, cl_int *refValues, cl_int finalValue )
|
|
{
|
|
/* For xchg, each value from 0 to size - 1 should have an entry in the ref array, and ONLY one entry */
|
|
char *valids;
|
|
size_t i;
|
|
char originalValidCount = 0;
|
|
|
|
valids = (char *)malloc( sizeof( char ) * size );
|
|
memset( valids, 0, sizeof( char ) * size );
|
|
|
|
for( i = 0; i < size; i++ )
|
|
{
|
|
if( refValues[ i ] == INT_TEST_VALUE )
|
|
{
|
|
// Special initial value
|
|
originalValidCount++;
|
|
continue;
|
|
}
|
|
if( refValues[ i ] < 0 || (size_t)refValues[ i ] >= size )
|
|
{
|
|
log_error( "ERROR: Reference value %ld outside of valid range! (%d)\n", i, refValues[ i ] );
|
|
return false;
|
|
}
|
|
valids[ refValues[ i ] ] ++;
|
|
}
|
|
|
|
/* Note: ONE entry will have zero count. It'll be the last one that executed, because that value should be
|
|
the final value outputted */
|
|
if( valids[ finalValue ] > 0 )
|
|
{
|
|
log_error( "ERROR: Final value %d was also in ref list!\n", finalValue );
|
|
return false;
|
|
}
|
|
else
|
|
valids[ finalValue ] = 1; // So the following loop will be okay
|
|
|
|
/* Now check that every entry has one and only one count */
|
|
if( originalValidCount != 1 )
|
|
{
|
|
log_error( "ERROR: Starting reference value %d did not occur once-and-only-once (occurred %d)\n", 65191, originalValidCount );
|
|
return false;
|
|
}
|
|
for( i = 0; i < size; i++ )
|
|
{
|
|
if( valids[ i ] != 1 )
|
|
{
|
|
log_error( "ERROR: Reference value %ld did not occur once-and-only-once (occurred %d)\n", i, valids[ i ] );
|
|
for( size_t j = 0; j < size; j++ )
|
|
log_info( "%d: %d\n", (int)j, (int)valids[ j ] );
|
|
return false;
|
|
}
|
|
}
|
|
|
|
free( valids );
|
|
return true;
|
|
}
|
|
|
|
bool test_atomic_xchg_verify_long( size_t size, cl_long *refValues, cl_long finalValue )
|
|
{
|
|
/* For xchg, each value from 0 to size - 1 should have an entry in the ref array, and ONLY one entry */
|
|
char *valids;
|
|
size_t i;
|
|
char originalValidCount = 0;
|
|
|
|
valids = (char *)malloc( sizeof( char ) * size );
|
|
memset( valids, 0, sizeof( char ) * size );
|
|
|
|
for( i = 0; i < size; i++ )
|
|
{
|
|
if( refValues[ i ] == LONG_TEST_VALUE )
|
|
{
|
|
// Special initial value
|
|
originalValidCount++;
|
|
continue;
|
|
}
|
|
if( refValues[ i ] < 0 || (size_t)refValues[ i ] >= size )
|
|
{
|
|
log_error( "ERROR: Reference value %ld outside of valid range! (%lld)\n", i, refValues[ i ] );
|
|
return false;
|
|
}
|
|
valids[ refValues[ i ] ] ++;
|
|
}
|
|
|
|
/* Note: ONE entry will have zero count. It'll be the last one that executed, because that value should be
|
|
the final value outputted */
|
|
if( valids[ finalValue ] > 0 )
|
|
{
|
|
log_error( "ERROR: Final value %lld was also in ref list!\n", finalValue );
|
|
return false;
|
|
}
|
|
else
|
|
valids[ finalValue ] = 1; // So the following loop will be okay
|
|
|
|
/* Now check that every entry has one and only one count */
|
|
if( originalValidCount != 1 )
|
|
{
|
|
log_error( "ERROR: Starting reference value %d did not occur once-and-only-once (occurred %d)\n", 65191, originalValidCount );
|
|
return false;
|
|
}
|
|
for( i = 0; i < size; i++ )
|
|
{
|
|
if( valids[ i ] != 1 )
|
|
{
|
|
log_error( "ERROR: Reference value %ld did not occur once-and-only-once (occurred %d)\n", i, valids[ i ] );
|
|
for( size_t j = 0; j < size; j++ )
|
|
log_info( "%d: %d\n", (int)j, (int)valids[ j ] );
|
|
return false;
|
|
}
|
|
}
|
|
|
|
free( valids );
|
|
return true;
|
|
}
|
|
|
|
bool test_atomic_xchg_verify_float( size_t size, cl_float *refValues, cl_float finalValue )
|
|
{
|
|
/* For xchg, each value from 0 to size - 1 should have an entry in the ref array, and ONLY one entry */
|
|
char *valids;
|
|
size_t i;
|
|
char originalValidCount = 0;
|
|
|
|
valids = (char *)malloc( sizeof( char ) * size );
|
|
memset( valids, 0, sizeof( char ) * size );
|
|
|
|
for( i = 0; i < size; i++ )
|
|
{
|
|
cl_int *intRefValue = (cl_int *)( &refValues[ i ] );
|
|
if( *intRefValue == INT_TEST_VALUE )
|
|
{
|
|
// Special initial value
|
|
originalValidCount++;
|
|
continue;
|
|
}
|
|
if( refValues[ i ] < 0 || (size_t)refValues[ i ] >= size )
|
|
{
|
|
log_error( "ERROR: Reference value %ld outside of valid range! (%a)\n", i, refValues[ i ] );
|
|
return false;
|
|
}
|
|
valids[ (int)refValues[ i ] ] ++;
|
|
}
|
|
|
|
/* Note: ONE entry will have zero count. It'll be the last one that executed, because that value should be
|
|
the final value outputted */
|
|
if( valids[ (int)finalValue ] > 0 )
|
|
{
|
|
log_error( "ERROR: Final value %a was also in ref list!\n", finalValue );
|
|
return false;
|
|
}
|
|
else
|
|
valids[ (int)finalValue ] = 1; // So the following loop will be okay
|
|
|
|
/* Now check that every entry has one and only one count */
|
|
if( originalValidCount != 1 )
|
|
{
|
|
log_error( "ERROR: Starting reference value %d did not occur once-and-only-once (occurred %d)\n", 65191, originalValidCount );
|
|
return false;
|
|
}
|
|
for( i = 0; i < size; i++ )
|
|
{
|
|
if( valids[ i ] != 1 )
|
|
{
|
|
log_error( "ERROR: Reference value %ld did not occur once-and-only-once (occurred %d)\n", i, valids[ i ] );
|
|
for( size_t j = 0; j < size; j++ )
|
|
log_info( "%d: %d\n", (int)j, (int)valids[ j ] );
|
|
return false;
|
|
}
|
|
}
|
|
|
|
free( valids );
|
|
return true;
|
|
}
|
|
|
|
// Entry point for the xchg test. Verification is done on the per-thread
// "old" values (no single expected total exists for xchg). The float
// variant is run directly for global and local memory; matchGroupSize is
// true because xchg is limited to one work-group on some hardware.
int test_atomic_xchg(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, NULL, NULL, test_atomic_xchg_verify_int, NULL, NULL, test_atomic_xchg_verify_long, NULL, NULL, test_atomic_xchg_verify_float };

    int failures = 0;
    failures |= test_atomic_function_set( deviceID, context, queue, atom_xchg_core, set, false, true, /*usingAtomicPrefix*/ false );
    failures |= test_atomic_function_set( deviceID, context, queue, atomic_xchg_core, set, false, true, /*usingAtomicPrefix*/ true );

    failures |= test_atomic_function( deviceID, context, queue, atomic_xchg_float_core, set, false, false, kFloat, true );
    failures |= test_atomic_function( deviceID, context, queue, atomic_xchg_float_core, set, false, true, kFloat, true );

    return failures;
}
|
|
|
|
|
|
#pragma mark ---- min
|
|
|
|
// Kernel cores for min: each work-item swaps in the minimum of the current
// destination and its own start value, recording the previous destination.
const char atom_min_core[] = " oldValues[tid] = atom_min( &destMemory[0], oldValues[tid] );\n";

const char atomic_min_core[] = " oldValues[tid] = atomic_min( &destMemory[0], oldValues[tid] );\n";
|
|
|
|
// Host reference for atom(ic)_min (int): smallest of the generated start
// values (the destination is seeded with INT_MAX, so the pure min wins).
cl_int test_atomic_min_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
    cl_int smallest = 0x7fffffffL;
    for( size_t j = 0; j < size; j++ )
    {
        smallest = ( startRefValues[ j ] < smallest ) ? startRefValues[ j ] : smallest;
    }
    return smallest;
}
|
|
|
|
// Fills the start values with randoms in [0x3fffffff, 0x7ffffffd] so every
// candidate is positive and below the INT_MAX seed of the destination.
void test_atomic_min_gen_int( size_t size, cl_int *startRefValues, MTdata d )
{
    for( size_t j = 0; j < size; j++ )
    {
        startRefValues[ j ] = (cl_int)( genrand_int32(d) % 0x3fffffff ) + 0x3fffffff;
    }
}
|
|
|
|
// Host reference for atom_min (long): smallest of the generated start
// values (the destination is seeded with LLONG_MAX).
cl_long test_atomic_min_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
    cl_long smallest = 0x7fffffffffffffffLL;
    for( size_t j = 0; j < size; j++ )
    {
        smallest = ( startRefValues[ j ] < smallest ) ? startRefValues[ j ] : smallest;
    }
    return smallest;
}
|
|
|
|
// Fills the start values with random non-negative longs built from two
// 32-bit draws.
// NOTE(review): the second draw is shifted by 16, so its bits overlap the
// first draw's upper half — presumably intentional (values stay positive and
// well under LLONG_MAX); confirm, since the max test uses the same pattern.
void test_atomic_min_gen_long( size_t size, cl_long *startRefValues, MTdata d )
{
    for( size_t i = 0; i < size; i++ )
        startRefValues[i] = (cl_long)( genrand_int32(d) | ( ( (cl_long)genrand_int32(d) & 0x7fffffffL ) << 16 ) );
}
|
|
|
|
// Entry point for the min test (an "extended" atomic): destination seeded
// with the type's max so any generated value can win.
int test_atomic_min(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    TestFns set = { 0x7fffffffL, 0x7fffffffffffffffLL, NULL, test_atomic_min_result_int, test_atomic_min_gen_int, NULL, test_atomic_min_result_long, test_atomic_min_gen_long, NULL };

    const int atomResult = test_atomic_function_set( deviceID, context, queue, atom_min_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false );
    if( atomResult != 0 )
        return -1;

    const int atomicResult = test_atomic_function_set( deviceID, context, queue, atomic_min_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true );
    if( atomicResult != 0 )
        return -1;

    return 0;
}
|
|
|
|
|
|
#pragma mark ---- max
|
|
|
|
// Kernel cores for max: each work-item swaps in the maximum of the current
// destination and its own start value, recording the previous destination.
const char atom_max_core[] = " oldValues[tid] = atom_max( &destMemory[0], oldValues[tid] );\n";

const char atomic_max_core[] = " oldValues[tid] = atomic_max( &destMemory[0], oldValues[tid] );\n";
|
|
|
|
// Host reference for atom(ic)_max (int): largest of the generated start
// values (the destination is seeded with 0).
cl_int test_atomic_max_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
    cl_int largest = 0;
    for( size_t j = 0; j < size; j++ )
    {
        largest = ( startRefValues[ j ] > largest ) ? startRefValues[ j ] : largest;
    }
    return largest;
}
|
|
|
|
// Fills the start values with randoms in [0x3fffffff, 0x7ffffffd] so every
// candidate is positive and above the zero seed of the destination.
void test_atomic_max_gen_int( size_t size, cl_int *startRefValues, MTdata d )
{
    for( size_t j = 0; j < size; j++ )
    {
        startRefValues[ j ] = (cl_int)( genrand_int32(d) % 0x3fffffff ) + 0x3fffffff;
    }
}
|
|
|
|
// Host reference for atom_max (long): largest of the generated start
// values (the destination is seeded with 0).
cl_long test_atomic_max_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
    cl_long largest = 0;
    for( size_t j = 0; j < size; j++ )
    {
        largest = ( startRefValues[ j ] > largest ) ? startRefValues[ j ] : largest;
    }
    return largest;
}
|
|
|
|
// Fills the start values with random non-negative longs built from two
// 32-bit draws (same construction as test_atomic_min_gen_long).
// NOTE(review): the << 16 overlaps the low word's upper bits — presumably
// intentional; values remain non-negative, which the zero-seeded max needs.
void test_atomic_max_gen_long( size_t size, cl_long *startRefValues, MTdata d )
{
    for( size_t i = 0; i < size; i++ )
        startRefValues[i] = (cl_long)( genrand_int32(d) | ( ( (cl_long)genrand_int32(d) & 0x7fffffffL ) << 16 ) );
}
|
|
|
|
// Entry point for the max test (an "extended" atomic): destination seeded
// with 0 so any generated (non-negative) value can win.
int test_atomic_max(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    TestFns set = { 0, 0, NULL, test_atomic_max_result_int, test_atomic_max_gen_int, NULL, test_atomic_max_result_long, test_atomic_max_gen_long, NULL };

    const int atomResult = test_atomic_function_set( deviceID, context, queue, atom_max_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false );
    if( atomResult != 0 )
        return -1;

    const int atomicResult = test_atomic_function_set( deviceID, context, queue, atomic_max_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true );
    if( atomicResult != 0 )
        return -1;

    return 0;
}
|
|
|
|
|
|
#pragma mark ---- inc
|
|
|
|
// Kernel cores for inc: each work-item increments the destination once and
// records the previous value.
const char atom_inc_core[] = " oldValues[tid] = atom_inc( &destMemory[0] );\n";

const char atomic_inc_core[] = " oldValues[tid] = atomic_inc( &destMemory[0] );\n";
|
|
|
|
// Host reference for atom(ic)_inc (int): one increment per work-item.
cl_int test_atomic_inc_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
    const cl_int increments = (cl_int)size;
    return INT_TEST_VALUE + increments;
}
|
|
|
|
// Host reference for atom_inc (long): one increment per work-item.
cl_long test_atomic_inc_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
    cl_long result = LONG_TEST_VALUE + size;
    return result;
}
|
|
|
|
// Entry point for the inc test: both API spellings, expected-value checked.
int test_atomic_inc(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_inc_result_int, NULL, NULL, test_atomic_inc_result_long, NULL, NULL };

    const int atomResult = test_atomic_function_set( deviceID, context, queue, atom_inc_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false );
    if( atomResult != 0 )
        return -1;

    const int atomicResult = test_atomic_function_set( deviceID, context, queue, atomic_inc_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true );
    if( atomicResult != 0 )
        return -1;

    return 0;
}
|
|
|
|
|
|
#pragma mark ---- dec
|
|
|
|
// Kernel cores for dec: each work-item decrements the destination once and
// records the previous value.
const char atom_dec_core[] = " oldValues[tid] = atom_dec( &destMemory[0] );\n";

const char atomic_dec_core[] = " oldValues[tid] = atomic_dec( &destMemory[0] );\n";
|
|
|
|
// Host reference for atom(ic)_dec (int): one decrement per work-item.
cl_int test_atomic_dec_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
    const cl_int decrements = (cl_int)size;
    return INT_TEST_VALUE - decrements;
}
|
|
|
|
// Host reference for atom_dec (long): one decrement per work-item.
cl_long test_atomic_dec_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
    cl_long result = LONG_TEST_VALUE - size;
    return result;
}
|
|
|
|
// Entry point for the dec tests: runs the shared function-set driver once with
// the atom_ prefix kernel and once with the atomic_ prefix kernel.
int test_atomic_dec(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_dec_result_int, NULL, NULL, test_atomic_dec_result_long, NULL, NULL };

    int failed = ( test_atomic_function_set( deviceID, context, queue, atom_dec_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 );
    if( !failed )
        failed = ( test_atomic_function_set( deviceID, context, queue, atomic_dec_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 );

    return failed ? -1 : 0;
}
|
|
|
|
|
|
#pragma mark ---- cmpxchg
|
|
|
|
/* We test cmpxchg by implementing (the long way) atom_add */
|
|
// 32-bit compare-exchange core (atom_ prefix): implements an atomic add of
// (tid + 2) via the classic cmpxchg retry loop, so the expected total matches
// test_atomic_cmpxchg_result_int.
const char atom_cmpxchg_core[] =
" int oldValue, origValue, newValue;\n"
" do { \n"
" origValue = destMemory[0];\n"
" newValue = origValue + tid + 2;\n"
// Retry until no other thread raced in between the read and the exchange.
" oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n"
" } while( oldValue != origValue );\n"
" oldValues[tid] = oldValue;\n"
;
|
|
|
|
// 64-bit variant of the compare-exchange core: same retry-loop atomic add of
// (tid + 2), but operating on long destinations.
const char atom_cmpxchg64_core[] =
" long oldValue, origValue, newValue;\n"
" do { \n"
" origValue = destMemory[0];\n"
" newValue = origValue + tid + 2;\n"
" oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n"
" } while( oldValue != origValue );\n"
" oldValues[tid] = oldValue;\n"
;
|
|
|
|
// 32-bit compare-exchange core using the core atomic_ prefix; otherwise
// identical to atom_cmpxchg_core (atomic add of tid + 2 via retry loop).
const char atomic_cmpxchg_core[] =
" int oldValue, origValue, newValue;\n"
" do { \n"
" origValue = destMemory[0];\n"
" newValue = origValue + tid + 2;\n"
" oldValue = atomic_cmpxchg( &destMemory[0], origValue, newValue );\n"
" } while( oldValue != origValue );\n"
" oldValues[tid] = oldValue;\n"
;
|
|
|
|
// Reference result for the 32-bit cmpxchg test: each thread tid effectively
// adds (tid + 2) to the start value, so accumulate the same series here.
cl_int test_atomic_cmpxchg_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
    cl_int expected = INT_TEST_VALUE;
    size_t tid = 0;
    while( tid < size )
    {
        expected += (cl_int)tid + 2;
        tid++;
    }
    return expected;
}
|
|
|
|
// Reference result for the 64-bit cmpxchg test: each thread tid effectively
// adds (tid + 2) to the start value, so accumulate the same series here.
cl_long test_atomic_cmpxchg_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
    cl_long expected = LONG_TEST_VALUE;
    size_t tid = 0;
    while( tid < size )
    {
        expected += tid + 2;
        tid++;
    }
    return expected;
}
|
|
|
|
// Entry point for the cmpxchg tests. Unlike the other atomics, cmpxchg is
// driven through test_atomic_function directly, enumerating each type
// (signed/unsigned, 32/64-bit) and memory variant explicitly.
// NOTE(review): the two bool arguments appear to select test configuration,
// with the second flipping between the two memory variants (false/true pairs
// per type) -- confirm against test_atomic_function's signature.
int test_atomic_cmpxchg(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_cmpxchg_result_int, NULL, NULL, test_atomic_cmpxchg_result_long, NULL, NULL };

    int errors = 0;

    // Extension-style atom_ prefix, 32-bit signed/unsigned in both variants.
    log_info(" Testing atom_ functions...\n");
    errors |= test_atomic_function( deviceID, context, queue, atom_cmpxchg_core, set, false, false, kInt, true );
    errors |= test_atomic_function( deviceID, context, queue, atom_cmpxchg_core, set, false, false, kUInt, true );
    errors |= test_atomic_function( deviceID, context, queue, atom_cmpxchg_core, set, false, true, kInt, true );
    errors |= test_atomic_function( deviceID, context, queue, atom_cmpxchg_core, set, false, true, kUInt, true );

    // 64-bit atom_cmpxchg, signed/unsigned in both variants.
    errors |= test_atomic_function( deviceID, context, queue, atom_cmpxchg64_core, set, false, false, kLong, true );
    errors |= test_atomic_function( deviceID, context, queue, atom_cmpxchg64_core, set, false, false, kULong, true );
    errors |= test_atomic_function( deviceID, context, queue, atom_cmpxchg64_core, set, false, true, kLong, true );
    errors |= test_atomic_function( deviceID, context, queue, atom_cmpxchg64_core, set, false, true, kULong, true );

    // Core atomic_ prefix is 32-bit only.
    log_info(" Testing atomic_ functions...\n");
    errors |= test_atomic_function( deviceID, context, queue, atomic_cmpxchg_core, set, false, false, kInt, true );
    errors |= test_atomic_function( deviceID, context, queue, atomic_cmpxchg_core, set, false, false, kUInt, true );
    errors |= test_atomic_function( deviceID, context, queue, atomic_cmpxchg_core, set, false, true, kInt, true );
    errors |= test_atomic_function( deviceID, context, queue, atomic_cmpxchg_core, set, false, true, kUInt, true );

    if( errors )
        return -1;

    return 0;
}
|
|
|
|
#pragma mark -------- Bitwise functions
|
|
|
|
// For the bitwise tests each destination value is touched by one thread per
// bit, so the number of destination values is threadCount divided by the bit
// width of the data type, rounded up.
size_t test_bitwise_num_results( size_t threadCount, ExplicitType dataType )
{
    const size_t bitsPerValue = 8 * get_explicit_type_size( dataType );
    return ( threadCount + bitsPerValue - 1 ) / bitsPerValue;
}
|
|
|
|
#pragma mark ---- and
|
|
|
|
// Kernel core (atom_ prefix): thread tid clears bit (tid % numBits) of
// destMemory[tid / numBits] via atom_and. The 1L literal keeps the mask wide
// enough for the 64-bit (long) variants of the test.
const char atom_and_core[] =
" size_t numBits = sizeof( destMemory[0] ) * 8;\n"
" int whichResult = tid / numBits;\n"
" int bitIndex = tid - ( whichResult * numBits );\n"
"\n"
" oldValues[tid] = atom_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n"
;
|
|
|
|
// Kernel core (core atomic_ prefix): identical to atom_and_core -- each thread
// clears one bit of its destination value with atomic_and.
const char atomic_and_core[] =
" size_t numBits = sizeof( destMemory[0] ) * 8;\n"
" int whichResult = tid / numBits;\n"
" int bitIndex = tid - ( whichResult * numBits );\n"
"\n"
" oldValues[tid] = atomic_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n"
;
|
|
|
|
|
|
// Reference result for the 32-bit and test: 32 consecutive threads each clear
// one bit of a destination value, so every destination except possibly the
// last ends up zero. startRefValues is unused (start value is all-ones).
cl_int test_atomic_and_result_int( size_t size, cl_int *startRefValues, size_t whichResult )
{
    size_t numThreads = ( (size_t)size + 31 ) / 32;
    if( whichResult < numThreads - 1 )
        return 0;

    // Last item doesn't get and'ed on every bit, so we have to mask away
    size_t numBits = (size_t)size - whichResult * 32;
    cl_int bits = (cl_int)0xffffffffL;
    for( size_t i = 0; i < numBits; i++ )
        bits &= ~( 1U << i ); // 1U: left-shifting signed 1 into bit 31 is UB

    return bits;
}
|
|
|
|
// Reference result for the 64-bit and test: 64 consecutive threads each clear
// one bit of a destination value, so every destination except possibly the
// last ends up zero. startRefValues is unused (start value is all-ones).
cl_long test_atomic_and_result_long( size_t size, cl_long *startRefValues, size_t whichResult )
{
    size_t numThreads = ( (size_t)size + 63 ) / 64;
    if( whichResult < numThreads - 1 )
        return 0;

    // Last item doesn't get and'ed on every bit, so we have to mask away
    size_t numBits = (size_t)size - whichResult * 64;
    cl_long bits = (cl_long)0xffffffffffffffffLL;
    for( size_t i = 0; i < numBits; i++ )
        bits &= ~( 1LL << i ); // was "1 << i": a 32-bit shift is UB for i >= 32
                               // and never clears the upper 32 bits

    return bits;
}
|
|
|
|
// Entry point for the and tests: destinations start all-ones and each thread
// clears one bit; one destination value is needed per type-width threads
// (test_bitwise_num_results).
int test_atomic_and(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    TestFns set = { 0xffffffff, 0xffffffffffffffffLL, test_bitwise_num_results,
        test_atomic_and_result_int, NULL, NULL, test_atomic_and_result_long, NULL, NULL };

    int failed = ( test_atomic_function_set( deviceID, context, queue, atom_and_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 );
    if( !failed )
        failed = ( test_atomic_function_set( deviceID, context, queue, atomic_and_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 );

    return failed ? -1 : 0;
}
|
|
|
|
|
|
#pragma mark ---- or
|
|
|
|
// Kernel core (atom_ prefix): thread tid sets bit (tid % numBits) of
// destMemory[tid / numBits] via atom_or. The 1L literal keeps the mask wide
// enough for the 64-bit (long) variants of the test.
const char atom_or_core[] =
" size_t numBits = sizeof( destMemory[0] ) * 8;\n"
" int whichResult = tid / numBits;\n"
" int bitIndex = tid - ( whichResult * numBits );\n"
"\n"
" oldValues[tid] = atom_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n"
;
|
|
|
|
// Kernel core (core atomic_ prefix): identical to atom_or_core -- each thread
// sets one bit of its destination value with atomic_or.
const char atomic_or_core[] =
" size_t numBits = sizeof( destMemory[0] ) * 8;\n"
" int whichResult = tid / numBits;\n"
" int bitIndex = tid - ( whichResult * numBits );\n"
"\n"
" oldValues[tid] = atomic_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n"
;
|
|
|
|
// Reference result for the 32-bit or test: 32 consecutive threads each set one
// bit of a destination value, so every destination except possibly the last
// ends up all-ones. startRefValues is unused (start value is zero).
cl_int test_atomic_or_result_int( size_t size, cl_int *startRefValues, size_t whichResult )
{
    size_t numThreads = ( (size_t)size + 31 ) / 32;
    if( whichResult < numThreads - 1 )
        return 0xffffffff;

    // Last item doesn't get or'ed on every bit, so we have to mask away
    size_t numBits = (size_t)size - whichResult * 32;
    cl_int bits = 0;
    for( size_t i = 0; i < numBits; i++ )
        bits |= ( 1U << i ); // 1U: left-shifting signed 1 into bit 31 is UB

    return bits;
}
|
|
|
|
// Reference result for the 64-bit or test: 64 consecutive threads each set one
// bit of a destination value, so every destination except possibly the last
// ends up all-ones. startRefValues is unused (start value is zero).
cl_long test_atomic_or_result_long( size_t size, cl_long *startRefValues, size_t whichResult )
{
    size_t numThreads = ( (size_t)size + 63 ) / 64;
    if( whichResult < numThreads - 1 )
        return 0x0ffffffffffffffffLL;

    // Only the remaining threads contribute bits to the final value.
    size_t numBits = (size_t)size - whichResult * 64;
    cl_long expected = 0;
    size_t bit = 0;
    while( bit < numBits )
    {
        expected |= ( 1LL << bit );
        bit++;
    }
    return expected;
}
|
|
|
|
// Entry point for the or tests: destinations start at zero and each thread
// sets one bit; one destination value per type-width threads
// (test_bitwise_num_results).
int test_atomic_or(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    TestFns set = { 0, 0LL, test_bitwise_num_results, test_atomic_or_result_int, NULL, NULL, test_atomic_or_result_long, NULL, NULL };

    int failed = ( test_atomic_function_set( deviceID, context, queue, atom_or_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 );
    if( !failed )
        failed = ( test_atomic_function_set( deviceID, context, queue, atomic_or_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 );

    return failed ? -1 : 0;
}
|
|
|
|
|
|
#pragma mark ---- xor
|
|
|
|
// Kernel core (atom_ prefix): every thread targets the single destination
// destMemory[0], toggling bit (tid % numBits) with atom_xor.
const char atom_xor_core[] =
" size_t numBits = sizeof( destMemory[0] ) * 8;\n"
" int bitIndex = tid & ( numBits - 1 );\n"
"\n"
" oldValues[tid] = atom_xor( &destMemory[0], 1 << bitIndex );\n"
;
|
|
|
|
// Kernel core (core atomic_ prefix): identical to atom_xor_core -- toggles bit
// (tid % numBits) of destMemory[0] with atomic_xor.
const char atomic_xor_core[] =
" size_t numBits = sizeof( destMemory[0] ) * 8;\n"
" int bitIndex = tid & ( numBits - 1 );\n"
"\n"
" oldValues[tid] = atomic_xor( &destMemory[0], 1 << bitIndex );\n"
;
|
|
|
|
// Reference result for the 32-bit xor test: starting from the seed value, each
// thread toggles bit (tid % 32) of the single destination exactly once.
// startRefValues/whichResult are unused.
cl_int test_atomic_xor_result_int( size_t size, cl_int *startRefValues, size_t whichResult )
{
    cl_int total = 0x2f08ab41;
    for( size_t i = 0; i < size; i++ )
        total ^= ( 1U << ( i & 31 ) ); // 1U: shifting signed 1 into bit 31 is UB
    return total;
}
|
|
|
|
// Reference result for the 64-bit xor test: starting from the seed value, each
// thread toggles bit (tid % 64) of the single destination exactly once.
// startRefValues/whichResult are unused.
cl_long test_atomic_xor_result_long( size_t size, cl_long *startRefValues, size_t whichResult )
{
    cl_long expected = 0x2f08ab418ba0541LL;
    size_t tid = 0;
    while( tid < size )
    {
        expected ^= ( 1LL << ( tid & 63 ) );
        tid++;
    }
    return expected;
}
|
|
|
|
// Entry point for the xor tests: single destination seeded with a fixed
// pattern; each thread flips one bit of it.
int test_atomic_xor(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    TestFns set = { 0x2f08ab41, 0x2f08ab418ba0541LL, NULL, test_atomic_xor_result_int, NULL, NULL, test_atomic_xor_result_long, NULL, NULL };

    int failed = ( test_atomic_function_set( deviceID, context, queue, atom_xor_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 );
    if( !failed )
        failed = ( test_atomic_function_set( deviceID, context, queue, atomic_xor_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 );

    return failed ? -1 : 0;
}
|
|
|
|
|
|
|
|
|