//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "../../test_common/harness/compat.h"
#include "../../test_common/harness/rounding_mode.h"
#include "../../test_common/harness/ThreadPool.h"
#include "../../test_common/harness/parseParameters.h"
#if defined (_WIN32)
#define MAX(x,y) ((x>y)?x:y);
#define MIN(x,y) ((x<y)?x:y);
#endif
#if !defined(_WIN32)
#include <sys/sysctl.h>
#endif

#if defined( __linux__ )
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/sysctl.h>
#endif
#if defined(__linux__)
#include <sys/param.h>
#include <libgen.h>
#endif

#include "mingw_compat.h"
#if defined(__MINGW32__)
#include <sys/param.h>
#endif

#include <stdarg.h>
#include <stdio.h>
#if !defined(_WIN32)
#include <stdint.h>
#endif

#include <stdlib.h>
#include <math.h>
#include <string.h>
#if !defined(_WIN32)
#include <libgen.h>
#include <sys/mman.h>
#endif
#include <time.h>

#include "Sleep.h"
#include "basic_test_conversions.h"

#pragma STDC FENV_ACCESS on
#if !(defined(_WIN32) && defined(_MSC_VER))
#include <fenv.h>
#endif

#if (defined(_WIN32) && defined (_MSC_VER))
// need for _controlfp_s and rouinding modes in RoundingMode
#include <float.h>
#include "../../test_common/harness/testHarness.h"
#endif

#pragma mark -
#pragma mark globals

#define BUFFER_SIZE     (1024*1024)
#define kPageSize       4096
#define PERF_LOOP_COUNT 100

#define      kCallStyleCount (kVectorSizeCount + 1 /* for implicit scalar */)

const char **   argList = NULL;
int             argCount = 0;
cl_device_type  gDeviceType = CL_DEVICE_TYPE_DEFAULT;
cl_device_id    gDevice = NULL;
cl_context      gContext = NULL;
cl_command_queue      gQueue = NULL;
char            appName[64] = "ctest";
int             gTestCount = 0;
int             gFailCount = 0;
int             gStartTestNumber = -1;
int             gEndTestNumber = 0;
#if defined( __APPLE__ )
int             gTimeResults = 1;
#else
int             gTimeResults = 0;
#endif
int             gReportAverageTimes = 0;
void            *gIn = NULL;
void            *gRef = NULL;
void        *gAllowZ = NULL;
void            *gOut[ kCallStyleCount ] = { NULL };
cl_mem          gInBuffer;
cl_mem          gOutBuffers[ kCallStyleCount ];
size_t          gComputeDevices = 0;
uint32_t        gDeviceFrequency = 0;
int             gWimpyMode = 0;
int             gWimpyReductionFactor = 128;
int             gSkipTesting = 0;
int             gForceFTZ = 0;
int             gMultithread = 1;
int             gIsRTZ = 0;
uint32_t        gSimdSize = 1;
int             gHasDouble = 0;
int                   gIsEmbedded = 0;
int             gHasLong = 1;
int             gTestDouble = 1;
cl_uint         choosen_device_index = 0;
const char *    sizeNames[] = { "", "", "2", "3", "4", "8", "16" };
const int       vectorSizes[] = { 1, 1, 2, 3, 4, 8, 16 };
int             gMinVectorSize = 0;
int             gMaxVectorSize = sizeof(vectorSizes) / sizeof( vectorSizes[0] );

#pragma mark -
#pragma mark Declarations

static int ParseArgs( int argc, const char **argv );
static void PrintUsage( void );
static void PrintArch(void);
static int InitCL( void );
static int GetTestCase( const char *name, Type *outType, Type *inType, SaturationMode *sat, RoundingMode *round );
static int DoTest( Type outType, Type inType, SaturationMode sat, RoundingMode round, MTdata d );
static cl_program   MakeProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, cl_kernel *outKernel );
static int RunKernel( cl_kernel kernel, void *inBuf, void *outBuf, size_t blockCount );

void *FlushToZero( void );
void UnFlushToZero( void *);

static cl_program CreateImplicitConvertProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, char testName[256], cl_int *error );
static cl_program CreateStandardProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, char testName[256], cl_int *error );


// Windows (since long double got deprecated) sets the x87 to 53-bit precision
// (that's x87 default state).  This causes problems with the tests that
// convert long and ulong to float and double or otherwise deal with values
// that need more precision than 53-bit. So, set the x87 to 64-bit precision.
static inline void Force64BitFPUPrecision(void)
{
#if __MINGW32__
    // The usual method is to use _controlfp as follows:
    //     #include <float.h>
    //     _controlfp(_PC_64, _MCW_PC);
    //
    // _controlfp is available on MinGW32 but not on MinGW64. Instead of having
    // divergent code just use inline assembly which works for both.
    unsigned short int orig_cw = 0;
    unsigned short int new_cw = 0;
    __asm__ __volatile__ ("fstcw %0":"=m" (orig_cw));
    new_cw = orig_cw | 0x0300;   // set precision to 64-bit
    __asm__ __volatile__ ("fldcw  %0"::"m" (new_cw));
#else
    /* Implement for other platforms if needed */
#endif
}


#pragma mark -

int main (int argc, const char **argv )
{
    int error, i, testNumber = -1;
    Type inType, outType;
    RoundingMode round;
    SaturationMode sat;
    MTdata  d = NULL;
    cl_uint seed = (cl_uint) clock();

    test_start();

    if( (error = ParseArgs( argc, argv )) )
        return error;

    //Turn off sleep so our tests run to completion
    PreventSleep();
    atexit( ResumeSleep );

    // Init CL data structures
    if( (error = InitCL()) )
        return error;

#if defined(_MSC_VER) && defined(_M_IX86)
    // VS2005 (and probably others, since long double got deprecated) sets
    // the x87 to 53-bit precision. This causes problems with the tests
    // that convert long and ulong to float and double, since they deal
    // with values that need more precision than that. So, set the x87
    // to 64-bit precision.
    unsigned int ignored;
    _controlfp_s(&ignored, _PC_64, _MCW_PC);
#endif

    vlog( "===========================================================\n" );
    vlog( "Random seed: %u\n", seed );
    d = init_genrand( seed );
    int startMinVectorSize = gMinVectorSize;
    if( argCount )
    {
        for( i = 0; i < argCount; i++ )
        {
            if( GetTestCase( argList[i], &outType, &inType, &sat, &round ) )
            {
                vlog_error( "\n\t\t**** ERROR:  Unable to parse function name %s.  Skipping....  *****\n\n", argList[i] );
                continue;
            }

            // skip double if we don't have it
            if( !gTestDouble && (inType == kdouble || outType == kdouble ) )
            {
                if( gHasDouble )
                {
                    vlog_error( "\t *** convert_%sn%s%s( %sn ) FAILED ** \n", gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
                    vlog( "\t\tcl_khr_fp64 enabled, but double testing turned off.\n" );
                }

                continue;
            }

            // skip longs on embedded
            if( ! gHasLong &&
               (inType == klong || outType == klong || inType == kulong || outType == kulong))
                continue;

            // Skip the implicit converts if the rounding mode is not default or test is saturated
            if( 0 == startMinVectorSize )
            {
                if( sat || round != kDefaultRoundingMode )
                    gMinVectorSize = 1;
                else
                    gMinVectorSize = 0;
            }

            if( (error = DoTest( outType, inType, sat, round, d )) )
                vlog_error( "\t *** convert_%sn%s%s( %sn ) FAILED ** \n", gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
        }
    }
    else
    {

        for( outType = (Type)0; outType < kTypeCount; outType = (Type)(outType+1) )
        {
            for( inType = (Type)0; inType < kTypeCount; inType = (Type)(inType+1) )
            {
                // skip longs on embedded
                if( ! gHasLong &&
                   (inType == klong || outType == klong || inType == kulong || outType == kulong))
                    continue;

                for( sat = (SaturationMode)0; sat < kSaturationModeCount; sat = (SaturationMode)(sat+1) )
                {
                    //skip illegal saturated conversions to float type
                    if( kSaturated == sat && ( outType == kfloat || outType == kdouble ) )
                        continue;

                    for( round = (RoundingMode)0; round < kRoundingModeCount; round = (RoundingMode)(round+1) )
                    {
                        if( ++testNumber < gStartTestNumber )
                        {
                            //     vlog( "%d) skipping convert_%sn%s%s( %sn )\n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
                            continue;
                        }
                        else
                            if( gEndTestNumber > 0 && testNumber >= gEndTestNumber  )
                                goto exit;

                        vlog( "%d) Testing convert_%sn%s%s( %sn ):\n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );

                        // skip double if we don't have it
                        if( ! gTestDouble && (inType == kdouble || outType == kdouble ) )
                        {
                            if( gHasDouble )
                            {
                                vlog_error( "\t *** %d) convert_%sn%s%s( %sn ) FAILED ** \n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
                                vlog( "\t\tcl_khr_fp64 enabled, but double testing turned off.\n" );
                            }
                            continue;
                        }

                        // Skip the implicit converts if the rounding mode is not default or test is saturated
                        if( 0 == startMinVectorSize )
                        {
                            if( sat || round != kDefaultRoundingMode )
                                gMinVectorSize = 1;
                            else
                                gMinVectorSize = 0;
                        }

                        if( (error = DoTest( outType, inType, sat, round, d) ) )
                            vlog_error( "\t *** %d) convert_%sn%s%s( %sn ) FAILED ** \n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
                    }
                }
            }
        }
    }

exit:
    free_mtdata(d);
    vlog( "\n\n" );
    vlog( "Tests completed: %d\n", gTestCount );

    error = clFinish(gQueue);
    if (error)
        vlog_error("clFinish failed: %d\n", error);

    if (gFailCount == 0 && gTestCount > 0) {
        vlog("PASSED %d of %d tests.\n", gTestCount, gTestCount);
    } else if (gFailCount > 0) {
        vlog_error("FAILED %d of %d tests.\n", gFailCount, gTestCount);
    }

    clReleaseMemObject(gInBuffer);

    for( i = 0; i < kCallStyleCount; i++ ) {
        clReleaseMemObject(gOutBuffers[i]);
    }
    clReleaseCommandQueue(gQueue);
    clReleaseContext(gContext);

    test_finish();
    if (gFailCount > 0)
        return -1;

    return 0;
}

#pragma mark -
#pragma mark setup

static int ParseArgs( int argc, const char **argv )
{
    int i;
    argList = (const char **)calloc( argc - 1, sizeof( char*) );
    argCount = 0;

    if( NULL == argList && argc > 1 )
        return -1;

#if (defined( __APPLE__ ) || defined(__linux__) || defined (__MINGW32__))
    { // Extract the app name
        char baseName[ MAXPATHLEN ];
        strncpy( baseName, argv[0], MAXPATHLEN );
        char *base = basename( baseName );
        if( NULL != base )
        {
            strncpy( appName, base, sizeof( appName )  );
            appName[ sizeof( appName ) -1 ] = '\0';
        }
    }
#elif defined (_WIN32)
    {
        char fname[_MAX_FNAME + _MAX_EXT + 1];
        char ext[_MAX_EXT];

        errno_t err = _splitpath_s( argv[0], NULL, 0, NULL, 0,
                                   fname, _MAX_FNAME, ext, _MAX_EXT );
        if (err == 0) { // no error
            strcat (fname, ext); //just cat them, size of frame can keep both
            strncpy (appName, fname, sizeof(appName));
            appName[ sizeof( appName ) -1 ] = '\0';
        }
    }
#endif

    /* Check for environment variable to set device type */
    char *env_mode = getenv( "CL_DEVICE_TYPE" );
    if( env_mode != NULL )
    {
        vlog( "CL_DEVICE_TYPE: %s\n", env_mode );
        if( strcmp( env_mode, "gpu" ) == 0 || strcmp( env_mode, "CL_DEVICE_TYPE_GPU" ) == 0 )
            gDeviceType = CL_DEVICE_TYPE_GPU;
        else if( strcmp( env_mode, "cpu" ) == 0 || strcmp( env_mode, "CL_DEVICE_TYPE_CPU" ) == 0 )
            gDeviceType = CL_DEVICE_TYPE_CPU;
        else if( strcmp( env_mode, "accelerator" ) == 0 || strcmp( env_mode, "CL_DEVICE_TYPE_ACCELERATOR" ) == 0 )
            gDeviceType = CL_DEVICE_TYPE_ACCELERATOR;
        else if( strcmp( env_mode, "default" ) == 0 || strcmp( env_mode, "CL_DEVICE_TYPE_DEFAULT" ) == 0 )
            gDeviceType = CL_DEVICE_TYPE_DEFAULT;
        else
        {
            vlog_error( "Unknown CL_DEVICE_TYPE env variable setting: %s.\nAborting...\n", env_mode );
            abort();
        }
    }


    vlog( "\n%s", appName );
    for( i = 1; i < argc; i++ )
    {
        const char *arg = argv[i];
        if( NULL == arg )
            break;

        vlog( "\t%s", arg );
        if( arg[0] == '-' )
        {
            arg++;
            while( *arg != '\0' )
            {
                switch( *arg )
                {
                    case 'd':
                        gTestDouble ^= 1;
                        break;
                    case 'l':
                        gSkipTesting ^= 1;
                        break;
                    case 'm':
                        gMultithread ^= 1;
                        break;
                    case 'w':
                        gWimpyMode ^= 1;
                        break;
                    case '[':
                        parseWimpyReductionFactor(arg, gWimpyReductionFactor);
                        break;
                    case 'z':
                        gForceFTZ ^= 1;
                        break;
                    case 't':
                        gTimeResults ^= 1;
                        break;
                    case 'a':
                        gReportAverageTimes ^= 1;
                        break;
                    case '1':
                        if( arg[1] == '6' )
                        {
                            gMinVectorSize = 6;
                            gMaxVectorSize = 7;
                            arg++;
                        }
                        else
                        {
                            gMinVectorSize = 0;
                            gMaxVectorSize = 2;
                        }
                        break;

                    case '2':
                        gMinVectorSize = 2;
                        gMaxVectorSize = 3;
                        break;

                    case '3':
                        gMinVectorSize = 3;
                        gMaxVectorSize = 4;
                        break;

                    case '4':
                        gMinVectorSize = 4;
                        gMaxVectorSize = 5;
                        break;

                    case '8':
                        gMinVectorSize = 5;
                        gMaxVectorSize = 6;
                        break;

                    default:
                        vlog( " <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg );
                        PrintUsage();
                        return -1;
                }
                arg++;
            }
        }
        // Check if a particular device id was requested
        else if (strlen(argv[i]) >= 3 && argv[i][0] == 'i' && argv[i][1] =='d')
        {
            choosen_device_index = atoi(&(argv[i][2]));
        }
        else
        {
            char *t = NULL;
            long number = strtol( arg, &t, 0 );
            if( t != arg )
            {
                if( gStartTestNumber != -1 )
                    gEndTestNumber = gStartTestNumber + (int) number;
                else
                    gStartTestNumber = (int) number;
            }
            else if( 0 == strcmp( arg, "CL_DEVICE_TYPE_CPU"))
                gDeviceType = CL_DEVICE_TYPE_CPU;
            else if( 0 == strcmp( arg, "CL_DEVICE_TYPE_GPU"))
                gDeviceType = CL_DEVICE_TYPE_GPU;
            else if( 0 == strcmp( arg, "CL_DEVICE_TYPE_ACCELERATOR"))
                gDeviceType = CL_DEVICE_TYPE_ACCELERATOR;
            else if( 0 == strcmp( arg, "CL_DEVICE_TYPE_DEFAULT"))
                gDeviceType = CL_DEVICE_TYPE_DEFAULT;
            else
            {
                argList[ argCount ] = arg;
                argCount++;
            }
        }
    }

    vlog( "\n" );

    vlog( "Test binary built %s %s\n", __DATE__, __TIME__ );

    PrintArch();

    if( gWimpyMode )
    {
        vlog( "\n" );
        vlog( "*** WARNING: Testing in Wimpy mode!                     ***\n" );
        vlog( "*** Wimpy mode is not sufficient to verify correctness. ***\n" );
        vlog( "*** It gives warm fuzzy feelings and then nevers calls. ***\n\n" );
        vlog("*** Wimpy Reduction Factor: %-27u ***\n\n", gWimpyReductionFactor);
    }

    return 0;
}

static void PrintUsage( void )
{
    int i;
    vlog( "%s [-wz#]: <optional: test names>\n", appName );
    vlog( "\ttest names:\n" );
    vlog( "\t\tdestFormat<_sat><_round>_sourceFormat\n" );
    vlog( "\t\t\tPossible format types are:\n\t\t\t\t" );
    for( i = 0; i < kTypeCount; i++ )
        vlog( "%s, ", gTypeNames[i] );
    vlog( "\n\n\t\t\tPossible saturation values are: (empty) and _sat\n" );
    vlog( "\t\t\tPossible rounding values are:\n\t\t\t\t(empty), " );
    for( i = 1; i < kRoundingModeCount; i++ )
        vlog( "%s, ", gRoundingModeNames[i] );
    vlog( "\n\t\t\tExamples:\n" );
    vlog( "\t\t\t\tulong_short   converts short to ulong\n" );
    vlog( "\t\t\t\tchar_sat_rte_float   converts float to char with saturated clipping in round to nearest rounding mode\n\n" );
    vlog( "\toptions:\n" );
    vlog( "\t\t-d\tToggle testing of double precision.  On by default if cl_khr_fp64 is enabled, ignored otherwise.\n" );
    vlog( "\t\t-l\tToggle link check mode. When on, testing is skipped, and we just check to see that the kernels build. (Off by default.)\n" );
    vlog( "\t\t-m\tToggle Multithreading. (On by default.)\n" );
    vlog( "\t\t-w\tToggle wimpy mode. When wimpy mode is on, we run a very small subset of the tests for each fn. NOT A VALID TEST! (Off by default.)\n" );
    vlog(" \t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is 1-12, default factor(%u)\n", gWimpyReductionFactor);
    vlog( "\t\t-z\tToggle flush to zero mode  (Default: per device)\n" );
    vlog( "\t\t-#\tTest just vector size given by #, where # is an element of the set {1,2,3,4,8,16}\n" );
    vlog( "\n" );
    vlog( "You may also pass the number of the test on which to start.\nA second number can be then passed to indicate how many tests to run\n\n" );
}

static void PrintArch( void )
{
    vlog( "sizeof( void*) = %ld\n", sizeof( void *) );
#if defined( __ppc__ )
    vlog( "ARCH:\tppc\n" );
#elif defined( __ppc64__ )
    vlog( "ARCH:\tppc64\n" );
#elif defined( __PPC__ )
    vlog( "ARCH:\tppc\n" );
#elif defined( __i386__ )
    vlog( "ARCH:\ti386\n" );
#elif defined( __x86_64__ )
    vlog( "ARCH:\tx86_64\n" );
#elif defined( __arm__ )
    vlog( "ARCH:\tarm\n" );
#elif defined( __aarch64__ )
    vlog( "ARCH:\taarch64\n" );
#elif defined (_WIN32)
    vlog( "ARCH:\tWindows\n" );
#else
#error unknown arch
#endif

#if defined( __APPLE__ )

    int type = 0;
    size_t typeSize = sizeof( type );
    sysctlbyname( "hw.cputype", &type, &typeSize, NULL, 0 );
    vlog( "cpu type:\t%d\n", type );
    typeSize = sizeof( type );
    sysctlbyname( "hw.cpusubtype", &type, &typeSize, NULL, 0 );
    vlog( "cpu subtype:\t%d\n", type );

#elif defined( __linux__ )
#define OSNAMESZ 100
    int _sysctl(struct __sysctl_args *args );

    struct __sysctl_args args;
    char osname[OSNAMESZ];
    size_t osnamelth;
    int name[] = { CTL_KERN, KERN_OSTYPE };
    memset(&args, 0, sizeof(struct __sysctl_args));
    args.name = name;
    args.nlen = sizeof(name)/sizeof(name[0]);
    args.oldval = osname;
    args.oldlenp = &osnamelth;

    osnamelth = sizeof(osname);

    if (syscall(SYS__sysctl, &args) == -1) {
        vlog( "_sysctl error\n" );
    }
    else {
        vlog("this machine is running %*s\n", osnamelth, osname);
    }

#endif
}


static int GetTestCase( const char *name, Type *outType, Type *inType, SaturationMode *sat, RoundingMode *round )
{
    int i;

    //Find the return type
    for( i = 0; i < kTypeCount; i++ )
        if( name == strstr( name, gTypeNames[i] ) )
        {
            *outType = (Type)i;
            name += strlen( gTypeNames[i] );

            break;
        }

    if( i == kTypeCount )
        return -1;

    // Check to see if _sat appears next
    *sat = (SaturationMode)0;
    for( i = 1; i < kSaturationModeCount; i++ )
        if( name == strstr( name, gSaturationNames[i] ) )
        {
            *sat = (SaturationMode)i;
            name += strlen( gSaturationNames[i] );
            break;
        }

    *round = (RoundingMode)0;
    for( i = 1; i < kRoundingModeCount; i++ )
        if( name == strstr( name, gRoundingModeNames[i] ) )
        {
            *round = (RoundingMode)i;
            name += strlen( gRoundingModeNames[i] );
            break;
        }

    if( *name != '_' )
        return -2;
    name++;

    for( i = 0; i < kTypeCount; i++ )
        if( name == strstr( name, gTypeNames[i] ) )
        {
            *inType = (Type)i;
            name += strlen( gTypeNames[i] );

            break;
        }

    if( i == kTypeCount )
        return -3;

    if( *name != '\0' )
        return -4;

    return 0;
}

#pragma mark -
#pragma mark OpenCL

static void CL_CALLBACK notify_callback(const char *errinfo, const void *private_info, size_t cb, void *user_data)
{
    vlog( "%s\n", errinfo );
}

static int InitCL( void )
{
    int error, i;
    size_t configSize = sizeof( gComputeDevices );

    cl_platform_id     platform = NULL;
    cl_uint            num_devices = 0;
    cl_device_id       *devices = NULL;

    /* Get the platform */
    error = clGetPlatformIDs(1, &platform, NULL);
    if (error) {
        vlog_error( "clGetPlatformIDs failed: %d\n", error );
        return error;
    }

    /* Get the number of requested devices */
    error = clGetDeviceIDs(platform,  gDeviceType, 0, NULL, &num_devices );
    if (error) {
        vlog_error( "clGetDeviceIDs failed: %d\n", error );
        return error;
    }

    devices = (cl_device_id *) malloc( num_devices * sizeof( cl_device_id ) );
    if (!devices || choosen_device_index >= num_devices) {
        vlog_error( "device index out of range -- choosen_device_index (%d) >= num_devices (%d)\n", choosen_device_index, num_devices );
        return -1;
    }

    /* Get the requested device */
    error = clGetDeviceIDs(platform,  gDeviceType, num_devices, devices, NULL );
    if (error) {
        vlog_error( "clGetDeviceIDs failed: %d\n", error );
        return error;
    }

    gDevice = devices[choosen_device_index];
    free(devices);
    devices = NULL;

    if( (error = clGetDeviceInfo( gDevice, CL_DEVICE_MAX_COMPUTE_UNITS, configSize, &gComputeDevices, NULL )) )
        gComputeDevices = 1;

    configSize = sizeof( gDeviceFrequency );
    if( (error = clGetDeviceInfo( gDevice, CL_DEVICE_MAX_CLOCK_FREQUENCY, configSize, &gDeviceFrequency, NULL )) )
        gDeviceFrequency = 0;

    cl_device_fp_config floatCapabilities = 0;
    if( (error = clGetDeviceInfo(gDevice, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(floatCapabilities), &floatCapabilities,  NULL)))
        floatCapabilities = 0;
    if(0 == (CL_FP_DENORM & floatCapabilities) )
        gForceFTZ ^= 1;

    if( 0 == (floatCapabilities & CL_FP_ROUND_TO_NEAREST ) )
    {
        char profileStr[128] = "";
        // Verify that we are an embedded profile device
        if( (error = clGetDeviceInfo( gDevice, CL_DEVICE_PROFILE, sizeof( profileStr ), profileStr, NULL ) ) )
        {
            vlog_error( "FAILURE: Could not get device profile: error %d\n", error );
            return -1;
        }

        if( strcmp( profileStr, "EMBEDDED_PROFILE" ) )
        {
            vlog_error( "FAILURE: non-embedded profile device does not support CL_FP_ROUND_TO_NEAREST\n" );
            return -1;
        }

        if( 0 == (floatCapabilities & CL_FP_ROUND_TO_ZERO ) )
        {
            vlog_error( "FAILURE: embedded profile device supports neither CL_FP_ROUND_TO_NEAREST or CL_FP_ROUND_TO_ZERO\n" );
            return -1;
        }

        gIsRTZ = 1;
    }

    char extensions[2048] = "";
    if( (error = clGetDeviceInfo( gDevice, CL_DEVICE_EXTENSIONS, sizeof( extensions ), extensions,  NULL ) ) )
    {
        vlog_error( "FAILURE: unable to get device info for CL_DEVICE_EXTENSIONS!" );
        return -1;
    }
    else if( strstr( extensions, "cl_khr_fp64" ) )
    {
        gHasDouble = 1;
    }
    gTestDouble &= gHasDouble;

    //detect whether profile of the device is embedded
    char profile[1024] = "";
    if( (error = clGetDeviceInfo( gDevice, CL_DEVICE_PROFILE, sizeof(profile), profile, NULL ) ) ){}
    else if( strstr(profile, "EMBEDDED_PROFILE" ) )
    {
        gIsEmbedded = 1;
        if( !strstr( extensions, "cles_khr_int64" ) )
            gHasLong = 0;
    }


    gContext = clCreateContext( NULL, 1, &gDevice, notify_callback, NULL, &error );
    if( NULL == gDevice || error )
    {
        vlog_error( "clCreateContext failed. (%d)\n", error );
        return error;
    }

    gQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
    if( NULL == gQueue || error )
    {
        vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
        return error;
    }

    //Allocate buffers
    //FIXME: use clProtectedArray for guarded allocations?
    gIn   = malloc( BUFFER_SIZE + 2 * kPageSize );
    gAllowZ = malloc( BUFFER_SIZE + 2 * kPageSize );
    gRef  = malloc( BUFFER_SIZE + 2 * kPageSize );
    for( i = 0; i < kCallStyleCount; i++ )
    {
        gOut[i] = malloc( BUFFER_SIZE + 2 * kPageSize );
        if( NULL == gOut[i] )
            return -3;
    }

    // setup input buffers
    gInBuffer = clCreateBuffer(gContext, CL_MEM_READ_ONLY, BUFFER_SIZE, NULL, &error);
    if( gInBuffer == NULL || error)
    {
        vlog_error( "clCreateBuffer failed for input (%d)\n", error );
        return error;
    }

    // setup output buffers
    for( i = 0; i < kCallStyleCount; i++ )
    {
        gOutBuffers[i] = clCreateBuffer(  gContext, CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, &error );
        if( gOutBuffers[i] == NULL || error )
        {
            vlog_error( "clCreateArray failed for output (%d)\n", error );
            return error;
        }
    }

#if defined( __APPLE__ )

#if defined( __i386__ ) || defined( __x86_64__ )
#define    kHasSSE3                0x00000008
#define kHasSupplementalSSE3    0x00000100
#define    kHasSSE4_1              0x00000400
#define    kHasSSE4_2              0x00000800
    /* check our environment for a hint to disable SSE variants */
    {
        const char *env = getenv( "CL_MAX_SSE" );
        if( env )
        {
            extern int _cpu_capabilities;
            int mask = 0;
            if( 0 == strcasecmp( env, "SSE4.1" ) )
                mask = kHasSSE4_2;
            else if( 0 == strcasecmp( env, "SSSE3" ) )
                mask = kHasSSE4_2 | kHasSSE4_1;
            else if( 0 == strcasecmp( env, "SSE3" ) )
                mask = kHasSSE4_2 | kHasSSE4_1 | kHasSupplementalSSE3;
            else if( 0 == strcasecmp( env, "SSE2" ) )
                mask = kHasSSE4_2 | kHasSSE4_1 | kHasSupplementalSSE3 | kHasSSE3;
            else
            {
                vlog_error( "Error: Unknown CL_MAX_SSE setting: %s\n", env );
                return -2;
            }

            vlog( "*** Environment: CL_MAX_SSE = %s ***\n", env );
            _cpu_capabilities &= ~mask;
        }
    }
#endif
#endif


    char c[1024];
    static const char *no_yes[] = { "NO", "YES" };
    vlog( "\nCompute Device info:\n" );
    clGetDeviceInfo(gDevice, CL_DEVICE_NAME, sizeof(c), c, NULL);
    vlog( "\tDevice Name: %s\n", c );
    clGetDeviceInfo(gDevice, CL_DEVICE_VENDOR, sizeof(c), c, NULL);
    vlog( "\tVendor: %s\n", c );
    clGetDeviceInfo(gDevice, CL_DEVICE_VERSION, sizeof(c), c, NULL);
    vlog( "\tDevice Version: %s\n", c );
    clGetDeviceInfo(gDevice, CL_DEVICE_OPENCL_C_VERSION, sizeof(c), &c, NULL);
    vlog( "\tCL C Version: %s\n", c );
    clGetDeviceInfo(gDevice, CL_DRIVER_VERSION, sizeof(c), c, NULL);
    vlog( "\tDriver Version: %s\n", c );
    vlog( "\tProcessing with %ld devices\n", gComputeDevices );
    vlog( "\tDevice Frequency: %d MHz\n", gDeviceFrequency );
    vlog( "\tSubnormal values supported for floats? %s\n", no_yes[0 != (CL_FP_DENORM & floatCapabilities)] );
    vlog( "\tTesting with FTZ mode ON for floats? %s\n", no_yes[0 != gForceFTZ] );
    vlog( "\tTesting with default RTZ mode for floats? %s\n", no_yes[0 != gIsRTZ] );
    vlog( "\tHas Double? %s\n", no_yes[0 != gHasDouble] );
    if( gHasDouble )
        vlog( "\tTest Double? %s\n", no_yes[0 != gTestDouble] );
    vlog( "\tHas Long? %s\n", no_yes[0 != gHasLong] );
    vlog( "\tTesting vector sizes: " );
    for( i = gMinVectorSize; i < gMaxVectorSize; i++ )
        vlog("\t%d", vectorSizes[i]);
    vlog( "\n" );
    return 0;
}

static int RunKernel( cl_kernel kernel, void *inBuf, void *outBuf, size_t blockCount )
{
    // The global dimensions are just the blockCount to execute since we haven't set up multiple queues for multiple devices.
    int error;

    error = clSetKernelArg(kernel, 0, sizeof( inBuf ), &inBuf);
    error |= clSetKernelArg(kernel, 1, sizeof(outBuf), &outBuf);

    if( error )
    {
        vlog_error( "FAILED -- could not set kernel args (%d)\n", error );
        return error;
    }

    if( (error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &blockCount, NULL, 0, NULL, NULL)))
    {
        vlog_error( "FAILED -- could not execute kernel (%d)\n", error );
        return error;
    }

    return 0;
}

#if ! defined( __APPLE__ )
static void memset_pattern4(void *dest, const void *src_pattern, size_t bytes );
static void memset_pattern4(void *dest, const void *src_pattern, size_t bytes )
{
    uint32_t pat = ((uint32_t*) src_pattern)[0];
    size_t count = bytes / 4;
    size_t i;
    uint32_t *d = (uint32_t *)dest;

    for( i = 0; i < count; i++ )
        d[i] = pat;

    d += i;

    bytes &= 3;
    if( bytes )
        memcpy( d, src_pattern, bytes );
}

#endif

#if defined( __APPLE__ )
#include <mach/mach_time.h>
#endif

uint64_t GetTime( void );
uint64_t GetTime( void )
{
#if defined( __APPLE__ )
    return mach_absolute_time();
#elif defined(_MSC_VER)
    return  ReadTime();
#else
    //mach_absolute_time is a high precision timer with precision < 1 microsecond.
#warning need accurate clock here.  Times are invalid.
    return 0;
#endif
}


#if defined (_MSC_VER)
/* function is defined in "compat.h" */
#else
double SubtractTime( uint64_t endTime, uint64_t startTime );
double SubtractTime( uint64_t endTime, uint64_t startTime )
{
    uint64_t diff = endTime - startTime;
    static double conversion = 0.0;

    if( 0.0 == conversion )
    {
#if defined( __APPLE__ )
        mach_timebase_info_data_t info = {0,0};
        kern_return_t   err = mach_timebase_info( &info );
        if( 0 == err )
            conversion = 1e-9 * (double) info.numer / (double) info.denom;
#else
        // This function consumes output from GetTime() above, and converts the time to secionds.
#warning need accurate ticks to seconds conversion factor here. Times are invalid.
#endif
    }

    // strictly speaking we should also be subtracting out timer latency here
    return conversion * (double) diff;
}
#endif

typedef struct CalcReferenceValuesInfo
{
    struct WriteInputBufferInfo *parent;        // pointer back to the parent WriteInputBufferInfo struct
    cl_kernel                   kernel;         // the kernel for this vector size
    cl_program                  program;        // the program for this vector size
    cl_uint                     vectorSize;     // the vector size for this callback chain
    void                        *p;             // the pointer to mapped result data for this vector size
    cl_int                      result;
}CalcReferenceValuesInfo;

typedef struct WriteInputBufferInfo
{
    volatile cl_event           calcReferenceValues;   // user event which signals when main thread is done calculating reference values
    volatile cl_event           doneBarrier;     // user event which signals when worker threads are done
    cl_uint                     count;           // the number of elements in the array
    Type                        outType;         // the data type of the conversion result
    Type                        inType;          // the data type of the conversion input
    volatile int                barrierCount;
    CalcReferenceValuesInfo     calcInfo[kCallStyleCount];
}WriteInputBufferInfo;

cl_uint RoundUpToNextPowerOfTwo( cl_uint x );
cl_uint RoundUpToNextPowerOfTwo( cl_uint x )
{
    if( 0 == (x & (x-1)))
        return x;

    while( x & (x-1) )
       x &= x-1;

    return x + x;
}

void CL_CALLBACK WriteInputBufferComplete( cl_event, cl_int, void * );

typedef struct DataInitInfo
{
    cl_ulong        start;
    cl_uint         size;
    Type            outType;
    Type            inType;
    SaturationMode  sat;
    RoundingMode    round;
    MTdata          *d;
}DataInitInfo;

cl_int InitData( cl_uint job_id, cl_uint thread_id, void *p );
cl_int InitData( cl_uint job_id, cl_uint thread_id, void *p )
{
    DataInitInfo *info = (DataInitInfo*) p;

    gInitFunctions[ info->inType ]( (char*)gIn + job_id * info->size * gTypeSizes[info->inType], info->sat, info->round,
                                   info->outType, info->start + job_id * info->size, info->size, info->d[thread_id] );
    return CL_SUCCESS;
}

static void setAllowZ(uint8_t *allow, uint32_t *x, cl_uint count)
{
    cl_uint i;
    for (i = 0; i < count; ++i)
    allow[i] |= (uint8_t)((x[i] & 0x7f800000U) == 0);
}

cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p );
cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p )
{
    DataInitInfo *info = (DataInitInfo*) p;
    cl_uint count = info->size;
    Type inType = info->inType;
    Type outType = info->outType;
    RoundingMode round = info->round;
    size_t j;

    Force64BitFPUPrecision();

    void *s = (cl_uchar*) gIn + job_id * count * gTypeSizes[info->inType];
    void *a = (cl_uchar*) gAllowZ + job_id * count;
    void *d = (cl_uchar*) gRef + job_id * count * gTypeSizes[info->outType];

    if (outType != inType)
    {
        //create the reference while we wait
        Convert f = gConversions[ outType ][ inType ];
        if( info->sat )
            f = gSaturatedConversions[ outType ][ inType ];

        RoundingMode oldRound = set_round( round, outType );
        f( d, s, count );
        set_round( oldRound, outType );

    // Decide if we allow a zero result in addition to the correctly rounded one
        memset(a, 0, count);
    if (gForceFTZ) {
        if (inType == kfloat)
        setAllowZ((uint8_t*)a, (uint32_t*)s, count);
        if (outType == kfloat)
        setAllowZ((uint8_t*)a, (uint32_t*)d, count);
    }
    }
    else
    {
        // Copy the input to the reference
        memcpy(d, s, info->size * gTypeSizes[inType]);
    }

    //Patch up NaNs conversions to integer to zero -- these can be converted to any integer
    if( info->outType != kfloat && info->outType != kdouble )
    {
        if( inType == kfloat )
        {
            float *inp = (float*) s;
            for( j = 0; j < count; j++ )
            {
                if( isnan( inp[j] ) )
                    memset( (char*) d + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] );
            }
        }
        if( inType == kdouble )
        {
            double *inp = (double*) s;
            for( j = 0; j < count; j++ )
            {
                if( isnan( inp[j] ) )
                    memset( (char*) d + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] );
            }
        }
    }
    else if( inType == kfloat || inType == kdouble )
    {  // outtype and intype is float or double.  NaN conversions for float <-> double can be any NaN
        if( inType == kfloat && outType == kdouble )
        {
            float *inp = (float*) s;
            for( j = 0; j < count; j++ )
            {
                if( isnan( inp[j] ) )
                    ((double*) d)[j] = NAN;
            }
        }
        if( inType == kdouble && outType == kfloat )
        {
            double *inp = (double*) s;
            for( j = 0; j < count; j++ )
            {
                if( isnan( inp[j] ) )
                    ((float*) d)[j] = NAN;
            }
        }
    }

    return CL_SUCCESS;
}

static int DoTest( Type outType, Type inType, SaturationMode sat, RoundingMode round, MTdata d )
{
#ifdef __APPLE__
    cl_ulong wall_start = mach_absolute_time();
#endif

    DataInitInfo  init_info = { 0, 0, outType, inType, sat, round, NULL };
    WriteInputBufferInfo writeInputBufferInfo;
    int vectorSize;
    int error = 0;
    cl_uint threads = GetThreadCount();
    uint64_t i;

    gTestCount++;
    size_t blockCount = BUFFER_SIZE / MAX( gTypeSizes[ inType ], gTypeSizes[ outType ] );
    size_t step = blockCount;
    uint64_t lastCase = 1ULL << (8*gTypeSizes[ inType ]);
    cl_event writeInputBuffer = NULL;

    memset( &writeInputBufferInfo, 0, sizeof( writeInputBufferInfo ) );
    init_info.d = (MTdata*)malloc( threads * sizeof( MTdata ) );
    if( NULL == init_info.d )
    {
        vlog_error( "ERROR: Unable to allocate storage for random number generator!\n" );
        return -1;
    }
    for( i = 0; i < threads; i++ )
    {
        init_info.d[i] = init_genrand( genrand_int32( d ) );
        if( NULL == init_info.d[i] )
        {
            vlog_error( "ERROR: Unable to allocate storage for random number generator!\n" );
            return -1;
        }
    }

    writeInputBufferInfo.outType = outType;
    writeInputBufferInfo.inType = inType;

    for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
    {
        writeInputBufferInfo.calcInfo[vectorSize].program = MakeProgram( outType, inType, sat, round, vectorSize,
                                                                        &writeInputBufferInfo.calcInfo[vectorSize].kernel );
        if( NULL == writeInputBufferInfo.calcInfo[vectorSize].program )
        {
            gFailCount++;
            return -1;
        }
        if( NULL == writeInputBufferInfo.calcInfo[vectorSize].kernel )
        {
            gFailCount++;
            vlog_error( "\t\tFAILED -- Failed to create kernel.\n" );
            return -2;
        }

        writeInputBufferInfo.calcInfo[vectorSize].parent = &writeInputBufferInfo;
        writeInputBufferInfo.calcInfo[vectorSize].vectorSize = vectorSize;
        writeInputBufferInfo.calcInfo[vectorSize].result = -1;
    }

    if( gSkipTesting )
        goto exit;

    // Patch up rounding mode if default is RTZ
    // We leave the part above in default rounding mode so that the right kernel is compiled.
    if( round == kDefaultRoundingMode && gIsRTZ && (outType == kfloat) )
        init_info.round = round = kRoundTowardZero;

    // Figure out how many elements are in a work block

    // we handle 64-bit types a bit differently.
    if( 8*gTypeSizes[ inType ] > 32 )
        lastCase = 0x100000000ULL;

    if ( gWimpyMode )
        step = (size_t)blockCount * (size_t)gWimpyReductionFactor;
    vlog( "Testing... " );
    fflush(stdout);
    for( i = 0; i < (uint64_t)lastCase; i += step )
    {

        if( 0 == ( i & ((lastCase >> 3) -1))) {
            vlog(".");
            fflush(stdout);
        }

        cl_uint count = (uint32_t) MIN( blockCount, lastCase - i );
        writeInputBufferInfo.count = count;

        // Crate a user event to represent the status of the reference value computation completion
        writeInputBufferInfo.calcReferenceValues = clCreateUserEvent( gContext, &error);
        if( error || NULL == writeInputBufferInfo.calcReferenceValues )
        {
            vlog_error( "ERROR: Unable to create user event. (%d)\n", error );
            gFailCount++;
            goto exit;
        }

        // retain for consumption by MapOutputBufferComplete
        for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
        {
            if( (error = clRetainEvent(writeInputBufferInfo.calcReferenceValues) ))
            {
                vlog_error( "ERROR: Unable to retain user event. (%d)\n", error );
                gFailCount++;
                goto exit;
            }
        }

        // Crate a user event to represent when the callbacks are done verifying correctness
        writeInputBufferInfo.doneBarrier = clCreateUserEvent( gContext, &error);
        if( error || NULL == writeInputBufferInfo.calcReferenceValues )
        {
            vlog_error( "ERROR: Unable to create user event for barrier. (%d)\n", error );
            gFailCount++;
            goto exit;
        }

        // retain for use by the callback that calls this
        if( (error = clRetainEvent(writeInputBufferInfo.doneBarrier) ))
        {
            vlog_error( "ERROR: Unable to retain user event doneBarrier. (%d)\n", error );
            gFailCount++;
            goto exit;
        }

        //      Call this in a multithreaded manner
        //      gInitFunctions[ inType ]( gIn, sat, round, outType, i, count, d );
        cl_uint chunks = RoundUpToNextPowerOfTwo(threads) * 2;
        init_info.start = i;
        init_info.size = count / chunks;
        if( init_info.size < 16384 )
        {
            chunks = RoundUpToNextPowerOfTwo(threads);
            init_info.size = count / chunks;
            if( init_info.size < 16384 )
            {
                init_info.size = count;
                chunks = 1;
            }
        }
        ThreadPool_Do(InitData, chunks, &init_info);

        // Copy the results to the device
        writeInputBuffer = NULL;
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, count * gTypeSizes[inType], gIn, 0, NULL, &writeInputBuffer )))
        {
            vlog_error( "ERROR: clEnqueueWriteBuffer failed. (%d)\n", error );
            gFailCount++;
            goto exit;
        }

        // Setup completion callback for the write, which will enqueue the rest of the work
        // This is somewhat gratuitous.  Because this is an in order queue, we didn't really need to
        // do this work in a callback. We could have done it from the main thread.  Here we are
        // verifying that the implementation can enqueue work from a callback, while at the same time
        // also checking to make sure that the conversions work.
        //
        // Because the verification code is also moved to a callback, it is hoped that implementations will
        // achieve a test performance improvement because they can verify the results in parallel.  If the
        // implementation serializes callbacks however, that won't happen.   Consider it some motivation
        // to do the right thing! :-)
        if( (error = clSetEventCallback( writeInputBuffer, CL_COMPLETE, WriteInputBufferComplete, &writeInputBufferInfo)) )
        {
            vlog_error( "ERROR: clSetEventCallback failed. (%d)\n", error );
            gFailCount++;
            goto exit;
        }

        // The event can't be destroyed until the callback is called, so we can release it now.
        if( (error = clReleaseEvent(writeInputBuffer) ))
        {
            vlog_error( "ERROR: clReleaseEvent failed. (%d)\n", error );
            gFailCount++;
            goto exit;
        }

        // Make sure the work is actually running, so we don't deadlock
        if( (error = clFlush( gQueue ) ) )
        {
            vlog_error( "clFlush failed with error %d\n", error );
            gFailCount++;
            goto exit;
        }

        ThreadPool_Do(PrepareReference, chunks, &init_info);

        // signal we are done calculating the reference results
        if( (error = clSetUserEventStatus( writeInputBufferInfo.calcReferenceValues, CL_COMPLETE ) ) )
        {
            vlog_error( "Error:  Failed to set user event status to CL_COMPLETE:  %d\n", error );
            gFailCount++;
            goto exit;
        }

        // Wait for the event callbacks to finish verifying correctness.
        if( (error = clWaitForEvents( 1, (cl_event*) &writeInputBufferInfo.doneBarrier ) ))
        {
            vlog_error( "Error:  Failed to wait for barrier:  %d\n", error );
            gFailCount++;
            goto exit;
        }

        if( (error = clReleaseEvent(writeInputBufferInfo.calcReferenceValues ) ))
        {
            vlog_error( "Error:  Failed to release calcReferenceValues:  %d\n", error );
            gFailCount++;
            goto exit;
        }

        if( (error = clReleaseEvent(writeInputBufferInfo.doneBarrier ) ))
        {
            vlog_error( "Error:  Failed to release done barrier:  %d\n", error );
            gFailCount++;
            goto exit;
        }


        for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
        {
            if( ( error = writeInputBufferInfo.calcInfo[ vectorSize ].result ))
            {
                switch( inType )
                {
                    case kuchar:
                    case kchar:
                        vlog( "Input value: 0x%2.2x ", ((unsigned char*)gIn)[error - 1] );
                        break;
                    case kushort:
                    case kshort:
                        vlog( "Input value: 0x%4.4x ", ((unsigned short*)gIn)[error - 1] );
                        break;
                    case kuint:
                    case kint:
                        vlog( "Input value: 0x%8.8x ", ((unsigned int*)gIn)[error - 1] );
                        break;
                    case kfloat:
                        vlog( "Input value: %a ", ((float*)gIn)[error - 1] );
                        break;
                        break;
                    case kulong:
                    case klong:
                        vlog( "Input value: 0x%16.16llx ", ((unsigned long long*)gIn)[error - 1] );
                        break;
                    case kdouble:
                        vlog( "Input value: %a ", ((double*)gIn)[error - 1]);
                        break;
                    default:
                        vlog_error( "Internal error at %s: %d\n", __FILE__, __LINE__ );
                        abort();
                        break;
                }

                // tell the user which conversion it was.
                if( 0 == vectorSize )
                    vlog( " (implicit scalar conversion from %s to %s)\n", gTypeNames[ inType ], gTypeNames[ outType ] );
                else
                    vlog( " (convert_%s%s%s%s( %s%s ))\n", gTypeNames[outType], sizeNames[vectorSize], gSaturationNames[ sat ],
                                                            gRoundingModeNames[ round ], gTypeNames[inType], sizeNames[vectorSize] );

                gFailCount++;
                goto exit;
            }
        }
    }

    log_info( "done.\n" );

    if( gTimeResults )
    {
        //Kick off tests for the various vector lengths
        for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
        {
            size_t workItemCount = blockCount / vectorSizes[vectorSize];
            if( vectorSizes[vectorSize] * gTypeSizes[outType] < 4 )
                workItemCount /= 4 / (vectorSizes[vectorSize] * gTypeSizes[outType]);

            double sum = 0.0;
            double bestTime = INFINITY;
            cl_uint k;
            for( k = 0; k < PERF_LOOP_COUNT; k++ )
            {
                uint64_t startTime = GetTime();
                if( (error = RunKernel( writeInputBufferInfo.calcInfo[vectorSize].kernel, gInBuffer, gOutBuffers[ vectorSize ], workItemCount )) )
                {
                    gFailCount++;
                    goto exit;
                }

                // Make sure OpenCL is done
                if( (error = clFinish(gQueue) ) )
                {
                    vlog_error( "Error %d at clFinish\n", error );
                    goto exit;
                }

                uint64_t endTime = GetTime();
                double time = SubtractTime( endTime, startTime );
                sum += time;
                if( time < bestTime )
                    bestTime = time;

            }

            if( gReportAverageTimes )
                bestTime = sum / PERF_LOOP_COUNT;
            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (workItemCount * vectorSizes[vectorSize]);
            if( 0 == vectorSize )
                vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "implicit convert %s -> %s", gTypeNames[ inType ], gTypeNames[ outType ] );
            else
                vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "convert_%s%s%s%s( %s%s )", gTypeNames[ outType ], sizeNames[vectorSize], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType], sizeNames[vectorSize] );
        }
    }

    if( gWimpyMode )
        vlog( "\tWimp pass" );
    else
        vlog( "\tpassed" );

#ifdef __APPLE__
    // record the run time
    vlog( "\t(%f s)", 1e-9 * ( mach_absolute_time() - wall_start ) );
#endif
    vlog( "\n\n" );
    fflush( stdout );


exit:
    //clean up
    for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
    {
        clReleaseProgram( writeInputBufferInfo.calcInfo[vectorSize].program );
        clReleaseKernel( writeInputBufferInfo.calcInfo[vectorSize].kernel );
    }

    if( init_info.d )
    {
        for( i = 0; i < threads; i++ )
            free_mtdata(init_info.d[i]);
        free(init_info.d);
    }

    return error;
}

void CL_CALLBACK MapResultValuesComplete( cl_event e, cl_int status, void *data );

// Note: not called reentrantly
void CL_CALLBACK WriteInputBufferComplete( cl_event e, cl_int status, void *data )
{
    WriteInputBufferInfo *info = (WriteInputBufferInfo*) data;
    cl_uint count = info->count;
    int vectorSize;

    if( CL_SUCCESS != status )
    {
        vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status );
        gFailCount++;
        return;
    }

    info->barrierCount = gMaxVectorSize - gMinVectorSize;

    // now that we know that the write buffer is complete, enqueue callbacks to wait for the main thread to
    // finish calculating the reference results.
    for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
    {
        size_t workItemCount = (count + vectorSizes[vectorSize] - 1) / ( vectorSizes[vectorSize]);
        cl_event mapComplete = NULL;

        if( (status = RunKernel( info->calcInfo[ vectorSize ].kernel, gInBuffer, gOutBuffers[ vectorSize ], workItemCount )) )
        {
            gFailCount++;
            return;
        }

        info->calcInfo[vectorSize].p = clEnqueueMapBuffer( gQueue, gOutBuffers[ vectorSize ], CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
                                                          0, count * gTypeSizes[ info->outType ], 0, NULL, &mapComplete, &status);
        {
            if( status )
            {
                vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status );
                gFailCount++;
                return;
            }
        }

        if( (status = clSetEventCallback( mapComplete, CL_COMPLETE, MapResultValuesComplete, info->calcInfo + vectorSize)))
        {
            vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status );
            gFailCount++;
            return;
        }

        if( (status = clReleaseEvent(mapComplete)))
        {
            vlog_error( "ERROR: clReleaseEvent calback failed in WriteInputBufferComplete for vector size %d with status: %d\n", vectorSize, status );
            gFailCount++;
            return;
        }
    }

    // Make sure the work starts moving -- otherwise we may deadlock
    if( (status = clFlush(gQueue)))
    {
        vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status );
        gFailCount++;
        return;
    }

    // e was already released by the main thread. It should be destroyed automatically soon after we exit.
}

void CL_CALLBACK CalcReferenceValuesComplete( cl_event e, cl_int status, void *data );

// Note: May be called reentrantly
void CL_CALLBACK MapResultValuesComplete( cl_event e, cl_int status, void *data )
{
    CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo*) data;
    cl_event calcReferenceValues = info->parent->calcReferenceValues;

    if( CL_SUCCESS != status )
    {
        vlog_error( "ERROR: MapResultValuesComplete calback failed with status: %d\n", status );
        gFailCount++;       // not thread safe -- being lazy here
        clReleaseEvent(calcReferenceValues);
        return;
    }

    // we know that the map is done, wait for the main thread to finish calculating the reference values
    if( (status = clSetEventCallback( calcReferenceValues, CL_COMPLETE, CalcReferenceValuesComplete, data )))
    {
        vlog_error( "ERROR: clSetEventCallback failed in MapResultValuesComplete with status: %d\n", status );
        gFailCount++;       // not thread safe -- being lazy here
    }

    // this thread no longer needs its reference to info->calcReferenceValues, so release it
    if( (status = clReleaseEvent(calcReferenceValues) ))
    {
        vlog_error( "ERROR: clReleaseEvent(info->calcReferenceValues) failed with status: %d\n", status );
        gFailCount++;       // not thread safe -- being lazy here
    }

    // no need to flush since we didn't enqueue anything

    // e was already released by WriteInputBufferComplete. It should be destroyed automatically soon after we exit.
}


void CL_CALLBACK CalcReferenceValuesComplete( cl_event e, cl_int status, void *data )
{
    CalcReferenceValuesInfo     *info = (CalcReferenceValuesInfo*) data;
    cl_uint                     vectorSize = info->vectorSize;
    cl_uint                     count = info->parent->count;
    Type                        outType = info->parent->outType;        // the data type of the conversion result
    Type                        inType = info->parent->inType;          // the data type of the conversion input
    size_t                      j;
    cl_int                      error;
    cl_event                    doneBarrier = info->parent->doneBarrier;

    // report spurious error condition
    if( CL_SUCCESS != status )
    {
        vlog_error( "ERROR: CalcReferenceValuesComplete did not succeed! (%d)\n", status );
        gFailCount++;       // lazy about thread safety here
        return;
    }

    // Now we know that both results have been mapped back from the device, and the
    // main thread is done calculating the reference results. It is now time to check
    // the results.

    // verify results
    void *mapped = info->p;

    //Patch up NaNs conversions to integer to zero -- these can be converted to any integer
    if( outType != kfloat && outType != kdouble )
    {
        if( inType == kfloat )
        {
            float *inp = (float*) gIn;
            for( j = 0; j < count; j++ )
            {
                if( isnan( inp[j] ) )
                    memset( (char*) mapped + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] );
            }
        }
        if( inType == kdouble )
        {
            double *inp = (double*) gIn;
            for( j = 0; j < count; j++ )
            {
                if( isnan( inp[j] ) )
                    memset( (char*) mapped + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] );
            }
        }
    }
    else if( inType == kfloat || inType == kdouble )
    {  // outtype and intype is float or double.  NaN conversions for float <-> double can be any NaN
        if( inType == kfloat && outType == kdouble )
        {
            float *inp = (float*) gIn;
            double *outp = (double*) mapped;
            for( j = 0; j < count; j++ )
            {
                if( isnan( inp[j] ) && isnan(outp[j]) )
                    outp[j] = NAN;
            }
        }
        if( inType == kdouble && outType == kfloat )
        {
            double *inp = (double*) gIn;
            float *outp = (float*) mapped;
            for( j = 0; j < count; j++ )
            {
                if( isnan( inp[j] ) && isnan(outp[j]) )
                    outp[j] = NAN;
            }
        }
    }

    if( memcmp( mapped, gRef, count * gTypeSizes[ outType ] ) )
        info->result = gCheckResults[outType]( mapped, gRef, gAllowZ, count, vectorSizes[vectorSize] );
    else
        info->result = 0;

    // Fill the output buffer with junk and release it
    {
        cl_uint pattern =  0xffffdead;
        memset_pattern4(mapped, &pattern, count * gTypeSizes[outType]);
        if((error = clEnqueueUnmapMemObject(gQueue, gOutBuffers[ vectorSize ], mapped, 0, NULL, NULL)))
        {
            vlog_error( "ERROR: clEnqueueUnmapMemObject failed in CalcReferenceValuesComplete  (%d)\n", error );
            gFailCount++;
        }
    }

    if( 1 == ThreadPool_AtomicAdd( &info->parent->barrierCount, -1) )
    {
        if( (status = clSetUserEventStatus( doneBarrier, CL_COMPLETE) ))
        {
            vlog_error( "ERROR: clSetUserEventStatus failed in CalcReferenceValuesComplete (err: %d). We're probably going to deadlock.\n", status );
            gFailCount++;
            return;
        }

        if( (status = clReleaseEvent( doneBarrier ) ) )
        {
            vlog_error( "ERROR: clReleaseEvent failed in CalcReferenceValuesComplete (err: %d).\n", status );
            gFailCount++;
            return;
        }
    }


    // e was already released by WriteInputBufferComplete. It should be destroyed automatically soon after
    // all the calls to CalcReferenceValuesComplete exit.
}

static cl_program CreateImplicitConvertProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, char testName[256], cl_int *error )
{
    char inName[32];
    char outName[32];
    const char *programSource[] =
    {
        "", // optional pragma
        "__kernel void ", testName, "( __global ", inName,  " *src, __global ", outName, " *dest )\n"
        "{\n"
        "   size_t i = get_global_id(0);\n"
        "   dest[i] =  src[i];\n"
        "}\n"
    };
    size_t stringCount = sizeof( programSource ) / sizeof( programSource[0] );
    const char **strings = programSource;

    if (outType == kdouble || inType == kdouble)
        programSource[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";

    //create the type name
    strncpy( inName, gTypeNames[ inType ], sizeof( inName ) );
    strncpy( outName, gTypeNames[ outType ], sizeof( outName ) );
    sprintf( testName, "test_implicit_%s_%s", outName, inName );
    vlog( "Building implicit %s -> %s conversion test\n", gTypeNames[ inType ],  gTypeNames[ outType ] );
    fflush(stdout);

    //create the program
    cl_program program = clCreateProgramWithSource(gContext, (cl_uint) stringCount, strings, NULL, error);
    if( NULL == program || *error )
    {
        vlog_error( "\t\tFAILED -- Failed to create program. (%d)\n", *error );
        return NULL;
    }
    return program;
}


static cl_program CreateStandardProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, char testName[256], cl_int *error )
{
    vectorSize = vectorSizes[ vectorSize ];

    char convertString[128];
    char inName[32];
    char outName[32];
    const char *programSource[] =
    {
        "", // optional pragma
        "__kernel void ", testName, "( __global ", inName,  " *src, __global ", outName, " *dest )\n"
        "{\n"
        "   size_t i = get_global_id(0);\n"
        "   dest[i] = ", convertString, "( src[i] );\n"
        "}\n"
    };
    const char *programSourceV3[] =
    {
        "", // optional pragma
        "__kernel void ", testName, "( __global ", inName,  " *src, __global ", outName, " *dest )\n"
        "{\n"
        "   size_t i = get_global_id(0);\n"
        "   if( i + 1 < get_global_size(0))\n"
        "       vstore3( ", convertString, "( vload3( i, src)), i, dest );\n"
        "   else\n"
        "   {\n"
        "       ", inName, "3 in;\n"
        "       ", outName, "3 out;\n"
        "       if( 0 == (i & 1) )\n"
        "           in.y = src[3*i+1];\n"
        "       in.x = src[3*i];\n"
        "       out = ", convertString, "( in ); \n"
        "       dest[3*i] = out.x;\n"
        "       if( 0 == (i & 1) )\n"
        "           dest[3*i+1] = out.y;\n"
        "   }\n"
        "}\n"
    };
    size_t stringCount = 3 == vectorSize ?  sizeof( programSourceV3 ) / sizeof( programSourceV3[0] ) :
    sizeof( programSource ) / sizeof( programSource[0] );
    const char **strings = 3 == vectorSize ? programSourceV3 : programSource;

    if (outType == kdouble || inType == kdouble) {
        programSource[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
        programSourceV3[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
    }

    //create the type name
    switch (vectorSize)
    {
        case 1:
            strncpy( inName, gTypeNames[ inType ], sizeof( inName ) );
            strncpy( outName, gTypeNames[ outType ], sizeof( outName ) );
            snprintf( convertString, sizeof(convertString), "convert_%s%s%s", outName, gSaturationNames[ sat ], gRoundingModeNames[ round ] );
            snprintf( testName, 256, "test_%s_%s", convertString, inName );
            vlog( "Building %s( %s ) test\n", convertString, inName );
            break;
        case 3:
            strncpy( inName, gTypeNames[ inType ], sizeof( inName ) );
            strncpy( outName, gTypeNames[ outType ], sizeof( outName ) );
            snprintf( convertString, sizeof(convertString), "convert_%s3%s%s", outName, gSaturationNames[ sat ], gRoundingModeNames[ round ] );
            snprintf( testName, 256, "test_%s_%s3", convertString, inName );
            vlog( "Building %s( %s3 ) test\n", convertString, inName );
            break;
        default:
            snprintf( inName, sizeof( inName ), "%s%d", gTypeNames[ inType ],  vectorSize );
            snprintf( outName, sizeof( outName ), "%s%d", gTypeNames[ outType ],  vectorSize );
            snprintf( convertString, sizeof(convertString), "convert_%s%s%s", outName, gSaturationNames[ sat ], gRoundingModeNames[ round ] );
            snprintf( testName, 256, "test_%s_%s", convertString, inName );
            vlog( "Building %s( %s ) test\n", convertString, inName );
            break;
    }

    fflush(stdout);

    //create the program
    cl_program program = clCreateProgramWithSource(gContext, (cl_uint) stringCount, strings, NULL, error);
    if( NULL == program || *error )
    {
        vlog_error( "\t\tFAILED -- Failed to create program. (%d)\n", *error );
        return NULL;
    }
    return program;
}


static cl_program   MakeProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, cl_kernel *outKernel )
{
    cl_program program;
    char testName[256];
    int error = 0;

    // Create the program. This is a bit complicated because we are trying to avoid byte and short stores.
    if( 0 == vectorSize )
        program = CreateImplicitConvertProgram( outType, inType, sat, round, vectorSize, testName, &error );
    else
        program = CreateStandardProgram( outType, inType, sat, round, vectorSize, testName, &error );
    *outKernel = NULL;

    const char *flags = NULL;
    if( gForceFTZ )
        flags = "-cl-denorms-are-zero";

    // build it
    if( (error = clBuildProgram( program, 1, &gDevice, flags, NULL, NULL )))
    {
        char    buffer[2048] = "";

        vlog_error("\t\tFAILED -- clBuildProgramExecutable() failed: %d\n", error);
        clGetProgramBuildInfo(program, gDevice, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
        vlog_error("Log: %s\n", buffer);

        clReleaseProgram( program );
        return NULL;
    }

    *outKernel = clCreateKernel(program, testName, &error);
    if( NULL == *outKernel || error)
    {
        char    buffer[2048] = "";

        vlog_error("\t\tFAILED -- clCreateKernel() failed (%d):\n", error);
        clGetProgramBuildInfo(program, gDevice, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
        vlog_error("Log: %s\n", buffer);
        clReleaseProgram( program );
        return NULL;
    }

    return program;
}