OpenCL-CTS/test_conformance/conversions/test_conversions.cpp

//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "harness/compat.h"
#include "harness/rounding_mode.h"
#include "harness/ThreadPool.h"
#include "harness/testHarness.h"
#include "harness/kernelHelpers.h"
#include "harness/parseParameters.h"
#if defined(__APPLE__)
#include <sys/sysctl.h>
#endif

#if defined(__linux__)
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/sysctl.h>
#endif
#if defined(__linux__)
#include <sys/param.h>
#include <libgen.h>
#endif

#include "mingw_compat.h"
#if defined(__MINGW32__)
#include <sys/param.h>
#endif

#include <sstream>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#if !defined(_WIN32)
#include <libgen.h>
#include <sys/mman.h>
#endif
#include <time.h>

#include <algorithm>

#include "Sleep.h"
#include "basic_test_conversions.h"

#if (defined(_WIN32) && defined(_MSC_VER))
// needed for _controlfp_s and rounding modes in RoundingMode
#include "harness/testHarness.h"
#endif

#pragma mark -
#pragma mark globals

// Size in bytes of each OpenCL buffer created in InitCL(); the host staging
// buffers allocated there are this size plus 2 * kPageSize of padding.
#define BUFFER_SIZE (1024 * 1024)
// Padding unit used when over-allocating the host buffers in InitCL().
#define kPageSize 4096
// NOTE(review): the two constants below are not referenced in this portion of
// the file; their consumers are presumably further down -- confirm.
#define EMBEDDED_REDUCTION_FACTOR 16
#define PERF_LOOP_COUNT 100

// Number of call styles exercised per conversion: one per entry counted by
// kVectorSizeCount plus one extra slot for the implicit scalar conversion.
#define kCallStyleCount (kVectorSizeCount + 1 /* for implicit scalar */)

#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
#include "fplib.h"
extern bool qcom_sat;
extern roundingMode qcom_rm;
#endif

// Test names collected by ParseArgs(): every command-line argument that is
// neither an option (leading '-') nor a number. argCount is the number of
// valid entries in argList.
const char **argList = NULL;
int argCount = 0;
// OpenCL context and command queue created in InitCL() and released in main().
cl_context gContext = NULL;
cl_command_queue gQueue = NULL;
// Executable base name; overwritten by ParseArgs() and printed by PrintUsage().
char appName[64] = "ctest";
// Optional window over the enumerated tests (see test_conversions()).
// gStartTestNumber stays -1 until the first numeric argument is parsed; a
// second numeric argument sets gEndTestNumber = start + count. A
// gEndTestNumber of 0 means "no upper bound".
int gStartTestNumber = -1;
int gEndTestNumber = 0;
// Toggled by the -t option; on by default only on Apple platforms.
#if defined(__APPLE__)
int gTimeResults = 1;
#else
int gTimeResults = 0;
#endif
// Toggled by the -a option.
int gReportAverageTimes = 0;
// Host-side buffers allocated in InitCL() (BUFFER_SIZE + 2 * kPageSize bytes
// each): gIn holds the generated input, gRef the reference results computed
// by PrepareReference(), and gAllowZ one byte per element that is non-zero
// when a zero result is accepted in addition to the reference value.
void *gIn = NULL;
void *gRef = NULL;
void *gAllowZ = NULL;
// One host-side output buffer per call style, allocated in InitCL().
void *gOut[kCallStyleCount] = { NULL };
// Device-side input buffer and per-call-style output buffers (InitCL()).
cl_mem gInBuffer;
cl_mem gOutBuffers[kCallStyleCount];
// Filled by InitCL() from CL_DEVICE_MAX_COMPUTE_UNITS (1 if the query fails).
size_t gComputeDevices = 0;
// Filled by InitCL() from CL_DEVICE_MAX_CLOCK_FREQUENCY (0 if the query
// fails); logged with the unit "MHz".
uint32_t gDeviceFrequency = 0;
// Wimpy mode: toggled by -w, forced on when the CL_WIMPY_MODE environment
// variable is set. The reduction factor is set through the -[ option.
int gWimpyMode = 0;
int gWimpyReductionFactor = 128;
// Toggled by -l ("link check mode" in the usage text).
int gSkipTesting = 0;
// Toggled by -z, and toggled again by InitCL() when the device does not
// report CL_FP_DENORM for single precision.
int gForceFTZ = 0;
// Toggled by -m; when 0, main() restricts the thread pool to one thread.
int gMultithread = 1;
// Set to 1 by InitCL() for embedded-profile devices that support
// CL_FP_ROUND_TO_ZERO but not CL_FP_ROUND_TO_NEAREST.
int gIsRTZ = 0;
// NOTE(review): not referenced in this portion of the file.
uint32_t gSimdSize = 1;
// gHasDouble: set by InitCL() when cl_khr_fp64 is available.
// gTestDouble: toggled by -d, then masked with gHasDouble in InitCL().
int gHasDouble = 0;
int gTestDouble = 1;
// Parallel tables indexed by call style. Index 0 is the implicit conversion
// (skipped by test_conversions() for saturated or explicitly rounded tests),
// index 1 the explicit scalar form, followed by vector widths 2..16.
const char *sizeNames[] = { "", "", "2", "3", "4", "8", "16" };
const int vectorSizes[] = { 1, 1, 2, 3, 4, 8, 16 };
// Half-open range [gMinVectorSize, gMaxVectorSize) of indices into
// vectorSizes that will be tested; narrowed by the -# options in ParseArgs().
int gMinVectorSize = 0;
int gMaxVectorSize = sizeof(vectorSizes) / sizeof(vectorSizes[0]);
// Random number generator state; initialised in main() and again in InitCL().
static MTdata gMTdata;

#pragma mark -
#pragma mark Declarations

// Command-line handling (defined below).
static int ParseArgs(int argc, const char **argv);
static void PrintUsage(void);
// Per-device setup callback handed to runTestHarnessWithCheck() in main().
test_status InitCL(cl_device_id device);
// Decodes a test name of the form destFormat<_sat><_round>_sourceFormat.
static int GetTestCase(const char *name, Type *outType, Type *inType,
                       SaturationMode *sat, RoundingMode *round);
// Runs one conversion test; a non-zero return is reported as FAILED by
// test_conversions(). Definition is not in this portion of the file.
static int DoTest(cl_device_id device, Type outType, Type inType,
                  SaturationMode sat, RoundingMode round, MTdata d);
// Definition is not in this portion of the file.
static cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
                              RoundingMode round, int vectorSize,
                              cl_kernel *outKernel);
// Sets the two kernel arguments and enqueues a 1-D NDRange of blockCount.
static int RunKernel(cl_kernel kernel, void *inBuf, void *outBuf,
                     size_t blockCount);

// Definitions are not in this portion of the file.
void *FlushToZero(void);
void UnFlushToZero(void *);

// On Windows the x87 FPU defaults to 53-bit precision (long double having
// been deprecated there). Tests that convert long/ulong to float or double,
// or otherwise need more than 53 bits of intermediate precision, produce
// wrong references in that state, so switch the x87 to 64-bit precision.
// Only MinGW builds do anything here; every other platform is a no-op.
static inline void Force64BitFPUPrecision(void)
{
#if __MINGW32__
    // _controlfp(_PC_64, _MCW_PC) from <float.h> would be the usual way to do
    // this, but it exists on MinGW32 only and not on MinGW64. Inline assembly
    // works on both, which avoids divergent code paths.
    unsigned short int savedControlWord = 0;
    __asm__ __volatile__("fstcw %0" : "=m"(savedControlWord));

    // Setting both bits of mask 0x0300 selects 64-bit precision.
    unsigned short int extendedControlWord = savedControlWord | 0x0300;
    __asm__ __volatile__("fldcw  %0" ::"m"(extendedControlWord));
#else
    /* Implement for other platforms if needed */
#endif
}

// Harness entry point for the conversions test.
//
// When test names were collected on the command line (argCount != 0) only
// those conversions are run. Otherwise every destination type x source type x
// saturation mode x rounding mode combination is enumerated, restricted to
// the optional [gStartTestNumber, gEndTestNumber) window.
//
// Only `device` is used (it is forwarded to DoTest()); `context`, `queue` and
// `num_elements` are not referenced by this function.
//
// Returns the global failure counter gFailCount.
int test_conversions(cl_device_id device, cl_context context,
                     cl_command_queue queue, int num_elements)
{
    // Lower vector-size index in effect on entry. Only when it is 0 do we
    // switch the implicit-conversion slot on and off per test below.
    const int initialMinVectorSize = gMinVectorSize;

    if (argCount)
    {
        // Run exactly the conversions named on the command line.
        for (int argIndex = 0; argIndex < argCount; ++argIndex)
        {
            Type dstType, srcType;
            SaturationMode satMode;
            RoundingMode roundMode;

            if (GetTestCase(argList[argIndex], &dstType, &srcType, &satMode,
                            &roundMode))
            {
                vlog_error("\n\t\t**** ERROR:  Unable to parse function name "
                           "%s.  Skipping....  *****\n\n",
                           argList[argIndex]);
                continue;
            }

            // Double tests are skipped when double testing is off; if the
            // device does have doubles that is reported as a failure line.
            const bool involvesDouble =
                (srcType == kdouble) || (dstType == kdouble);
            if (involvesDouble && !gTestDouble)
            {
                if (gHasDouble)
                {
                    vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
                               gTypeNames[dstType], gSaturationNames[satMode],
                               gRoundingModeNames[roundMode],
                               gTypeNames[srcType]);
                    vlog("\t\tcl_khr_fp64 enabled, but double testing turned "
                         "off.\n");
                }
                continue;
            }

            // 64-bit integer types are skipped when the device lacks them
            // (embedded profile).
            const bool involvesLong = (srcType == klong) || (dstType == klong)
                || (srcType == kulong) || (dstType == kulong);
            if (involvesLong && !gHasLong) continue;

            // Implicit conversions only exist for the unsaturated,
            // default-rounding form.
            if (0 == initialMinVectorSize)
            {
                gMinVectorSize =
                    (satMode || roundMode != kDefaultRoundingMode) ? 1 : 0;
            }

            if (DoTest(device, dstType, srcType, satMode, roundMode, gMTdata)
                != 0)
            {
                vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
                           gTypeNames[dstType], gSaturationNames[satMode],
                           gRoundingModeNames[roundMode], gTypeNames[srcType]);
            }
        }

        return gFailCount;
    }

    // No names given: walk the full test matrix. testNumber counts every
    // combination that reaches the innermost loop, including skipped ones.
    int testNumber = -1;

    for (int dstIndex = 0; dstIndex < kTypeCount; ++dstIndex)
    {
        const Type dstType = (Type)dstIndex;

        for (int srcIndex = 0; srcIndex < kTypeCount; ++srcIndex)
        {
            const Type srcType = (Type)srcIndex;

            // 64-bit integer types are skipped when the device lacks them
            // (embedded profile).
            const bool involvesLong = (srcType == klong) || (dstType == klong)
                || (srcType == kulong) || (dstType == kulong);
            if (involvesLong && !gHasLong) continue;

            const bool involvesDouble =
                (srcType == kdouble) || (dstType == kdouble);

            for (int satIndex = 0; satIndex < kSaturationModeCount; ++satIndex)
            {
                const SaturationMode satMode = (SaturationMode)satIndex;

                // Saturated conversions to floating-point types are illegal.
                const bool dstIsFloating =
                    (dstType == kfloat) || (dstType == kdouble);
                if (kSaturated == satMode && dstIsFloating) continue;

                for (int roundIndex = 0; roundIndex < kRoundingModeCount;
                     ++roundIndex)
                {
                    const RoundingMode roundMode = (RoundingMode)roundIndex;

                    // Apply the optional start/end window.
                    ++testNumber;
                    if (testNumber < gStartTestNumber) continue;
                    if (gEndTestNumber > 0 && testNumber >= gEndTestNumber)
                        return gFailCount;

                    vlog("%d) Testing convert_%sn%s%s( %sn ):\n", testNumber,
                         gTypeNames[dstType], gSaturationNames[satMode],
                         gRoundingModeNames[roundMode], gTypeNames[srcType]);

                    // Double tests are skipped when double testing is off;
                    // with a double-capable device that is logged as FAILED.
                    if (involvesDouble && !gTestDouble)
                    {
                        if (gHasDouble)
                        {
                            vlog_error(
                                "\t *** %d) convert_%sn%s%s( %sn ) FAILED ** \n",
                                testNumber, gTypeNames[dstType],
                                gSaturationNames[satMode],
                                gRoundingModeNames[roundMode],
                                gTypeNames[srcType]);
                            vlog("\t\tcl_khr_fp64 enabled, but double testing "
                                 "turned off.\n");
                        }
                        continue;
                    }

                    // Implicit conversions only exist for the unsaturated,
                    // default-rounding form.
                    if (0 == initialMinVectorSize)
                    {
                        gMinVectorSize =
                            (satMode || roundMode != kDefaultRoundingMode) ? 1
                                                                           : 0;
                    }

                    if (DoTest(device, dstType, srcType, satMode, roundMode,
                               gMTdata)
                        != 0)
                    {
                        vlog_error(
                            "\t *** %d) convert_%sn%s%s( %sn ) FAILED ** \n",
                            testNumber, gTypeNames[dstType],
                            gSaturationNames[satMode],
                            gRoundingModeNames[roundMode], gTypeNames[srcType]);
                    }
                }
            }
        }
    }

    return gFailCount;
}

// Table of tests handed to runTestHarnessWithCheck() in main(). It has a
// single entry; ADD_TEST(conversions) presumably binds the name "conversions"
// to test_conversions() above -- the macro is defined in the harness headers.
test_definition test_list[] = {
    ADD_TEST(conversions),
};

// Number of entries in test_list.
const int test_num = ARRAY_SIZE(test_list);

#pragma mark -

// Program entry point.
//
// Parses the command line, seeds the random number generator, runs the single
// "conversions" test through the harness (with InitCL() as the per-device
// setup callback) and finally releases the OpenCL objects created by InitCL().
//
// Returns 1 if parseCustomParam() rejects the arguments, the ParseArgs()
// error code if that fails, and otherwise the harness result.
int main(int argc, const char **argv)
{
    int error;
    cl_uint seed = (cl_uint)time(NULL);

    argc = parseCustomParam(argc, argv);
    if (argc == -1)
    {
        return 1;
    }

    if ((error = ParseArgs(argc, argv))) return error;

    // Turn off sleep so our tests run to completion
    PreventSleep();
    atexit(ResumeSleep);

    if (!gMultithread) SetThreadCount(1);

#if defined(_MSC_VER) && defined(_M_IX86)
    // VS2005 (and probably others, since long double got deprecated) sets
    // the x87 to 53-bit precision. This causes problems with the tests
    // that convert long and ulong to float and double, since they deal
    // with values that need more precision than that. So, set the x87
    // to 64-bit precision.
    unsigned int ignored;
    _controlfp_s(&ignored, _PC_64, _MCW_PC);
#endif

    vlog("===========================================================\n");
    vlog("Random seed: %u\n", seed);
    // NOTE(review): InitCL() assigns gMTdata again from gRandomSeed, so the
    // generator created here (and the seed logged above) appears to be
    // replaced before any test runs -- confirm which seed is intended.
    gMTdata = init_genrand(seed);

    // Only the program name is forwarded; all other arguments were already
    // consumed by ParseArgs().
    const char *arg[] = { argv[0] };
    int ret =
        runTestHarnessWithCheck(1, arg, test_num, test_list, true, 0, InitCL);

    free_mtdata(gMTdata);
    if (gQueue)
    {
        error = clFinish(gQueue);
        if (error) vlog_error("clFinish failed: %d\n", error);
    }

    // InitCL() may have failed part-way or never run, leaving some of these
    // handles NULL; do not hand NULL handles to the clRelease* calls.
    if (gInBuffer) clReleaseMemObject(gInBuffer);

    for (int i = 0; i < kCallStyleCount; i++)
    {
        if (gOutBuffers[i]) clReleaseMemObject(gOutBuffers[i]);
    }
    if (gQueue) clReleaseCommandQueue(gQueue);
    if (gContext) clReleaseContext(gContext);

    return ret;
}

#pragma mark -
#pragma mark setup

// Parses the command line into the global test configuration.
//
//  - Arguments starting with '-' are option clusters; each character toggles
//    or sets one global (see PrintUsage()).
//  - Arguments that start with a number set gStartTestNumber (first number)
//    and then gEndTestNumber = start + count (second number).
//  - Everything else is stored in argList as a test name for
//    test_conversions().
//
// Also records the executable base name in appName, honours the
// CL_WIMPY_MODE environment variable and prints the wimpy-mode banner.
//
// Returns 0 on success and -1 if the argument list cannot be allocated or an
// unknown option is seen (after printing the usage text).
static int ParseArgs(int argc, const char **argv)
{
    int i;
    argList = (const char **)calloc(argc, sizeof(char *));
    argCount = 0;

    if (NULL == argList && argc > 1) return -1;

#if (defined(__APPLE__) || defined(__linux__) || defined(__MINGW32__))
    { // Extract the app name
        char baseName[MAXPATHLEN];
        // strncpy() leaves the destination unterminated when the source
        // fills it, and basename() needs a terminated string: copy one byte
        // less than the buffer holds and terminate explicitly.
        strncpy(baseName, argv[0], MAXPATHLEN - 1);
        baseName[MAXPATHLEN - 1] = '\0';
        char *base = basename(baseName);
        if (NULL != base)
        {
            strncpy(appName, base, sizeof(appName));
            appName[sizeof(appName) - 1] = '\0';
        }
    }
#elif defined(_WIN32)
    {
        char fname[_MAX_FNAME + _MAX_EXT + 1];
        char ext[_MAX_EXT];

        errno_t err = _splitpath_s(argv[0], NULL, 0, NULL, 0, fname, _MAX_FNAME,
                                   ext, _MAX_EXT);
        if (err == 0)
        { // no error
            strcat(fname, ext); // just cat them, size of fname can keep both
            strncpy(appName, fname, sizeof(appName));
            appName[sizeof(appName) - 1] = '\0';
        }
    }
#endif

    vlog("\n%s", appName);
    for (i = 1; i < argc; i++)
    {
        const char *arg = argv[i];
        if (NULL == arg) break;

        vlog("\t%s", arg);
        if (arg[0] == '-')
        {
            // Option cluster: handle every character after the dash.
            arg++;
            while (*arg != '\0')
            {
                switch (*arg)
                {
                    case 'd': gTestDouble ^= 1; break;
                    case 'l': gSkipTesting ^= 1; break;
                    case 'm': gMultithread ^= 1; break;
                    case 'w': gWimpyMode ^= 1; break;
                    case '[':
                        parseWimpyReductionFactor(arg, gWimpyReductionFactor);
                        break;
                    case 'z': gForceFTZ ^= 1; break;
                    case 't': gTimeResults ^= 1; break;
                    case 'a': gReportAverageTimes ^= 1; break;

                    // Vector-size selectors: gMinVectorSize/gMaxVectorSize
                    // form a half-open index range into vectorSizes[].
                    case '1':
                        if (arg[1] == '6')
                        {
                            gMinVectorSize = 6;
                            gMaxVectorSize = 7;
                            arg++; // consume the '6' as well
                        }
                        else
                        {
                            // Size 1 covers both the implicit (index 0) and
                            // the explicit scalar (index 1) conversions.
                            gMinVectorSize = 0;
                            gMaxVectorSize = 2;
                        }
                        break;

                    case '2':
                        gMinVectorSize = 2;
                        gMaxVectorSize = 3;
                        break;

                    case '3':
                        gMinVectorSize = 3;
                        gMaxVectorSize = 4;
                        break;

                    case '4':
                        gMinVectorSize = 4;
                        gMaxVectorSize = 5;
                        break;

                    case '8':
                        gMinVectorSize = 5;
                        gMaxVectorSize = 6;
                        break;

                    default:
                        vlog(" <-- unknown flag: %c (0x%2.2x)\n", *arg, *arg);
                        PrintUsage();
                        return -1;
                }
                arg++;
            }
        }
        else
        {
            // Anything strtol() can start parsing is a test number; the rest
            // are test names.
            char *t = NULL;
            long number = strtol(arg, &t, 0);
            if (t != arg)
            {
                if (gStartTestNumber != -1)
                    gEndTestNumber = gStartTestNumber + (int)number;
                else
                    gStartTestNumber = (int)number;
            }
            else
            {
                argList[argCount] = arg;
                argCount++;
            }
        }
    }

    // Check for the wimpy mode environment variable
    if (getenv("CL_WIMPY_MODE"))
    {
        vlog("\n");
        vlog("*** Detected CL_WIMPY_MODE env                          ***\n");
        gWimpyMode = 1;
    }

    vlog("\n");

    PrintArch();

    if (gWimpyMode)
    {
        vlog("\n");
        vlog("*** WARNING: Testing in Wimpy mode!                     ***\n");
        vlog("*** Wimpy mode is not sufficient to verify correctness. ***\n");
        vlog("*** It gives warm fuzzy feelings and then nevers calls. ***\n\n");
        // %u expects an unsigned argument; the factor is stored as int.
        vlog("*** Wimpy Reduction Factor: %-27u ***\n\n",
             (unsigned int)gWimpyReductionFactor);
    }

    return 0;
}

// Prints the command-line help: the test-name grammar (built from
// gTypeNames and gRoundingModeNames) followed by every option that
// ParseArgs() accepts.
static void PrintUsage(void)
{
    int i;
    vlog("%s [-adlmtwz#]: <optional: test names>\n", appName);
    vlog("\ttest names:\n");
    vlog("\t\tdestFormat<_sat><_round>_sourceFormat\n");
    vlog("\t\t\tPossible format types are:\n\t\t\t\t");
    for (i = 0; i < kTypeCount; i++) vlog("%s, ", gTypeNames[i]);
    vlog("\n\n\t\t\tPossible saturation values are: (empty) and _sat\n");
    vlog("\t\t\tPossible rounding values are:\n\t\t\t\t(empty), ");
    // Entry 0 is the default (empty) rounding mode, already printed above.
    for (i = 1; i < kRoundingModeCount; i++)
        vlog("%s, ", gRoundingModeNames[i]);
    vlog("\n\t\t\tExamples:\n");
    vlog("\t\t\t\tulong_short   converts short to ulong\n");
    vlog("\t\t\t\tchar_sat_rte_float   converts float to char with saturated "
         "clipping in round to nearest rounding mode\n\n");
    vlog("\toptions:\n");
    vlog("\t\t-a\tToggle reporting of average times. (Off by default.)\n");
    vlog("\t\t-d\tToggle testing of double precision.  On by default if "
         "cl_khr_fp64 is enabled, ignored otherwise.\n");
    vlog("\t\t-l\tToggle link check mode. When on, testing is skipped, and we "
         "just check to see that the kernels build. (Off by default.)\n");
    vlog("\t\t-m\tToggle Multithreading. (On by default.)\n");
    vlog("\t\t-t\tToggle reporting of timing results. (On by default on Apple "
         "platforms, off otherwise.)\n");
    vlog("\t\t-w\tToggle wimpy mode. When wimpy mode is on, we run a very "
         "small subset of the tests for each fn. NOT A VALID TEST! (Off by "
         "default.)\n");
    // %u expects an unsigned argument; the factor is stored as int.
    vlog(" \t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is "
         "1-12, default factor(%u)\n",
         (unsigned int)gWimpyReductionFactor);
    vlog("\t\t-z\tToggle flush to zero mode  (Default: per device)\n");
    vlog("\t\t-#\tTest just vector size given by #, where # is an element of "
         "the set {1,2,3,4,8,16}\n");
    vlog("\n");
    vlog(
        "You may also pass the number of the test on which to start.\nA second "
        "number can be then passed to indicate how many tests to run\n\n");
}


// Decodes a test name of the form
//     destFormat<_sat><_round>_sourceFormat
// into its destination type, source type, saturation mode and rounding mode.
//
// Returns 0 on success, or a negative code identifying the part that failed:
//   -1  no known destination type at the start of the name
//   -2  missing '_' separator before the source type
//   -3  no known source type after the separator
//   -4  trailing characters after the source type
// The outputs are written as they are recognised, so on failure only some of
// them may have been updated.
static int GetTestCase(const char *name, Type *outType, Type *inType,
                       SaturationMode *sat, RoundingMode *round)
{
    const char *cursor = name;
    int idx;

    // Destination type: first table entry that is a prefix of the name.
    for (idx = 0; idx < kTypeCount; ++idx)
    {
        const size_t len = strlen(gTypeNames[idx]);
        if (0 == strncmp(cursor, gTypeNames[idx], len))
        {
            *outType = (Type)idx;
            cursor += len;
            break;
        }
    }
    if (kTypeCount == idx) return -1;

    // Optional saturation suffix; entry 0 is the empty (unsaturated) name.
    *sat = (SaturationMode)0;
    for (idx = 1; idx < kSaturationModeCount; ++idx)
    {
        const size_t len = strlen(gSaturationNames[idx]);
        if (0 == strncmp(cursor, gSaturationNames[idx], len))
        {
            *sat = (SaturationMode)idx;
            cursor += len;
            break;
        }
    }

    // Optional rounding suffix; entry 0 is the empty (default) name.
    *round = (RoundingMode)0;
    for (idx = 1; idx < kRoundingModeCount; ++idx)
    {
        const size_t len = strlen(gRoundingModeNames[idx]);
        if (0 == strncmp(cursor, gRoundingModeNames[idx], len))
        {
            *round = (RoundingMode)idx;
            cursor += len;
            break;
        }
    }

    // Separator between the destination part and the source type.
    if ('_' != *cursor) return -2;
    ++cursor;

    // Source type.
    for (idx = 0; idx < kTypeCount; ++idx)
    {
        const size_t len = strlen(gTypeNames[idx]);
        if (0 == strncmp(cursor, gTypeNames[idx], len))
        {
            *inType = (Type)idx;
            cursor += len;
            break;
        }
    }
    if (kTypeCount == idx) return -3;

    // The whole name must have been consumed.
    return ('\0' == *cursor) ? 0 : -4;
}

#pragma mark -
#pragma mark OpenCL

test_status InitCL(cl_device_id device)
{
    int error, i;
    size_t configSize = sizeof(gComputeDevices);

    if ((error = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
                                 configSize, &gComputeDevices, NULL)))
        gComputeDevices = 1;

    configSize = sizeof(gDeviceFrequency);
    if ((error = clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY,
                                 configSize, &gDeviceFrequency, NULL)))
        gDeviceFrequency = 0;

    cl_device_fp_config floatCapabilities = 0;
    if ((error = clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG,
                                 sizeof(floatCapabilities), &floatCapabilities,
                                 NULL)))
        floatCapabilities = 0;
    if (0 == (CL_FP_DENORM & floatCapabilities)) gForceFTZ ^= 1;

    if (0 == (floatCapabilities & CL_FP_ROUND_TO_NEAREST))
    {
        char profileStr[128] = "";
        // Verify that we are an embedded profile device
        if ((error = clGetDeviceInfo(device, CL_DEVICE_PROFILE,
                                     sizeof(profileStr), profileStr, NULL)))
        {
            vlog_error("FAILURE: Could not get device profile: error %d\n",
                       error);
            return TEST_FAIL;
        }

        if (strcmp(profileStr, "EMBEDDED_PROFILE"))
        {
            vlog_error("FAILURE: non-embedded profile device does not support "
                       "CL_FP_ROUND_TO_NEAREST\n");
            return TEST_FAIL;
        }

        if (0 == (floatCapabilities & CL_FP_ROUND_TO_ZERO))
        {
            vlog_error("FAILURE: embedded profile device supports neither "
                       "CL_FP_ROUND_TO_NEAREST or CL_FP_ROUND_TO_ZERO\n");
            return TEST_FAIL;
        }

        gIsRTZ = 1;
    }

    else if (is_extension_available(device, "cl_khr_fp64"))
    {
        gHasDouble = 1;
    }
    gTestDouble &= gHasDouble;

    // detect whether profile of the device is embedded
    char profile[1024] = "";
    if ((error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(profile),
                                 profile, NULL)))
    {
    }
    else if (strstr(profile, "EMBEDDED_PROFILE"))
    {
        gIsEmbedded = 1;
        if (!is_extension_available(device, "cles_khr_int64")) gHasLong = 0;
    }


    gContext = clCreateContext(NULL, 1, &device, notify_callback, NULL, &error);
    if (NULL == gContext || error)
    {
        vlog_error("clCreateContext failed. (%d)\n", error);
        return TEST_FAIL;
    }

    gQueue = clCreateCommandQueue(gContext, device, 0, &error);
    if (NULL == gQueue || error)
    {
        vlog_error("clCreateCommandQueue failed. (%d)\n", error);
        return TEST_FAIL;
    }

    // Allocate buffers
    // FIXME: use clProtectedArray for guarded allocations?
    gIn = malloc(BUFFER_SIZE + 2 * kPageSize);
    gAllowZ = malloc(BUFFER_SIZE + 2 * kPageSize);
    gRef = malloc(BUFFER_SIZE + 2 * kPageSize);
    for (i = 0; i < kCallStyleCount; i++)
    {
        gOut[i] = malloc(BUFFER_SIZE + 2 * kPageSize);
        if (NULL == gOut[i]) return TEST_FAIL;
    }

    // setup input buffers
    gInBuffer =
        clCreateBuffer(gContext, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                       BUFFER_SIZE, NULL, &error);
    if (gInBuffer == NULL || error)
    {
        vlog_error("clCreateBuffer failed for input (%d)\n", error);
        return TEST_FAIL;
    }

    // setup output buffers
    for (i = 0; i < kCallStyleCount; i++)
    {
        gOutBuffers[i] =
            clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                           BUFFER_SIZE, NULL, &error);
        if (gOutBuffers[i] == NULL || error)
        {
            vlog_error("clCreateArray failed for output (%d)\n", error);
            return TEST_FAIL;
        }
    }


    gMTdata = init_genrand(gRandomSeed);


    char c[1024];
    static const char *no_yes[] = { "NO", "YES" };
    vlog("\nCompute Device info:\n");
    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(c), c, NULL);
    vlog("\tDevice Name: %s\n", c);
    clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(c), c, NULL);
    vlog("\tVendor: %s\n", c);
    clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(c), c, NULL);
    vlog("\tDevice Version: %s\n", c);
    clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(c), &c, NULL);
    vlog("\tCL C Version: %s\n", c);
    clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(c), c, NULL);
    vlog("\tDriver Version: %s\n", c);
    vlog("\tProcessing with %ld devices\n", gComputeDevices);
    vlog("\tDevice Frequency: %d MHz\n", gDeviceFrequency);
    vlog("\tSubnormal values supported for floats? %s\n",
         no_yes[0 != (CL_FP_DENORM & floatCapabilities)]);
    vlog("\tTesting with FTZ mode ON for floats? %s\n", no_yes[0 != gForceFTZ]);
    vlog("\tTesting with default RTZ mode for floats? %s\n",
         no_yes[0 != gIsRTZ]);
    vlog("\tHas Double? %s\n", no_yes[0 != gHasDouble]);
    if (gHasDouble) vlog("\tTest Double? %s\n", no_yes[0 != gTestDouble]);
    vlog("\tHas Long? %s\n", no_yes[0 != gHasLong]);
    vlog("\tTesting vector sizes: ");
    for (i = gMinVectorSize; i < gMaxVectorSize; i++)
        vlog("\t%d", vectorSizes[i]);
    vlog("\n");
    return TEST_PASS;
}

// Binds inBuf and outBuf as kernel arguments 0 and 1 and enqueues the kernel
// on gQueue as a one-dimensional NDRange of blockCount work-items (no explicit
// local size, no events). The global size is simply blockCount because
// multiple queues for multiple devices have not been set up.
//
// Returns 0 on success. If setting arguments fails the bitwise OR of the two
// clSetKernelArg() results is returned; if the enqueue fails its error code
// is returned. Both failures are logged.
static int RunKernel(cl_kernel kernel, void *inBuf, void *outBuf,
                     size_t blockCount)
{
    // Both arguments are always set, even if the first call fails.
    const int inputStatus = clSetKernelArg(kernel, 0, sizeof(inBuf), &inBuf);
    const int outputStatus = clSetKernelArg(kernel, 1, sizeof(outBuf), &outBuf);
    const int argStatus = inputStatus | outputStatus;
    if (0 != argStatus)
    {
        vlog_error("FAILED -- could not set kernel args (%d)\n", argStatus);
        return argStatus;
    }

    const int launchStatus = clEnqueueNDRangeKernel(
        gQueue, kernel, 1, NULL, &blockCount, NULL, 0, NULL, NULL);
    if (0 != launchStatus)
    {
        vlog_error("FAILED -- could not execute kernel (%d)\n", launchStatus);
        return launchStatus;
    }

    return 0;
}

#if !defined(__APPLE__)
void memset_pattern4(void *dest, const void *src_pattern, size_t bytes);
#endif

#if defined(__APPLE__)
#include <mach/mach_time.h>
#endif

// Returns a raw timestamp in platform-specific ticks, to be converted with
// SubtractTime(). Apple builds use mach_absolute_time(), MSVC builds use
// ReadTime(); every other platform has no timer wired up and always gets 0.
uint64_t GetTime(void);
uint64_t GetTime(void)
{
    uint64_t ticks = 0;
#if defined(__APPLE__)
    ticks = mach_absolute_time();
#elif defined(_MSC_VER)
    ticks = ReadTime();
#else
    // A high precision timer (better than 1 microsecond, like
    // mach_absolute_time) is still needed here; until then the result is 0.
#warning need accurate clock here.  Times are invalid.
#endif
    return ticks;
}


#if defined(_MSC_VER)
/* function is defined in "compat.h" */
#else
// Converts the interval between two GetTime() timestamps to seconds.
// The ticks-to-seconds factor is looked up lazily and cached; on platforms
// without a known factor it stays 0.0 and the result is always 0.0.
double SubtractTime(uint64_t endTime, uint64_t startTime);
double SubtractTime(uint64_t endTime, uint64_t startTime)
{
    static double secondsPerTick = 0.0;

    if (secondsPerTick == 0.0)
    {
#if defined(__APPLE__)
        mach_timebase_info_data_t timebase = { 0, 0 };
        if (mach_timebase_info(&timebase) == 0)
            secondsPerTick =
                1e-9 * (double)timebase.numer / (double)timebase.denom;
#else
        // This function consumes the output of GetTime() and converts it to
        // seconds; no conversion factor is available on this platform yet.
#warning need accurate ticks to seconds conversion factor here. Times are invalid.
#endif
    }

    // strictly speaking we should also be subtracting out timer latency here
    const uint64_t elapsedTicks = endTime - startTime;
    return secondsPerTick * (double)elapsedTicks;
}
#endif

// Per-vector-size state for one conversion test. One instance exists for each
// call style inside WriteInputBufferInfo::calcInfo.
typedef struct CalcReferenceValuesInfo
{
    struct WriteInputBufferInfo
        *parent; // pointer back to the parent WriteInputBufferInfo struct
    cl_kernel kernel; // the kernel for this vector size
    cl_program program; // the program for this vector size
    cl_uint vectorSize; // the vector size for this callback chain
    void *p; // the pointer to mapped result data for this vector size
    // NOTE(review): presumably the status recorded for this vector size; the
    // code that writes it is not in this portion of the file.
    cl_int result;
} CalcReferenceValuesInfo;

// Shared state for one conversion test across all call styles.
typedef struct WriteInputBufferInfo
{
    volatile cl_event
        calcReferenceValues; // user event which signals when main thread is
                             // done calculating reference values
    volatile cl_event
        doneBarrier; // user event which signals when worker threads are done
    cl_uint count; // the number of elements in the array
    Type outType; // the data type of the conversion result
    Type inType; // the data type of the conversion input
    // NOTE(review): presumably counts workers still outstanding before
    // doneBarrier is signalled -- confirm against the code that updates it.
    volatile int barrierCount;
    // One entry per call style (see kCallStyleCount).
    CalcReferenceValuesInfo calcInfo[kCallStyleCount];
} WriteInputBufferInfo;

// Rounds x up to the nearest power of two. Values that already have at most
// one bit set (including 0) are returned unchanged. The arithmetic is
// unsigned, so an input above the largest representable power of two wraps
// to 0.
cl_uint RoundUpToNextPowerOfTwo(cl_uint x);
cl_uint RoundUpToNextPowerOfTwo(cl_uint x)
{
    const bool atMostOneBitSet = ((x & (x - 1)) == 0);
    if (atMostOneBitSet) return x;

    // Strip the lowest set bit repeatedly until only the highest one is left.
    cl_uint highestBit = x;
    while ((highestBit & (highestBit - 1)) != 0)
    {
        highestBit &= highestBit - 1;
    }

    // The next power of two is twice the highest set bit.
    return highestBit << 1;
}

void WriteInputBufferComplete(void *);

// Job description shared by the InitData() and PrepareReference() thread-pool
// callbacks; each job handles `size` consecutive elements.
typedef struct DataInitInfo
{
    cl_ulong start; // start value passed to the input generator for job 0;
                    // job N uses start + N * size
    cl_uint size; // number of elements processed per job
    Type outType; // the data type of the conversion result
    Type inType; // the data type of the conversion input
    SaturationMode sat; // saturation mode of the conversion under test
    RoundingMode round; // rounding mode of the conversion under test
    MTdata *d; // array of random generator states, indexed by thread id
} DataInitInfo;

// Thread-pool job that fills one slice of the global input buffer gIn.
//
// job_id selects the slice (info->size elements starting at element
// job_id * size), thread_id selects the per-thread random generator and p
// points to the shared DataInitInfo. The generator for the input type is
// looked up in gInitFunctions. Always returns CL_SUCCESS.
cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p);
cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p)
{
    const DataInitInfo *job = (const DataInitInfo *)p;
    const Type srcType = job->inType;

    // First element of this job's slice, and where it lives inside gIn.
    const cl_uint firstElement = job_id * job->size;
    char *sliceStart = (char *)gIn + firstElement * gTypeSizes[srcType];

    gInitFunctions[srcType](sliceStart, job->sat, job->round, job->outType,
                            job->start + firstElement, job->size,
                            job->d[thread_id]);
    return CL_SUCCESS;
}

static void setAllowZ(uint8_t *allow, uint32_t *x, cl_uint count)
{
    cl_uint i;
    for (i = 0; i < count; ++i)
        allow[i] |= (uint8_t)((x[i] & 0x7f800000U) == 0);
}

cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p);
cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)
{
    DataInitInfo *info = (DataInitInfo *)p;
    cl_uint count = info->size;
    Type inType = info->inType;
    Type outType = info->outType;
    RoundingMode round = info->round;
    size_t j;

    Force64BitFPUPrecision();

    void *s = (cl_uchar *)gIn + job_id * count * gTypeSizes[info->inType];
    void *a = (cl_uchar *)gAllowZ + job_id * count;
    void *d = (cl_uchar *)gRef + job_id * count * gTypeSizes[info->outType];

    if (outType != inType)
    {
        // create the reference while we wait
        Convert f = gConversions[outType][inType];
        if (info->sat) f = gSaturatedConversions[outType][inType];

#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
        /* ARM VFP doesn't have a hardware instruction for converting from
         * 64-bit integer to float types, hence GCC ARM uses the floating-point
         * emulation code regardless of the -mfloat-abi setting. But the
         * emulation code in libgcc.a has only one rounding mode (round to
         * nearest even in this case) and ignores the user rounding mode setting
         * in hardware. As a result setting rounding modes in hardware won't
         * give correct rounding results for conversions from 64-bit integer to
         * float using the GCC for ARM compiler, so for testing different
         * rounding modes we need to use an alternative reference function.
         * ARM64 does have an instruction, however we cannot guarantee the
         * compiler will use it. On all ARM architectures use emulation to
         * calculate the reference. */
        switch (round)
        {
            /* conversions to floating-point type use the current rounding mode.
             * The only default floating-point rounding mode supported is round
             * to nearest even i.e the current rounding mode will be _rte for
             * floating-point types. */
            case kDefaultRoundingMode: qcom_rm = qcomRTE; break;
            case kRoundToNearestEven: qcom_rm = qcomRTE; break;
            case kRoundUp: qcom_rm = qcomRTP; break;
            case kRoundDown: qcom_rm = qcomRTN; break;
            case kRoundTowardZero: qcom_rm = qcomRTZ; break;
            default:
                vlog_error("ERROR: undefined rounding mode %d\n", round);
                break;
        }
        qcom_sat = info->sat;
#endif

        RoundingMode oldRound = set_round(round, outType);
        f(d, s, count);
        set_round(oldRound, outType);

        // Decide if we allow a zero result in addition to the correctly rounded
        // one
        memset(a, 0, count);
        if (gForceFTZ)
        {
            if (inType == kfloat) setAllowZ((uint8_t *)a, (uint32_t *)s, count);
            if (outType == kfloat)
                setAllowZ((uint8_t *)a, (uint32_t *)d, count);
        }
    }
    else
    {
        // Copy the input to the reference
        memcpy(d, s, info->size * gTypeSizes[inType]);
    }

    // Patch up NaNs conversions to integer to zero -- these can be converted to
    // any integer
    if (info->outType != kfloat && info->outType != kdouble)
    {
        if (inType == kfloat)
        {
            float *inp = (float *)s;
            for (j = 0; j < count; j++)
            {
                if (isnan(inp[j]))
                    memset((char *)d + j * gTypeSizes[outType], 0,
                           gTypeSizes[outType]);
            }
        }
        if (inType == kdouble)
        {
            double *inp = (double *)s;
            for (j = 0; j < count; j++)
            {
                if (isnan(inp[j]))
                    memset((char *)d + j * gTypeSizes[outType], 0,
                           gTypeSizes[outType]);
            }
        }
    }
    else if (inType == kfloat || inType == kdouble)
    { // outtype and intype is float or double.  NaN conversions for float <->
      // double can be any NaN
        if (inType == kfloat && outType == kdouble)
        {
            float *inp = (float *)s;
            for (j = 0; j < count; j++)
            {
                if (isnan(inp[j])) ((double *)d)[j] = NAN;
            }
        }
        if (inType == kdouble && outType == kfloat)
        {
            double *inp = (double *)s;
            for (j = 0; j < count; j++)
            {
                if (isnan(inp[j])) ((float *)d)[j] = NAN;
            }
        }
    }

    return CL_SUCCESS;
}

// Runs the conversion test for a single (outType, inType, sat, round)
// combination.
//
// One kernel is built per vector-size index in [gMinVectorSize,
// gMaxVectorSize). Unless gSkipTesting is set, the input space is then walked
// in blocks of at most blockCount elements: the input block is generated on
// the thread pool, written to the device, the kernels are enqueued, the
// reference results are computed on the host, and the event callbacks compare
// the two. If gTimeResults is set, each kernel is additionally timed.
//
// Returns 0 on success and a non-zero value on failure: -1 / -2 when the
// program / kernel could not be created or the random number generators could
// not be allocated, otherwise the failing OpenCL error code or the non-zero
// per-vector-size verification result.
static int DoTest(cl_device_id device, Type outType, Type inType,
                  SaturationMode sat, RoundingMode round, MTdata d)
{
#ifdef __APPLE__
    cl_ulong wall_start = mach_absolute_time();
#endif

    DataInitInfo init_info = { 0, 0, outType, inType, sat, round, NULL };
    WriteInputBufferInfo writeInputBufferInfo;
    int vectorSize;
    int error = 0;
    cl_uint threads = GetThreadCount();
    uint64_t i;

    gTestCount++;
    size_t blockCount =
        BUFFER_SIZE / std::max(gTypeSizes[inType], gTypeSizes[outType]);
    size_t step = blockCount;
    // Number of input cases to walk. Input types wider than 32 bits are
    // handled differently: only 2^32 cases are visited. Selecting the value
    // here (instead of shifting first and patching afterwards) avoids the
    // undefined 64-bit shift for 8-byte input types.
    uint64_t lastCase = (8 * gTypeSizes[inType] > 32)
        ? 0x100000000ULL
        : 1ULL << (8 * gTypeSizes[inType]);

    memset(&writeInputBufferInfo, 0, sizeof(writeInputBufferInfo));
    init_info.d = (MTdata *)malloc(threads * sizeof(MTdata));
    if (NULL == init_info.d)
    {
        vlog_error(
            "ERROR: Unable to allocate storage for random number generator!\n");
        return -1;
    }
    for (i = 0; i < threads; i++)
    {
        init_info.d[i] = init_genrand(genrand_int32(d));
        if (NULL == init_info.d[i])
        {
            vlog_error("ERROR: Unable to allocate storage for random number "
                       "generator!\n");
            // Release the generators created so far. The remaining slots were
            // never written, so the shared cleanup path (which frees all
            // `threads` entries) must not be used here.
            while (i > 0) free_mtdata(init_info.d[--i]);
            free(init_info.d);
            return -1;
        }
    }

    writeInputBufferInfo.outType = outType;
    writeInputBufferInfo.inType = inType;

    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
    {
        writeInputBufferInfo.calcInfo[vectorSize].program =
            MakeProgram(outType, inType, sat, round, vectorSize,
                        &writeInputBufferInfo.calcInfo[vectorSize].kernel);
        if (NULL == writeInputBufferInfo.calcInfo[vectorSize].program)
        {
            gFailCount++;
            // Go through the cleanup path so that programs/kernels built for
            // earlier vector sizes and the generators are not leaked.
            error = -1;
            goto exit;
        }
        if (NULL == writeInputBufferInfo.calcInfo[vectorSize].kernel)
        {
            gFailCount++;
            vlog_error("\t\tFAILED -- Failed to create kernel.\n");
            error = -2;
            goto exit;
        }

        writeInputBufferInfo.calcInfo[vectorSize].parent =
            &writeInputBufferInfo;
        writeInputBufferInfo.calcInfo[vectorSize].vectorSize = vectorSize;
        writeInputBufferInfo.calcInfo[vectorSize].result = -1;
    }

    if (gSkipTesting) goto exit;

    // Patch up rounding mode if default is RTZ
    // We leave the part above in default rounding mode so that the right kernel
    // is compiled.
    if (round == kDefaultRoundingMode && gIsRTZ && (outType == kfloat))
        init_info.round = round = kRoundTowardZero;

    // Figure out how far to advance between tested blocks. Embedded and wimpy
    // runs skip over part of the input space.
    if (!gWimpyMode && gIsEmbedded)
        step = blockCount * EMBEDDED_REDUCTION_FACTOR;

    if (gWimpyMode) step = (size_t)blockCount * (size_t)gWimpyReductionFactor;
    vlog("Testing... ");
    fflush(stdout);
    for (i = 0; i < (uint64_t)lastCase; i += step)
    {
        // Progress indicator.
        if (0 == (i & ((lastCase >> 3) - 1)))
        {
            vlog(".");
            fflush(stdout);
        }

        cl_uint count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i);
        writeInputBufferInfo.count = count;

        // Create a user event to represent the status of the reference value
        // computation completion
        writeInputBufferInfo.calcReferenceValues =
            clCreateUserEvent(gContext, &error);
        if (error || NULL == writeInputBufferInfo.calcReferenceValues)
        {
            vlog_error("ERROR: Unable to create user event. (%d)\n", error);
            gFailCount++;
            goto exit;
        }

        // retain for consumption by MapOutputBufferComplete
        for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
             vectorSize++)
        {
            if ((error =
                     clRetainEvent(writeInputBufferInfo.calcReferenceValues)))
            {
                vlog_error("ERROR: Unable to retain user event. (%d)\n", error);
                gFailCount++;
                goto exit;
            }
        }

        // Create a user event to represent when the callbacks are done
        // verifying correctness
        writeInputBufferInfo.doneBarrier = clCreateUserEvent(gContext, &error);
        if (error || NULL == writeInputBufferInfo.doneBarrier)
        {
            vlog_error("ERROR: Unable to create user event for barrier. (%d)\n",
                       error);
            gFailCount++;
            goto exit;
        }

        // retain for use by the callback that calls this
        if ((error = clRetainEvent(writeInputBufferInfo.doneBarrier)))
        {
            vlog_error("ERROR: Unable to retain user event doneBarrier. (%d)\n",
                       error);
            gFailCount++;
            goto exit;
        }

        // Generate the input data on the thread pool. Prefer two chunks per
        // (rounded-up) thread; fall back to fewer chunks when that would make
        // each chunk smaller than 16384 elements.
        cl_uint chunks = RoundUpToNextPowerOfTwo(threads) * 2;
        init_info.start = i;
        init_info.size = count / chunks;
        if (init_info.size < 16384)
        {
            chunks = RoundUpToNextPowerOfTwo(threads);
            init_info.size = count / chunks;
            if (init_info.size < 16384)
            {
                init_info.size = count;
                chunks = 1;
            }
        }
        ThreadPool_Do(InitData, chunks, &init_info);

        // Copy the results to the device
        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
                                          count * gTypeSizes[inType], gIn, 0,
                                          NULL, NULL)))
        {
            vlog_error("ERROR: clEnqueueWriteBuffer failed. (%d)\n", error);
            gFailCount++;
            goto exit;
        }

        // Call completion callback for the write, which will enqueue the rest
        // of the work.
        WriteInputBufferComplete((void *)&writeInputBufferInfo);

        // Make sure the work is actually running, so we don't deadlock
        if ((error = clFlush(gQueue)))
        {
            vlog_error("clFlush failed with error %d\n", error);
            gFailCount++;
            goto exit;
        }

        ThreadPool_Do(PrepareReference, chunks, &init_info);

        // signal we are done calculating the reference results
        if ((error = clSetUserEventStatus(
                 writeInputBufferInfo.calcReferenceValues, CL_COMPLETE)))
        {
            vlog_error(
                "Error:  Failed to set user event status to CL_COMPLETE:  %d\n",
                error);
            gFailCount++;
            goto exit;
        }

        // Wait for the event callbacks to finish verifying correctness.
        if ((error = clWaitForEvents(
                 1, (cl_event *)&writeInputBufferInfo.doneBarrier)))
        {
            vlog_error("Error:  Failed to wait for barrier:  %d\n", error);
            gFailCount++;
            goto exit;
        }

        if ((error = clReleaseEvent(writeInputBufferInfo.calcReferenceValues)))
        {
            vlog_error("Error:  Failed to release calcReferenceValues:  %d\n",
                       error);
            gFailCount++;
            goto exit;
        }

        if ((error = clReleaseEvent(writeInputBufferInfo.doneBarrier)))
        {
            vlog_error("Error:  Failed to release done barrier:  %d\n", error);
            gFailCount++;
            goto exit;
        }


        // Report the first failing vector size, if any. The input element is
        // looked up at index (result - 1), so a non-zero result is presumably
        // a 1-based element index -- confirm against the gCheckResults
        // functions.
        for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
             vectorSize++)
        {
            if ((error = writeInputBufferInfo.calcInfo[vectorSize].result))
            {
                switch (inType)
                {
                    case kuchar:
                    case kchar:
                        vlog("Input value: 0x%2.2x ",
                             ((unsigned char *)gIn)[error - 1]);
                        break;
                    case kushort:
                    case kshort:
                        vlog("Input value: 0x%4.4x ",
                             ((unsigned short *)gIn)[error - 1]);
                        break;
                    case kuint:
                    case kint:
                        vlog("Input value: 0x%8.8x ",
                             ((unsigned int *)gIn)[error - 1]);
                        break;
                    case kfloat:
                        vlog("Input value: %a ", ((float *)gIn)[error - 1]);
                        break;
                    case kulong:
                    case klong:
                        vlog("Input value: 0x%16.16llx ",
                             ((unsigned long long *)gIn)[error - 1]);
                        break;
                    case kdouble:
                        vlog("Input value: %a ", ((double *)gIn)[error - 1]);
                        break;
                    default:
                        vlog_error("Internal error at %s: %d\n", __FILE__,
                                   __LINE__);
                        abort();
                        break;
                }

                // tell the user which conversion it was.
                if (0 == vectorSize)
                    vlog(" (implicit scalar conversion from %s to %s)\n",
                         gTypeNames[inType], gTypeNames[outType]);
                else
                    vlog(" (convert_%s%s%s%s( %s%s ))\n", gTypeNames[outType],
                         sizeNames[vectorSize], gSaturationNames[sat],
                         gRoundingModeNames[round], gTypeNames[inType],
                         sizeNames[vectorSize]);

                gFailCount++;
                goto exit;
            }
        }
    }

    log_info("done.\n");

    if (gTimeResults)
    {
        // Kick off tests for the various vector lengths
        for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
             vectorSize++)
        {
            size_t workItemCount = blockCount / vectorSizes[vectorSize];
            if (vectorSizes[vectorSize] * gTypeSizes[outType] < 4)
                workItemCount /=
                    4 / (vectorSizes[vectorSize] * gTypeSizes[outType]);

            double sum = 0.0;
            double bestTime = INFINITY;
            cl_uint k;
            for (k = 0; k < PERF_LOOP_COUNT; k++)
            {
                uint64_t startTime = GetTime();
                if ((error = RunKernel(
                         writeInputBufferInfo.calcInfo[vectorSize].kernel,
                         gInBuffer, gOutBuffers[vectorSize], workItemCount)))
                {
                    gFailCount++;
                    goto exit;
                }

                // Make sure OpenCL is done
                if ((error = clFinish(gQueue)))
                {
                    vlog_error("Error %d at clFinish\n", error);
                    goto exit;
                }

                uint64_t endTime = GetTime();
                double time = SubtractTime(endTime, startTime);
                sum += time;
                if (time < bestTime) bestTime = time;
            }

            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
            double clocksPerOp = bestTime * (double)gDeviceFrequency
                * gComputeDevices * gSimdSize * 1e6
                / (workItemCount * vectorSizes[vectorSize]);
            if (0 == vectorSize)
                vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element",
                          "implicit convert %s -> %s", gTypeNames[inType],
                          gTypeNames[outType]);
            else
                vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element",
                          "convert_%s%s%s%s( %s%s )", gTypeNames[outType],
                          sizeNames[vectorSize], gSaturationNames[sat],
                          gRoundingModeNames[round], gTypeNames[inType],
                          sizeNames[vectorSize]);
        }
    }

    if (gWimpyMode)
        vlog("\tWimp pass");
    else
        vlog("\tpassed");

#ifdef __APPLE__
    // record the run time
    vlog("\t(%f s)", 1e-9 * (mach_absolute_time() - wall_start));
#endif
    vlog("\n\n");
    fflush(stdout);


exit:
    // clean up
    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
    {
        // Entries that were never built are still NULL from the memset above.
        if (writeInputBufferInfo.calcInfo[vectorSize].program)
            clReleaseProgram(writeInputBufferInfo.calcInfo[vectorSize].program);
        if (writeInputBufferInfo.calcInfo[vectorSize].kernel)
            clReleaseKernel(writeInputBufferInfo.calcInfo[vectorSize].kernel);
    }

    if (init_info.d)
    {
        for (i = 0; i < threads; i++) free_mtdata(init_info.d[i]);
        free(init_info.d);
    }

    return error;
}

void MapResultValuesComplete(void *data);

// Note: not called reentrantly
void WriteInputBufferComplete(void *data)
{
    cl_int status;
    WriteInputBufferInfo *info = (WriteInputBufferInfo *)data;
    cl_uint count = info->count;
    int vectorSize;

    info->barrierCount = gMaxVectorSize - gMinVectorSize;

    // now that we know that the write buffer is complete, enqueue callbacks to
    // wait for the main thread to finish calculating the reference results.
    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
    {
        size_t workItemCount =
            (count + vectorSizes[vectorSize] - 1) / (vectorSizes[vectorSize]);

        if ((status = RunKernel(info->calcInfo[vectorSize].kernel, gInBuffer,
                                gOutBuffers[vectorSize], workItemCount)))
        {
            gFailCount++;
            return;
        }

        info->calcInfo[vectorSize].p = clEnqueueMapBuffer(
            gQueue, gOutBuffers[vectorSize], CL_TRUE,
            CL_MAP_READ | CL_MAP_WRITE, 0, count * gTypeSizes[info->outType], 0,
            NULL, NULL, &status);
        {
            if (status)
            {
                vlog_error("ERROR: WriteInputBufferComplete calback failed "
                           "with status: %d\n",
                           status);
                gFailCount++;
                return;
            }
        }
    }

    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
    {
        MapResultValuesComplete(info->calcInfo + vectorSize);
    }

    // Make sure the work starts moving -- otherwise we may deadlock
    if ((status = clFlush(gQueue)))
    {
        vlog_error(
            "ERROR: WriteInputBufferComplete calback failed with status: %d\n",
            status);
        gFailCount++;
        return;
    }

    // e was already released by the main thread. It should be destroyed
    // automatically soon after we exit.
}

void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
                                             void *data);

// Note: May be called reentrantly
void MapResultValuesComplete(void *data)
{
    cl_int status;
    CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo *)data;
    cl_event calcReferenceValues = info->parent->calcReferenceValues;

    // we know that the map is done, wait for the main thread to finish
    // calculating the reference values
    if ((status = clSetEventCallback(calcReferenceValues, CL_COMPLETE,
                                     CalcReferenceValuesComplete, data)))
    {
        vlog_error("ERROR: clSetEventCallback failed in "
                   "MapResultValuesComplete with status: %d\n",
                   status);
        gFailCount++; // not thread safe -- being lazy here
    }

    // this thread no longer needs its reference to info->calcReferenceValues,
    // so release it
    if ((status = clReleaseEvent(calcReferenceValues)))
    {
        vlog_error("ERROR: clReleaseEvent(info->calcReferenceValues) failed "
                   "with status: %d\n",
                   status);
        gFailCount++; // not thread safe -- being lazy here
    }

    // no need to flush since we didn't enqueue anything

    // e was already released by WriteInputBufferComplete. It should be
    // destroyed automatically soon after we exit.
}


// Zeroes every `elemSize`-byte output element whose input element is NaN.
// NaN converted to an integer type may produce any value, so such results are
// normalised before comparison.
template <typename InT>
static void ZeroMappedResultsForNaNInputs(const InT *in, void *out,
                                          size_t elemSize, size_t n)
{
    for (size_t k = 0; k < n; k++)
    {
        if (isnan(in[k])) memset((char *)out + k * elemSize, 0, elemSize);
    }
}

// Replaces an output NaN that came from an input NaN with the NAN constant.
// float <-> double conversions may yield any NaN, so the bit pattern is
// normalised before comparison.
template <typename InT, typename OutT>
static void CanonicalizeMappedNaNResults(const InT *in, OutT *out, size_t n)
{
    for (size_t k = 0; k < n; k++)
    {
        if (isnan(in[k]) && isnan(out[k])) out[k] = NAN;
    }
}

// Event callback registered on the calcReferenceValues user event. It
// normalises NaN-derived results in the mapped output buffer, compares that
// buffer with gRef (storing the outcome in info->result), overwrites the
// mapped buffer with a junk pattern, unmaps it, and decrements the parent's
// barrierCount. The callback that observes the count at 1 completes and
// releases the doneBarrier event.
//
// `data` must point to a CalcReferenceValuesInfo. A `status` other than
// CL_SUCCESS is logged and counted, and the callback returns immediately.
void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
                                             void *data)
{
    CalcReferenceValuesInfo *calcInfo = (CalcReferenceValuesInfo *)data;
    const cl_uint sizeIndex = calcInfo->vectorSize;
    const cl_uint elementCount = calcInfo->parent->count;
    // the data type of the conversion result
    const Type outType = calcInfo->parent->outType;
    // the data type of the conversion input
    const Type inType = calcInfo->parent->inType;
    cl_event doneBarrier = calcInfo->parent->doneBarrier;
    cl_int err;

    // report spurious error condition
    if (CL_SUCCESS != status)
    {
        vlog_error("ERROR: CalcReferenceValuesComplete did not succeed! (%d)\n",
                   status);
        gFailCount++; // lazy about thread safety here
        return;
    }

    // Now we know that both results have been mapped back from the device, and
    // the main thread is done calculating the reference results. It is now time
    // to check the results.
    void *mapped = calcInfo->p;
    const bool outIsFloatingPoint = (outType == kfloat || outType == kdouble);

    if (!outIsFloatingPoint)
    {
        // Patch up NaNs conversions to integer to zero -- these can be
        // converted to any integer
        if (inType == kfloat)
            ZeroMappedResultsForNaNInputs((const float *)gIn, mapped,
                                          gTypeSizes[outType], elementCount);
        else if (inType == kdouble)
            ZeroMappedResultsForNaNInputs((const double *)gIn, mapped,
                                          gTypeSizes[outType], elementCount);
    }
    else if (inType == kfloat && outType == kdouble)
    {
        // NaN conversions for float <-> double can be any NaN
        CanonicalizeMappedNaNResults((const float *)gIn, (double *)mapped,
                                     elementCount);
    }
    else if (inType == kdouble && outType == kfloat)
    {
        CanonicalizeMappedNaNResults((const double *)gIn, (float *)mapped,
                                     elementCount);
    }

    // verify results: a bitwise match passes outright, anything else goes
    // through the per-type checker.
    if (0 == memcmp(mapped, gRef, elementCount * gTypeSizes[outType]))
        calcInfo->result = 0;
    else
        calcInfo->result = gCheckResults[outType](
            mapped, gRef, gAllowZ, elementCount, vectorSizes[sizeIndex]);

    // Fill the output buffer with junk and release it
    cl_uint pattern = 0xffffdead;
    memset_pattern4(mapped, &pattern, elementCount * gTypeSizes[outType]);
    err = clEnqueueUnmapMemObject(gQueue, gOutBuffers[sizeIndex], mapped, 0,
                                  NULL, NULL);
    if (err)
    {
        vlog_error("ERROR: clEnqueueUnmapMemObject failed in "
                   "CalcReferenceValuesComplete  (%d)\n",
                   err);
        gFailCount++;
    }

    // Only the callback that sees the barrier count at 1 signals completion.
    if (1 != ThreadPool_AtomicAdd(&calcInfo->parent->barrierCount, -1)) return;

    err = clSetUserEventStatus(doneBarrier, CL_COMPLETE);
    if (err)
    {
        vlog_error("ERROR: clSetUserEventStatus failed in "
                   "CalcReferenceValuesComplete (err: %d). We're probably "
                   "going to deadlock.\n",
                   err);
        gFailCount++;
        return;
    }

    err = clReleaseEvent(doneBarrier);
    if (err)
    {
        vlog_error("ERROR: clReleaseEvent failed in "
                   "CalcReferenceValuesComplete (err: %d).\n",
                   err);
        gFailCount++;
        return;
    }
    // e was already released by WriteInputBufferComplete. It should be
    // destroyed automatically soon after all the calls to
    // CalcReferenceValuesComplete exit.
}

// Generates, builds and returns the OpenCL program that converts `inType` to
// `outType` for the vector-size index `vectorSize`.
//
// Index 0 produces an implicit scalar conversion kernel (`dest[i] = src[i]`);
// every other index produces a kernel calling the explicit
// convert_<type><n><sat><round> built-in, where <n> comes from
// vectorSizes[vectorSize]. The 3-element case uses vload3/vstore3 with special
// handling of the last work-item.
//
// On success the built program is returned and the kernel is stored in
// *outKernel. On failure NULL is returned; *outKernel is reset to NULL before
// the build is attempted.
static cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
                              RoundingMode round, int vectorSize,
                              cl_kernel *outKernel)
{
    // Start from NULL so the error path never releases an indeterminate
    // handle if the build helper fails before creating the program.
    cl_program program = NULL;
    char testName[256];
    int error = 0;

    std::ostringstream source;
    if (outType == kdouble || inType == kdouble)
        source << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";

    // Create the program. This is a bit complicated because we are trying to
    // avoid byte and short stores.
    if (0 == vectorSize)
    {
        // Create the type names. snprintf always NUL-terminates, unlike
        // strncpy when the source fills the buffer.
        char inName[32];
        char outName[32];
        snprintf(inName, sizeof(inName), "%s", gTypeNames[inType]);
        snprintf(outName, sizeof(outName), "%s", gTypeNames[outType]);
        snprintf(testName, sizeof(testName), "test_implicit_%s_%s", outName,
                 inName);

        source << "__kernel void " << testName << "( __global " << inName
               << " *src, __global " << outName << " *dest )\n";
        source << "{\n";
        source << "   size_t i = get_global_id(0);\n";
        source << "   dest[i] =  src[i];\n";
        source << "}\n";

        vlog("Building implicit %s -> %s conversion test\n", gTypeNames[inType],
             gTypeNames[outType]);
        fflush(stdout);
    }
    else
    {
        int vectorSizetmp = vectorSizes[vectorSize];

        // Create the type names.
        char convertString[128];
        char inName[32];
        char outName[32];
        switch (vectorSizetmp)
        {
            case 1:
                snprintf(inName, sizeof(inName), "%s", gTypeNames[inType]);
                snprintf(outName, sizeof(outName), "%s", gTypeNames[outType]);
                snprintf(convertString, sizeof(convertString), "convert_%s%s%s",
                         outName, gSaturationNames[sat],
                         gRoundingModeNames[round]);
                snprintf(testName, sizeof(testName), "test_%s_%s",
                         convertString, inName);
                vlog("Building %s( %s ) test\n", convertString, inName);
                break;
            case 3:
                // The 3-element kernel takes scalar pointers, so the names
                // stay scalar and the "3" is added where needed.
                snprintf(inName, sizeof(inName), "%s", gTypeNames[inType]);
                snprintf(outName, sizeof(outName), "%s", gTypeNames[outType]);
                snprintf(convertString, sizeof(convertString),
                         "convert_%s3%s%s", outName, gSaturationNames[sat],
                         gRoundingModeNames[round]);
                snprintf(testName, sizeof(testName), "test_%s_%s3",
                         convertString, inName);
                vlog("Building %s( %s3 ) test\n", convertString, inName);
                break;
            default:
                snprintf(inName, sizeof(inName), "%s%d", gTypeNames[inType],
                         vectorSizetmp);
                snprintf(outName, sizeof(outName), "%s%d", gTypeNames[outType],
                         vectorSizetmp);
                snprintf(convertString, sizeof(convertString), "convert_%s%s%s",
                         outName, gSaturationNames[sat],
                         gRoundingModeNames[round]);
                snprintf(testName, sizeof(testName), "test_%s_%s",
                         convertString, inName);
                vlog("Building %s( %s ) test\n", convertString, inName);
                break;
        }
        fflush(stdout);

        if (vectorSizetmp == 3)
        {
            source << "__kernel void " << testName << "( __global " << inName
                   << " *src, __global " << outName << " *dest )\n";
            source << "{\n";
            source << "   size_t i = get_global_id(0);\n";
            source << "   if( i + 1 < get_global_size(0))\n";
            source << "       vstore3( " << convertString
                   << "( vload3( i, src)), i, dest );\n";
            source << "   else\n";
            source << "   {\n";
            source << "       " << inName << "3 in;\n";
            source << "       " << outName << "3 out;\n";
            source << "       if( 0 == (i & 1) )\n";
            source << "           in.y = src[3*i+1];\n";
            source << "       in.x = src[3*i];\n";
            source << "       out = " << convertString << "( in ); \n";
            source << "       dest[3*i] = out.x;\n";
            source << "       if( 0 == (i & 1) )\n";
            source << "           dest[3*i+1] = out.y;\n";
            source << "   }\n";
            source << "}\n";
        }
        else
        {
            source << "__kernel void " << testName << "( __global " << inName
                   << " *src, __global " << outName << " *dest )\n";
            source << "{\n";
            source << "   size_t i = get_global_id(0);\n";
            source << "   dest[i] = " << convertString << "( src[i] );\n";
            source << "}\n";
        }
    }
    *outKernel = NULL;

    const char *flags = NULL;
    if (gForceFTZ) flags = "-cl-denorms-are-zero";

    // build it
    std::string sourceString = source.str();
    const char *programSource = sourceString.c_str();
    error = create_single_kernel_helper(gContext, &program, outKernel, 1,
                                        &programSource, testName, flags);
    if (error)
    {
        vlog_error("Failed to build kernel/program (err = %d).\n", error);
        if (program) clReleaseProgram(program);
        return NULL;
    }

    return program;
}