mirror of
https://github.com/KhronosGroup/OpenCL-CTS.git
synced 2026-03-19 06:09:01 +00:00
* grab latest from upstream OpenCL * Removed events for host to device data transfers * grab latest from upstream OpenCL * 1.) revert changes to CMakeLists.txt and run_batch script in hopes this will solve the CI issues for the PR. 2.) resolve the merge conflict in test_conversions.cpp * 1.) resolve 2 additional merge conflicts * 1.) resolve 1 additional merge conflicts * locally this fails clang-format but CI version seems to require it. * fix the warning. * Remove now-unused event Also, cut down a comment that is no longer accurate. Co-authored-by: Joshua Luceno <joshualuceno@gmail.com> Co-authored-by: Chip Davis <chip@holochip.com> Co-authored-by: Chip Davis <cdavis5x@gmail.com>
1754 lines
58 KiB
C++
1754 lines
58 KiB
C++
//
|
|
// Copyright (c) 2017 The Khronos Group Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
#include "harness/compat.h"
|
|
#include "harness/rounding_mode.h"
|
|
#include "harness/ThreadPool.h"
|
|
#include "harness/testHarness.h"
|
|
#include "harness/kernelHelpers.h"
|
|
#include "harness/parseParameters.h"
|
|
#if defined(__APPLE__)
|
|
#include <sys/sysctl.h>
|
|
#endif
|
|
|
|
#if defined(__linux__)
|
|
#include <unistd.h>
|
|
#include <sys/syscall.h>
|
|
#include <linux/sysctl.h>
|
|
#endif
|
|
#if defined(__linux__)
|
|
#include <sys/param.h>
|
|
#include <libgen.h>
|
|
#endif
|
|
|
|
#include "mingw_compat.h"
|
|
#if defined(__MINGW32__)
|
|
#include <sys/param.h>
|
|
#endif
|
|
|
|
#include <sstream>
|
|
#include <stdarg.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#if !defined(_WIN32)
|
|
#include <libgen.h>
|
|
#include <sys/mman.h>
|
|
#endif
|
|
#include <time.h>
|
|
|
|
#include <algorithm>
|
|
|
|
#include "Sleep.h"
|
|
#include "basic_test_conversions.h"
|
|
|
|
#if (defined(_WIN32) && defined(_MSC_VER))
|
|
// need for _controlfp_s and rouinding modes in RoundingMode
|
|
#include "harness/testHarness.h"
|
|
#endif
|
|
|
|
#pragma mark -
|
|
#pragma mark globals
|
|
|
|
#define BUFFER_SIZE (1024 * 1024)
|
|
#define kPageSize 4096
|
|
#define EMBEDDED_REDUCTION_FACTOR 16
|
|
#define PERF_LOOP_COUNT 100
|
|
|
|
#define kCallStyleCount (kVectorSizeCount + 1 /* for implicit scalar */)
|
|
|
|
#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
|
|
#include "fplib.h"
|
|
extern bool qcom_sat;
|
|
extern roundingMode qcom_rm;
|
|
#endif
|
|
|
|
const char **argList = NULL;
|
|
int argCount = 0;
|
|
cl_context gContext = NULL;
|
|
cl_command_queue gQueue = NULL;
|
|
char appName[64] = "ctest";
|
|
int gStartTestNumber = -1;
|
|
int gEndTestNumber = 0;
|
|
#if defined(__APPLE__)
|
|
int gTimeResults = 1;
|
|
#else
|
|
int gTimeResults = 0;
|
|
#endif
|
|
int gReportAverageTimes = 0;
|
|
void *gIn = NULL;
|
|
void *gRef = NULL;
|
|
void *gAllowZ = NULL;
|
|
void *gOut[kCallStyleCount] = { NULL };
|
|
cl_mem gInBuffer;
|
|
cl_mem gOutBuffers[kCallStyleCount];
|
|
size_t gComputeDevices = 0;
|
|
uint32_t gDeviceFrequency = 0;
|
|
int gWimpyMode = 0;
|
|
int gWimpyReductionFactor = 128;
|
|
int gSkipTesting = 0;
|
|
int gForceFTZ = 0;
|
|
int gMultithread = 1;
|
|
int gIsRTZ = 0;
|
|
uint32_t gSimdSize = 1;
|
|
int gHasDouble = 0;
|
|
int gTestDouble = 1;
|
|
const char *sizeNames[] = { "", "", "2", "3", "4", "8", "16" };
|
|
const int vectorSizes[] = { 1, 1, 2, 3, 4, 8, 16 };
|
|
int gMinVectorSize = 0;
|
|
int gMaxVectorSize = sizeof(vectorSizes) / sizeof(vectorSizes[0]);
|
|
static MTdata gMTdata;
|
|
|
|
#pragma mark -
|
|
#pragma mark Declarations
|
|
|
|
static int ParseArgs(int argc, const char **argv);
|
|
static void PrintUsage(void);
|
|
test_status InitCL(cl_device_id device);
|
|
static int GetTestCase(const char *name, Type *outType, Type *inType,
|
|
SaturationMode *sat, RoundingMode *round);
|
|
static int DoTest(cl_device_id device, Type outType, Type inType,
|
|
SaturationMode sat, RoundingMode round, MTdata d);
|
|
static cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
|
|
RoundingMode round, int vectorSize,
|
|
cl_kernel *outKernel);
|
|
static int RunKernel(cl_kernel kernel, void *inBuf, void *outBuf,
|
|
size_t blockCount);
|
|
|
|
void *FlushToZero(void);
|
|
void UnFlushToZero(void *);
|
|
|
|
// Windows (since long double got deprecated) sets the x87 to 53-bit precision
|
|
// (that's x87 default state). This causes problems with the tests that
|
|
// convert long and ulong to float and double or otherwise deal with values
|
|
// that need more precision than 53-bit. So, set the x87 to 64-bit precision.
|
|
static inline void Force64BitFPUPrecision(void)
|
|
{
|
|
#if __MINGW32__
|
|
// The usual method is to use _controlfp as follows:
|
|
// #include <float.h>
|
|
// _controlfp(_PC_64, _MCW_PC);
|
|
//
|
|
// _controlfp is available on MinGW32 but not on MinGW64. Instead of having
|
|
// divergent code just use inline assembly which works for both.
|
|
unsigned short int orig_cw = 0;
|
|
unsigned short int new_cw = 0;
|
|
__asm__ __volatile__("fstcw %0" : "=m"(orig_cw));
|
|
new_cw = orig_cw | 0x0300; // set precision to 64-bit
|
|
__asm__ __volatile__("fldcw %0" ::"m"(new_cw));
|
|
#else
|
|
/* Implement for other platforms if needed */
|
|
#endif
|
|
}
|
|
|
|
int test_conversions(cl_device_id device, cl_context context,
|
|
cl_command_queue queue, int num_elements)
|
|
{
|
|
int error, i, testNumber = -1;
|
|
int startMinVectorSize = gMinVectorSize;
|
|
Type inType, outType;
|
|
RoundingMode round;
|
|
SaturationMode sat;
|
|
|
|
if (argCount)
|
|
{
|
|
for (i = 0; i < argCount; i++)
|
|
{
|
|
if (GetTestCase(argList[i], &outType, &inType, &sat, &round))
|
|
{
|
|
vlog_error("\n\t\t**** ERROR: Unable to parse function name "
|
|
"%s. Skipping.... *****\n\n",
|
|
argList[i]);
|
|
continue;
|
|
}
|
|
|
|
// skip double if we don't have it
|
|
if (!gTestDouble && (inType == kdouble || outType == kdouble))
|
|
{
|
|
if (gHasDouble)
|
|
{
|
|
vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
|
|
gTypeNames[outType], gSaturationNames[sat],
|
|
gRoundingModeNames[round], gTypeNames[inType]);
|
|
vlog("\t\tcl_khr_fp64 enabled, but double testing turned "
|
|
"off.\n");
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
// skip longs on embedded
|
|
if (!gHasLong
|
|
&& (inType == klong || outType == klong || inType == kulong
|
|
|| outType == kulong))
|
|
{
|
|
continue;
|
|
}
|
|
|
|
// Skip the implicit converts if the rounding mode is not default or
|
|
// test is saturated
|
|
if (0 == startMinVectorSize)
|
|
{
|
|
if (sat || round != kDefaultRoundingMode)
|
|
gMinVectorSize = 1;
|
|
else
|
|
gMinVectorSize = 0;
|
|
}
|
|
|
|
if ((error = DoTest(device, outType, inType, sat, round, gMTdata)))
|
|
{
|
|
vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
|
|
gTypeNames[outType], gSaturationNames[sat],
|
|
gRoundingModeNames[round], gTypeNames[inType]);
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for (outType = (Type)0; outType < kTypeCount;
|
|
outType = (Type)(outType + 1))
|
|
{
|
|
for (inType = (Type)0; inType < kTypeCount;
|
|
inType = (Type)(inType + 1))
|
|
{
|
|
// skip longs on embedded
|
|
if (!gHasLong
|
|
&& (inType == klong || outType == klong || inType == kulong
|
|
|| outType == kulong))
|
|
{
|
|
continue;
|
|
}
|
|
|
|
for (sat = (SaturationMode)0; sat < kSaturationModeCount;
|
|
sat = (SaturationMode)(sat + 1))
|
|
{
|
|
// skip illegal saturated conversions to float type
|
|
if (kSaturated == sat
|
|
&& (outType == kfloat || outType == kdouble))
|
|
{
|
|
continue;
|
|
}
|
|
|
|
for (round = (RoundingMode)0; round < kRoundingModeCount;
|
|
round = (RoundingMode)(round + 1))
|
|
{
|
|
if (++testNumber < gStartTestNumber)
|
|
{
|
|
// vlog( "%d) skipping convert_%sn%s%s( %sn
|
|
// )\n", testNumber, gTypeNames[ outType ],
|
|
// gSaturationNames[ sat ],
|
|
// gRoundingModeNames[round], gTypeNames[inType]
|
|
// );
|
|
continue;
|
|
}
|
|
else
|
|
{
|
|
if (gEndTestNumber > 0
|
|
&& testNumber >= gEndTestNumber)
|
|
{
|
|
goto exit;
|
|
}
|
|
}
|
|
|
|
vlog("%d) Testing convert_%sn%s%s( %sn ):\n",
|
|
testNumber, gTypeNames[outType],
|
|
gSaturationNames[sat], gRoundingModeNames[round],
|
|
gTypeNames[inType]);
|
|
|
|
// skip double if we don't have it
|
|
if (!gTestDouble
|
|
&& (inType == kdouble || outType == kdouble))
|
|
{
|
|
if (gHasDouble)
|
|
{
|
|
vlog_error("\t *** %d) convert_%sn%s%s( %sn ) "
|
|
"FAILED ** \n",
|
|
testNumber, gTypeNames[outType],
|
|
gSaturationNames[sat],
|
|
gRoundingModeNames[round],
|
|
gTypeNames[inType]);
|
|
vlog("\t\tcl_khr_fp64 enabled, but double "
|
|
"testing turned off.\n");
|
|
}
|
|
continue;
|
|
}
|
|
|
|
// Skip the implicit converts if the rounding mode is
|
|
// not default or test is saturated
|
|
if (0 == startMinVectorSize)
|
|
{
|
|
if (sat || round != kDefaultRoundingMode)
|
|
gMinVectorSize = 1;
|
|
else
|
|
gMinVectorSize = 0;
|
|
}
|
|
|
|
if ((error = DoTest(device, outType, inType, sat, round,
|
|
gMTdata)))
|
|
{
|
|
vlog_error("\t *** %d) convert_%sn%s%s( %sn ) "
|
|
"FAILED ** \n",
|
|
testNumber, gTypeNames[outType],
|
|
gSaturationNames[sat],
|
|
gRoundingModeNames[round],
|
|
gTypeNames[inType]);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
exit:
|
|
return gFailCount;
|
|
}
|
|
|
|
test_definition test_list[] = {
|
|
ADD_TEST(conversions),
|
|
};
|
|
|
|
const int test_num = ARRAY_SIZE(test_list);
|
|
|
|
#pragma mark -
|
|
|
|
int main(int argc, const char **argv)
|
|
{
|
|
int error;
|
|
cl_uint seed = (cl_uint)time(NULL);
|
|
|
|
argc = parseCustomParam(argc, argv);
|
|
if (argc == -1)
|
|
{
|
|
return 1;
|
|
}
|
|
|
|
if ((error = ParseArgs(argc, argv))) return error;
|
|
|
|
// Turn off sleep so our tests run to completion
|
|
PreventSleep();
|
|
atexit(ResumeSleep);
|
|
|
|
if (!gMultithread) SetThreadCount(1);
|
|
|
|
#if defined(_MSC_VER) && defined(_M_IX86)
|
|
// VS2005 (and probably others, since long double got deprecated) sets
|
|
// the x87 to 53-bit precision. This causes problems with the tests
|
|
// that convert long and ulong to float and double, since they deal
|
|
// with values that need more precision than that. So, set the x87
|
|
// to 64-bit precision.
|
|
unsigned int ignored;
|
|
_controlfp_s(&ignored, _PC_64, _MCW_PC);
|
|
#endif
|
|
|
|
vlog("===========================================================\n");
|
|
vlog("Random seed: %u\n", seed);
|
|
gMTdata = init_genrand(seed);
|
|
|
|
const char *arg[] = { argv[0] };
|
|
int ret =
|
|
runTestHarnessWithCheck(1, arg, test_num, test_list, true, 0, InitCL);
|
|
|
|
free_mtdata(gMTdata);
|
|
if (gQueue)
|
|
{
|
|
error = clFinish(gQueue);
|
|
if (error) vlog_error("clFinish failed: %d\n", error);
|
|
}
|
|
|
|
clReleaseMemObject(gInBuffer);
|
|
|
|
for (int i = 0; i < kCallStyleCount; i++)
|
|
{
|
|
clReleaseMemObject(gOutBuffers[i]);
|
|
}
|
|
clReleaseCommandQueue(gQueue);
|
|
clReleaseContext(gContext);
|
|
|
|
return ret;
|
|
}
|
|
|
|
#pragma mark -
|
|
#pragma mark setup
|
|
|
|
static int ParseArgs(int argc, const char **argv)
|
|
{
|
|
int i;
|
|
argList = (const char **)calloc(argc, sizeof(char *));
|
|
argCount = 0;
|
|
|
|
if (NULL == argList && argc > 1) return -1;
|
|
|
|
#if (defined(__APPLE__) || defined(__linux__) || defined(__MINGW32__))
|
|
{ // Extract the app name
|
|
char baseName[MAXPATHLEN];
|
|
strncpy(baseName, argv[0], MAXPATHLEN);
|
|
char *base = basename(baseName);
|
|
if (NULL != base)
|
|
{
|
|
strncpy(appName, base, sizeof(appName));
|
|
appName[sizeof(appName) - 1] = '\0';
|
|
}
|
|
}
|
|
#elif defined(_WIN32)
|
|
{
|
|
char fname[_MAX_FNAME + _MAX_EXT + 1];
|
|
char ext[_MAX_EXT];
|
|
|
|
errno_t err = _splitpath_s(argv[0], NULL, 0, NULL, 0, fname, _MAX_FNAME,
|
|
ext, _MAX_EXT);
|
|
if (err == 0)
|
|
{ // no error
|
|
strcat(fname, ext); // just cat them, size of frame can keep both
|
|
strncpy(appName, fname, sizeof(appName));
|
|
appName[sizeof(appName) - 1] = '\0';
|
|
}
|
|
}
|
|
#endif
|
|
|
|
vlog("\n%s", appName);
|
|
for (i = 1; i < argc; i++)
|
|
{
|
|
const char *arg = argv[i];
|
|
if (NULL == arg) break;
|
|
|
|
vlog("\t%s", arg);
|
|
if (arg[0] == '-')
|
|
{
|
|
arg++;
|
|
while (*arg != '\0')
|
|
{
|
|
switch (*arg)
|
|
{
|
|
case 'd': gTestDouble ^= 1; break;
|
|
case 'l': gSkipTesting ^= 1; break;
|
|
case 'm': gMultithread ^= 1; break;
|
|
case 'w': gWimpyMode ^= 1; break;
|
|
case '[':
|
|
parseWimpyReductionFactor(arg, gWimpyReductionFactor);
|
|
break;
|
|
case 'z': gForceFTZ ^= 1; break;
|
|
case 't': gTimeResults ^= 1; break;
|
|
case 'a': gReportAverageTimes ^= 1; break;
|
|
case '1':
|
|
if (arg[1] == '6')
|
|
{
|
|
gMinVectorSize = 6;
|
|
gMaxVectorSize = 7;
|
|
arg++;
|
|
}
|
|
else
|
|
{
|
|
gMinVectorSize = 0;
|
|
gMaxVectorSize = 2;
|
|
}
|
|
break;
|
|
|
|
case '2':
|
|
gMinVectorSize = 2;
|
|
gMaxVectorSize = 3;
|
|
break;
|
|
|
|
case '3':
|
|
gMinVectorSize = 3;
|
|
gMaxVectorSize = 4;
|
|
break;
|
|
|
|
case '4':
|
|
gMinVectorSize = 4;
|
|
gMaxVectorSize = 5;
|
|
break;
|
|
|
|
case '8':
|
|
gMinVectorSize = 5;
|
|
gMaxVectorSize = 6;
|
|
break;
|
|
|
|
default:
|
|
vlog(" <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg);
|
|
PrintUsage();
|
|
return -1;
|
|
}
|
|
arg++;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
char *t = NULL;
|
|
long number = strtol(arg, &t, 0);
|
|
if (t != arg)
|
|
{
|
|
if (gStartTestNumber != -1)
|
|
gEndTestNumber = gStartTestNumber + (int)number;
|
|
else
|
|
gStartTestNumber = (int)number;
|
|
}
|
|
else
|
|
{
|
|
argList[argCount] = arg;
|
|
argCount++;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check for the wimpy mode environment variable
|
|
if (getenv("CL_WIMPY_MODE"))
|
|
{
|
|
vlog("\n");
|
|
vlog("*** Detected CL_WIMPY_MODE env ***\n");
|
|
gWimpyMode = 1;
|
|
}
|
|
|
|
vlog( "\n" );
|
|
|
|
PrintArch();
|
|
|
|
if (gWimpyMode)
|
|
{
|
|
vlog("\n");
|
|
vlog("*** WARNING: Testing in Wimpy mode! ***\n");
|
|
vlog("*** Wimpy mode is not sufficient to verify correctness. ***\n");
|
|
vlog("*** It gives warm fuzzy feelings and then nevers calls. ***\n\n");
|
|
vlog("*** Wimpy Reduction Factor: %-27u ***\n\n",
|
|
gWimpyReductionFactor);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void PrintUsage(void)
|
|
{
|
|
int i;
|
|
vlog("%s [-wz#]: <optional: test names>\n", appName);
|
|
vlog("\ttest names:\n");
|
|
vlog("\t\tdestFormat<_sat><_round>_sourceFormat\n");
|
|
vlog("\t\t\tPossible format types are:\n\t\t\t\t");
|
|
for (i = 0; i < kTypeCount; i++) vlog("%s, ", gTypeNames[i]);
|
|
vlog("\n\n\t\t\tPossible saturation values are: (empty) and _sat\n");
|
|
vlog("\t\t\tPossible rounding values are:\n\t\t\t\t(empty), ");
|
|
for (i = 1; i < kRoundingModeCount; i++)
|
|
vlog("%s, ", gRoundingModeNames[i]);
|
|
vlog("\n\t\t\tExamples:\n");
|
|
vlog("\t\t\t\tulong_short converts short to ulong\n");
|
|
vlog("\t\t\t\tchar_sat_rte_float converts float to char with saturated "
|
|
"clipping in round to nearest rounding mode\n\n");
|
|
vlog("\toptions:\n");
|
|
vlog("\t\t-d\tToggle testing of double precision. On by default if "
|
|
"cl_khr_fp64 is enabled, ignored otherwise.\n");
|
|
vlog("\t\t-l\tToggle link check mode. When on, testing is skipped, and we "
|
|
"just check to see that the kernels build. (Off by default.)\n");
|
|
vlog("\t\t-m\tToggle Multithreading. (On by default.)\n");
|
|
vlog("\t\t-w\tToggle wimpy mode. When wimpy mode is on, we run a very "
|
|
"small subset of the tests for each fn. NOT A VALID TEST! (Off by "
|
|
"default.)\n");
|
|
vlog(" \t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is "
|
|
"1-12, default factor(%u)\n",
|
|
gWimpyReductionFactor);
|
|
vlog("\t\t-z\tToggle flush to zero mode (Default: per device)\n");
|
|
vlog("\t\t-#\tTest just vector size given by #, where # is an element of "
|
|
"the set {1,2,3,4,8,16}\n");
|
|
vlog("\n");
|
|
vlog(
|
|
"You may also pass the number of the test on which to start.\nA second "
|
|
"number can be then passed to indicate how many tests to run\n\n");
|
|
}
|
|
|
|
|
|
static int GetTestCase(const char *name, Type *outType, Type *inType,
|
|
SaturationMode *sat, RoundingMode *round)
|
|
{
|
|
int i;
|
|
|
|
// Find the return type
|
|
for (i = 0; i < kTypeCount; i++)
|
|
if (name == strstr(name, gTypeNames[i]))
|
|
{
|
|
*outType = (Type)i;
|
|
name += strlen(gTypeNames[i]);
|
|
|
|
break;
|
|
}
|
|
|
|
if (i == kTypeCount) return -1;
|
|
|
|
// Check to see if _sat appears next
|
|
*sat = (SaturationMode)0;
|
|
for (i = 1; i < kSaturationModeCount; i++)
|
|
if (name == strstr(name, gSaturationNames[i]))
|
|
{
|
|
*sat = (SaturationMode)i;
|
|
name += strlen(gSaturationNames[i]);
|
|
break;
|
|
}
|
|
|
|
*round = (RoundingMode)0;
|
|
for (i = 1; i < kRoundingModeCount; i++)
|
|
if (name == strstr(name, gRoundingModeNames[i]))
|
|
{
|
|
*round = (RoundingMode)i;
|
|
name += strlen(gRoundingModeNames[i]);
|
|
break;
|
|
}
|
|
|
|
if (*name != '_') return -2;
|
|
name++;
|
|
|
|
for (i = 0; i < kTypeCount; i++)
|
|
if (name == strstr(name, gTypeNames[i]))
|
|
{
|
|
*inType = (Type)i;
|
|
name += strlen(gTypeNames[i]);
|
|
|
|
break;
|
|
}
|
|
|
|
if (i == kTypeCount) return -3;
|
|
|
|
if (*name != '\0') return -4;
|
|
|
|
return 0;
|
|
}
|
|
|
|
#pragma mark -
|
|
#pragma mark OpenCL
|
|
|
|
test_status InitCL(cl_device_id device)
|
|
{
|
|
int error, i;
|
|
size_t configSize = sizeof(gComputeDevices);
|
|
|
|
if ((error = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
|
|
configSize, &gComputeDevices, NULL)))
|
|
gComputeDevices = 1;
|
|
|
|
configSize = sizeof(gDeviceFrequency);
|
|
if ((error = clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY,
|
|
configSize, &gDeviceFrequency, NULL)))
|
|
gDeviceFrequency = 0;
|
|
|
|
cl_device_fp_config floatCapabilities = 0;
|
|
if ((error = clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG,
|
|
sizeof(floatCapabilities), &floatCapabilities,
|
|
NULL)))
|
|
floatCapabilities = 0;
|
|
if (0 == (CL_FP_DENORM & floatCapabilities)) gForceFTZ ^= 1;
|
|
|
|
if (0 == (floatCapabilities & CL_FP_ROUND_TO_NEAREST))
|
|
{
|
|
char profileStr[128] = "";
|
|
// Verify that we are an embedded profile device
|
|
if ((error = clGetDeviceInfo(device, CL_DEVICE_PROFILE,
|
|
sizeof(profileStr), profileStr, NULL)))
|
|
{
|
|
vlog_error("FAILURE: Could not get device profile: error %d\n",
|
|
error);
|
|
return TEST_FAIL;
|
|
}
|
|
|
|
if (strcmp(profileStr, "EMBEDDED_PROFILE"))
|
|
{
|
|
vlog_error("FAILURE: non-embedded profile device does not support "
|
|
"CL_FP_ROUND_TO_NEAREST\n");
|
|
return TEST_FAIL;
|
|
}
|
|
|
|
if (0 == (floatCapabilities & CL_FP_ROUND_TO_ZERO))
|
|
{
|
|
vlog_error("FAILURE: embedded profile device supports neither "
|
|
"CL_FP_ROUND_TO_NEAREST or CL_FP_ROUND_TO_ZERO\n");
|
|
return TEST_FAIL;
|
|
}
|
|
|
|
gIsRTZ = 1;
|
|
}
|
|
|
|
else if (is_extension_available(device, "cl_khr_fp64"))
|
|
{
|
|
gHasDouble = 1;
|
|
}
|
|
gTestDouble &= gHasDouble;
|
|
|
|
// detect whether profile of the device is embedded
|
|
char profile[1024] = "";
|
|
if ((error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(profile),
|
|
profile, NULL)))
|
|
{
|
|
}
|
|
else if (strstr(profile, "EMBEDDED_PROFILE"))
|
|
{
|
|
gIsEmbedded = 1;
|
|
if (!is_extension_available(device, "cles_khr_int64")) gHasLong = 0;
|
|
}
|
|
|
|
|
|
gContext = clCreateContext(NULL, 1, &device, notify_callback, NULL, &error);
|
|
if (NULL == gContext || error)
|
|
{
|
|
vlog_error("clCreateContext failed. (%d)\n", error);
|
|
return TEST_FAIL;
|
|
}
|
|
|
|
gQueue = clCreateCommandQueue(gContext, device, 0, &error);
|
|
if (NULL == gQueue || error)
|
|
{
|
|
vlog_error("clCreateCommandQueue failed. (%d)\n", error);
|
|
return TEST_FAIL;
|
|
}
|
|
|
|
// Allocate buffers
|
|
// FIXME: use clProtectedArray for guarded allocations?
|
|
gIn = malloc(BUFFER_SIZE + 2 * kPageSize);
|
|
gAllowZ = malloc(BUFFER_SIZE + 2 * kPageSize);
|
|
gRef = malloc(BUFFER_SIZE + 2 * kPageSize);
|
|
for (i = 0; i < kCallStyleCount; i++)
|
|
{
|
|
gOut[i] = malloc(BUFFER_SIZE + 2 * kPageSize);
|
|
if (NULL == gOut[i]) return TEST_FAIL;
|
|
}
|
|
|
|
// setup input buffers
|
|
gInBuffer =
|
|
clCreateBuffer(gContext, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
|
|
BUFFER_SIZE, NULL, &error);
|
|
if (gInBuffer == NULL || error)
|
|
{
|
|
vlog_error("clCreateBuffer failed for input (%d)\n", error);
|
|
return TEST_FAIL;
|
|
}
|
|
|
|
// setup output buffers
|
|
for (i = 0; i < kCallStyleCount; i++)
|
|
{
|
|
gOutBuffers[i] =
|
|
clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
|
|
BUFFER_SIZE, NULL, &error);
|
|
if (gOutBuffers[i] == NULL || error)
|
|
{
|
|
vlog_error("clCreateArray failed for output (%d)\n", error);
|
|
return TEST_FAIL;
|
|
}
|
|
}
|
|
|
|
|
|
gMTdata = init_genrand(gRandomSeed);
|
|
|
|
|
|
char c[1024];
|
|
static const char *no_yes[] = { "NO", "YES" };
|
|
vlog("\nCompute Device info:\n");
|
|
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(c), c, NULL);
|
|
vlog("\tDevice Name: %s\n", c);
|
|
clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(c), c, NULL);
|
|
vlog("\tVendor: %s\n", c);
|
|
clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(c), c, NULL);
|
|
vlog("\tDevice Version: %s\n", c);
|
|
clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(c), &c, NULL);
|
|
vlog("\tCL C Version: %s\n", c);
|
|
clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(c), c, NULL);
|
|
vlog("\tDriver Version: %s\n", c);
|
|
vlog("\tProcessing with %ld devices\n", gComputeDevices);
|
|
vlog("\tDevice Frequency: %d MHz\n", gDeviceFrequency);
|
|
vlog("\tSubnormal values supported for floats? %s\n",
|
|
no_yes[0 != (CL_FP_DENORM & floatCapabilities)]);
|
|
vlog("\tTesting with FTZ mode ON for floats? %s\n", no_yes[0 != gForceFTZ]);
|
|
vlog("\tTesting with default RTZ mode for floats? %s\n",
|
|
no_yes[0 != gIsRTZ]);
|
|
vlog("\tHas Double? %s\n", no_yes[0 != gHasDouble]);
|
|
if (gHasDouble) vlog("\tTest Double? %s\n", no_yes[0 != gTestDouble]);
|
|
vlog("\tHas Long? %s\n", no_yes[0 != gHasLong]);
|
|
vlog("\tTesting vector sizes: ");
|
|
for (i = gMinVectorSize; i < gMaxVectorSize; i++)
|
|
vlog("\t%d", vectorSizes[i]);
|
|
vlog("\n");
|
|
return TEST_PASS;
|
|
}
|
|
|
|
static int RunKernel(cl_kernel kernel, void *inBuf, void *outBuf,
|
|
size_t blockCount)
|
|
{
|
|
// The global dimensions are just the blockCount to execute since we haven't
|
|
// set up multiple queues for multiple devices.
|
|
int error;
|
|
|
|
error = clSetKernelArg(kernel, 0, sizeof(inBuf), &inBuf);
|
|
error |= clSetKernelArg(kernel, 1, sizeof(outBuf), &outBuf);
|
|
|
|
if (error)
|
|
{
|
|
vlog_error("FAILED -- could not set kernel args (%d)\n", error);
|
|
return error;
|
|
}
|
|
|
|
if ((error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &blockCount,
|
|
NULL, 0, NULL, NULL)))
|
|
{
|
|
vlog_error("FAILED -- could not execute kernel (%d)\n", error);
|
|
return error;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
#if !defined(__APPLE__)
|
|
void memset_pattern4(void *dest, const void *src_pattern, size_t bytes);
|
|
#endif
|
|
|
|
#if defined(__APPLE__)
|
|
#include <mach/mach_time.h>
|
|
#endif
|
|
|
|
uint64_t GetTime(void);
|
|
uint64_t GetTime(void)
|
|
{
|
|
#if defined(__APPLE__)
|
|
return mach_absolute_time();
|
|
#elif defined(_MSC_VER)
|
|
return ReadTime();
|
|
#else
|
|
// mach_absolute_time is a high precision timer with precision < 1
|
|
// microsecond.
|
|
#warning need accurate clock here. Times are invalid.
|
|
return 0;
|
|
#endif
|
|
}
|
|
|
|
|
|
#if defined(_MSC_VER)
|
|
/* function is defined in "compat.h" */
|
|
#else
|
|
double SubtractTime(uint64_t endTime, uint64_t startTime);
|
|
double SubtractTime(uint64_t endTime, uint64_t startTime)
|
|
{
|
|
uint64_t diff = endTime - startTime;
|
|
static double conversion = 0.0;
|
|
|
|
if (0.0 == conversion)
|
|
{
|
|
#if defined(__APPLE__)
|
|
mach_timebase_info_data_t info = { 0, 0 };
|
|
kern_return_t err = mach_timebase_info(&info);
|
|
if (0 == err)
|
|
conversion = 1e-9 * (double)info.numer / (double)info.denom;
|
|
#else
|
|
// This function consumes output from GetTime() above, and converts the
|
|
// time to secionds.
|
|
#warning need accurate ticks to seconds conversion factor here. Times are invalid.
|
|
#endif
|
|
}
|
|
|
|
// strictly speaking we should also be subtracting out timer latency here
|
|
return conversion * (double)diff;
|
|
}
|
|
#endif
|
|
|
|
typedef struct CalcReferenceValuesInfo
|
|
{
|
|
struct WriteInputBufferInfo
|
|
*parent; // pointer back to the parent WriteInputBufferInfo struct
|
|
cl_kernel kernel; // the kernel for this vector size
|
|
cl_program program; // the program for this vector size
|
|
cl_uint vectorSize; // the vector size for this callback chain
|
|
void *p; // the pointer to mapped result data for this vector size
|
|
cl_int result;
|
|
} CalcReferenceValuesInfo;
|
|
|
|
typedef struct WriteInputBufferInfo
|
|
{
|
|
volatile cl_event
|
|
calcReferenceValues; // user event which signals when main thread is
|
|
// done calculating reference values
|
|
volatile cl_event
|
|
doneBarrier; // user event which signals when worker threads are done
|
|
cl_uint count; // the number of elements in the array
|
|
Type outType; // the data type of the conversion result
|
|
Type inType; // the data type of the conversion input
|
|
volatile int barrierCount;
|
|
CalcReferenceValuesInfo calcInfo[kCallStyleCount];
|
|
} WriteInputBufferInfo;
|
|
|
|
cl_uint RoundUpToNextPowerOfTwo(cl_uint x);
|
|
cl_uint RoundUpToNextPowerOfTwo(cl_uint x)
|
|
{
|
|
if (0 == (x & (x - 1))) return x;
|
|
|
|
while (x & (x - 1)) x &= x - 1;
|
|
|
|
return x + x;
|
|
}
|
|
|
|
void WriteInputBufferComplete(void *);
|
|
|
|
typedef struct DataInitInfo
|
|
{
|
|
cl_ulong start;
|
|
cl_uint size;
|
|
Type outType;
|
|
Type inType;
|
|
SaturationMode sat;
|
|
RoundingMode round;
|
|
MTdata *d;
|
|
} DataInitInfo;
|
|
|
|
cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p);
|
|
cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p)
|
|
{
|
|
DataInitInfo *info = (DataInitInfo *)p;
|
|
|
|
gInitFunctions[info->inType](
|
|
(char *)gIn + job_id * info->size * gTypeSizes[info->inType], info->sat,
|
|
info->round, info->outType, info->start + job_id * info->size,
|
|
info->size, info->d[thread_id]);
|
|
return CL_SUCCESS;
|
|
}
|
|
|
|
static void setAllowZ(uint8_t *allow, uint32_t *x, cl_uint count)
|
|
{
|
|
cl_uint i;
|
|
for (i = 0; i < count; ++i)
|
|
allow[i] |= (uint8_t)((x[i] & 0x7f800000U) == 0);
|
|
}
|
|
|
|
cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p);
|
|
cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)
|
|
{
|
|
DataInitInfo *info = (DataInitInfo *)p;
|
|
cl_uint count = info->size;
|
|
Type inType = info->inType;
|
|
Type outType = info->outType;
|
|
RoundingMode round = info->round;
|
|
size_t j;
|
|
|
|
Force64BitFPUPrecision();
|
|
|
|
void *s = (cl_uchar *)gIn + job_id * count * gTypeSizes[info->inType];
|
|
void *a = (cl_uchar *)gAllowZ + job_id * count;
|
|
void *d = (cl_uchar *)gRef + job_id * count * gTypeSizes[info->outType];
|
|
|
|
if (outType != inType)
|
|
{
|
|
// create the reference while we wait
|
|
Convert f = gConversions[outType][inType];
|
|
if (info->sat) f = gSaturatedConversions[outType][inType];
|
|
|
|
#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
|
|
/* ARM VFP doesn't have hardware instruction for converting from 64-bit
|
|
* integer to float types, hence GCC ARM uses the floating-point
|
|
* emulation code despite which -mfloat-abi setting it is. But the
|
|
* emulation code in libgcc.a has only one rounding mode (round to
|
|
* nearest even in this case) and ignores the user rounding mode setting
|
|
* in hardware. As a result setting rounding modes in hardware won't
|
|
* give correct rounding results for type covert from 64-bit integer to
|
|
* float using GCC for ARM compiler so for testing different rounding
|
|
* modes, we need to use alternative reference function. ARM64 does have
|
|
* an instruction, however we cannot guarantee the compiler will use it.
|
|
* On all ARM architechures use emulation to calculate reference.*/
|
|
switch (round)
|
|
{
|
|
/* conversions to floating-point type use the current rounding mode.
|
|
* The only default floating-point rounding mode supported is round
|
|
* to nearest even i.e the current rounding mode will be _rte for
|
|
* floating-point types. */
|
|
case kDefaultRoundingMode: qcom_rm = qcomRTE; break;
|
|
case kRoundToNearestEven: qcom_rm = qcomRTE; break;
|
|
case kRoundUp: qcom_rm = qcomRTP; break;
|
|
case kRoundDown: qcom_rm = qcomRTN; break;
|
|
case kRoundTowardZero: qcom_rm = qcomRTZ; break;
|
|
default:
|
|
vlog_error("ERROR: undefined rounding mode %d\n", round);
|
|
break;
|
|
}
|
|
qcom_sat = info->sat;
|
|
#endif
|
|
|
|
RoundingMode oldRound = set_round(round, outType);
|
|
f(d, s, count);
|
|
set_round(oldRound, outType);
|
|
|
|
// Decide if we allow a zero result in addition to the correctly rounded
|
|
// one
|
|
memset(a, 0, count);
|
|
if (gForceFTZ)
|
|
{
|
|
if (inType == kfloat) setAllowZ((uint8_t *)a, (uint32_t *)s, count);
|
|
if (outType == kfloat)
|
|
setAllowZ((uint8_t *)a, (uint32_t *)d, count);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// Copy the input to the reference
|
|
memcpy(d, s, info->size * gTypeSizes[inType]);
|
|
}
|
|
|
|
// Patch up NaNs conversions to integer to zero -- these can be converted to
|
|
// any integer
|
|
if (info->outType != kfloat && info->outType != kdouble)
|
|
{
|
|
if (inType == kfloat)
|
|
{
|
|
float *inp = (float *)s;
|
|
for (j = 0; j < count; j++)
|
|
{
|
|
if (isnan(inp[j]))
|
|
memset((char *)d + j * gTypeSizes[outType], 0,
|
|
gTypeSizes[outType]);
|
|
}
|
|
}
|
|
if (inType == kdouble)
|
|
{
|
|
double *inp = (double *)s;
|
|
for (j = 0; j < count; j++)
|
|
{
|
|
if (isnan(inp[j]))
|
|
memset((char *)d + j * gTypeSizes[outType], 0,
|
|
gTypeSizes[outType]);
|
|
}
|
|
}
|
|
}
|
|
else if (inType == kfloat || inType == kdouble)
|
|
{ // outtype and intype is float or double. NaN conversions for float <->
|
|
// double can be any NaN
|
|
if (inType == kfloat && outType == kdouble)
|
|
{
|
|
float *inp = (float *)s;
|
|
for (j = 0; j < count; j++)
|
|
{
|
|
if (isnan(inp[j])) ((double *)d)[j] = NAN;
|
|
}
|
|
}
|
|
if (inType == kdouble && outType == kfloat)
|
|
{
|
|
double *inp = (double *)s;
|
|
for (j = 0; j < count; j++)
|
|
{
|
|
if (isnan(inp[j])) ((float *)d)[j] = NAN;
|
|
}
|
|
}
|
|
}
|
|
|
|
return CL_SUCCESS;
|
|
}
|
|
|
|
static int DoTest(cl_device_id device, Type outType, Type inType,
|
|
SaturationMode sat, RoundingMode round, MTdata d)
|
|
{
|
|
#ifdef __APPLE__
|
|
cl_ulong wall_start = mach_absolute_time();
|
|
#endif
|
|
|
|
DataInitInfo init_info = { 0, 0, outType, inType, sat, round, NULL };
|
|
WriteInputBufferInfo writeInputBufferInfo;
|
|
int vectorSize;
|
|
int error = 0;
|
|
cl_uint threads = GetThreadCount();
|
|
uint64_t i;
|
|
|
|
gTestCount++;
|
|
size_t blockCount =
|
|
BUFFER_SIZE / std::max(gTypeSizes[inType], gTypeSizes[outType]);
|
|
size_t step = blockCount;
|
|
uint64_t lastCase = 1ULL << (8 * gTypeSizes[inType]);
|
|
|
|
memset(&writeInputBufferInfo, 0, sizeof(writeInputBufferInfo));
|
|
init_info.d = (MTdata *)malloc(threads * sizeof(MTdata));
|
|
if (NULL == init_info.d)
|
|
{
|
|
vlog_error(
|
|
"ERROR: Unable to allocate storage for random number generator!\n");
|
|
return -1;
|
|
}
|
|
for (i = 0; i < threads; i++)
|
|
{
|
|
init_info.d[i] = init_genrand(genrand_int32(d));
|
|
if (NULL == init_info.d[i])
|
|
{
|
|
vlog_error("ERROR: Unable to allocate storage for random number "
|
|
"generator!\n");
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
writeInputBufferInfo.outType = outType;
|
|
writeInputBufferInfo.inType = inType;
|
|
|
|
for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
|
|
{
|
|
writeInputBufferInfo.calcInfo[vectorSize].program =
|
|
MakeProgram(outType, inType, sat, round, vectorSize,
|
|
&writeInputBufferInfo.calcInfo[vectorSize].kernel);
|
|
if (NULL == writeInputBufferInfo.calcInfo[vectorSize].program)
|
|
{
|
|
gFailCount++;
|
|
return -1;
|
|
}
|
|
if (NULL == writeInputBufferInfo.calcInfo[vectorSize].kernel)
|
|
{
|
|
gFailCount++;
|
|
vlog_error("\t\tFAILED -- Failed to create kernel.\n");
|
|
return -2;
|
|
}
|
|
|
|
writeInputBufferInfo.calcInfo[vectorSize].parent =
|
|
&writeInputBufferInfo;
|
|
writeInputBufferInfo.calcInfo[vectorSize].vectorSize = vectorSize;
|
|
writeInputBufferInfo.calcInfo[vectorSize].result = -1;
|
|
}
|
|
|
|
if (gSkipTesting) goto exit;
|
|
|
|
// Patch up rounding mode if default is RTZ
|
|
// We leave the part above in default rounding mode so that the right kernel
|
|
// is compiled.
|
|
if (round == kDefaultRoundingMode && gIsRTZ && (outType == kfloat))
|
|
init_info.round = round = kRoundTowardZero;
|
|
|
|
// Figure out how many elements are in a work block
|
|
|
|
// we handle 64-bit types a bit differently.
|
|
if (8 * gTypeSizes[inType] > 32) lastCase = 0x100000000ULL;
|
|
|
|
if (!gWimpyMode && gIsEmbedded)
|
|
step = blockCount * EMBEDDED_REDUCTION_FACTOR;
|
|
|
|
if (gWimpyMode) step = (size_t)blockCount * (size_t)gWimpyReductionFactor;
|
|
vlog("Testing... ");
|
|
fflush(stdout);
|
|
for (i = 0; i < (uint64_t)lastCase; i += step)
|
|
{
|
|
|
|
if (0 == (i & ((lastCase >> 3) - 1)))
|
|
{
|
|
vlog(".");
|
|
fflush(stdout);
|
|
}
|
|
|
|
cl_uint count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i);
|
|
writeInputBufferInfo.count = count;
|
|
|
|
// Crate a user event to represent the status of the reference value
|
|
// computation completion
|
|
writeInputBufferInfo.calcReferenceValues =
|
|
clCreateUserEvent(gContext, &error);
|
|
if (error || NULL == writeInputBufferInfo.calcReferenceValues)
|
|
{
|
|
vlog_error("ERROR: Unable to create user event. (%d)\n", error);
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
// retain for consumption by MapOutputBufferComplete
|
|
for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
|
|
vectorSize++)
|
|
{
|
|
if ((error =
|
|
clRetainEvent(writeInputBufferInfo.calcReferenceValues)))
|
|
{
|
|
vlog_error("ERROR: Unable to retain user event. (%d)\n", error);
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
}
|
|
|
|
// Crate a user event to represent when the callbacks are done verifying
|
|
// correctness
|
|
writeInputBufferInfo.doneBarrier = clCreateUserEvent(gContext, &error);
|
|
if (error || NULL == writeInputBufferInfo.calcReferenceValues)
|
|
{
|
|
vlog_error("ERROR: Unable to create user event for barrier. (%d)\n",
|
|
error);
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
// retain for use by the callback that calls this
|
|
if ((error = clRetainEvent(writeInputBufferInfo.doneBarrier)))
|
|
{
|
|
vlog_error("ERROR: Unable to retain user event doneBarrier. (%d)\n",
|
|
error);
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
// Call this in a multithreaded manner
|
|
// gInitFunctions[ inType ]( gIn, sat, round, outType, i, count, d
|
|
// );
|
|
cl_uint chunks = RoundUpToNextPowerOfTwo(threads) * 2;
|
|
init_info.start = i;
|
|
init_info.size = count / chunks;
|
|
if (init_info.size < 16384)
|
|
{
|
|
chunks = RoundUpToNextPowerOfTwo(threads);
|
|
init_info.size = count / chunks;
|
|
if (init_info.size < 16384)
|
|
{
|
|
init_info.size = count;
|
|
chunks = 1;
|
|
}
|
|
}
|
|
ThreadPool_Do(InitData, chunks, &init_info);
|
|
|
|
// Copy the results to the device
|
|
if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
|
|
count * gTypeSizes[inType], gIn, 0,
|
|
NULL, NULL)))
|
|
{
|
|
vlog_error("ERROR: clEnqueueWriteBuffer failed. (%d)\n", error);
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
// Call completion callback for the write, which will enqueue the rest
|
|
// of the work.
|
|
WriteInputBufferComplete((void *)&writeInputBufferInfo);
|
|
|
|
// Make sure the work is actually running, so we don't deadlock
|
|
if ((error = clFlush(gQueue)))
|
|
{
|
|
vlog_error("clFlush failed with error %d\n", error);
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
ThreadPool_Do(PrepareReference, chunks, &init_info);
|
|
|
|
// signal we are done calculating the reference results
|
|
if ((error = clSetUserEventStatus(
|
|
writeInputBufferInfo.calcReferenceValues, CL_COMPLETE)))
|
|
{
|
|
vlog_error(
|
|
"Error: Failed to set user event status to CL_COMPLETE: %d\n",
|
|
error);
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
// Wait for the event callbacks to finish verifying correctness.
|
|
if ((error = clWaitForEvents(
|
|
1, (cl_event *)&writeInputBufferInfo.doneBarrier)))
|
|
{
|
|
vlog_error("Error: Failed to wait for barrier: %d\n", error);
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
if ((error = clReleaseEvent(writeInputBufferInfo.calcReferenceValues)))
|
|
{
|
|
vlog_error("Error: Failed to release calcReferenceValues: %d\n",
|
|
error);
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
if ((error = clReleaseEvent(writeInputBufferInfo.doneBarrier)))
|
|
{
|
|
vlog_error("Error: Failed to release done barrier: %d\n", error);
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
|
|
for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
|
|
vectorSize++)
|
|
{
|
|
if ((error = writeInputBufferInfo.calcInfo[vectorSize].result))
|
|
{
|
|
switch (inType)
|
|
{
|
|
case kuchar:
|
|
case kchar:
|
|
vlog("Input value: 0x%2.2x ",
|
|
((unsigned char *)gIn)[error - 1]);
|
|
break;
|
|
case kushort:
|
|
case kshort:
|
|
vlog("Input value: 0x%4.4x ",
|
|
((unsigned short *)gIn)[error - 1]);
|
|
break;
|
|
case kuint:
|
|
case kint:
|
|
vlog("Input value: 0x%8.8x ",
|
|
((unsigned int *)gIn)[error - 1]);
|
|
break;
|
|
case kfloat:
|
|
vlog("Input value: %a ", ((float *)gIn)[error - 1]);
|
|
break;
|
|
break;
|
|
case kulong:
|
|
case klong:
|
|
vlog("Input value: 0x%16.16llx ",
|
|
((unsigned long long *)gIn)[error - 1]);
|
|
break;
|
|
case kdouble:
|
|
vlog("Input value: %a ", ((double *)gIn)[error - 1]);
|
|
break;
|
|
default:
|
|
vlog_error("Internal error at %s: %d\n", __FILE__,
|
|
__LINE__);
|
|
abort();
|
|
break;
|
|
}
|
|
|
|
// tell the user which conversion it was.
|
|
if (0 == vectorSize)
|
|
vlog(" (implicit scalar conversion from %s to %s)\n",
|
|
gTypeNames[inType], gTypeNames[outType]);
|
|
else
|
|
vlog(" (convert_%s%s%s%s( %s%s ))\n", gTypeNames[outType],
|
|
sizeNames[vectorSize], gSaturationNames[sat],
|
|
gRoundingModeNames[round], gTypeNames[inType],
|
|
sizeNames[vectorSize]);
|
|
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
}
|
|
}
|
|
|
|
log_info("done.\n");
|
|
|
|
if (gTimeResults)
|
|
{
|
|
// Kick off tests for the various vector lengths
|
|
for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
|
|
vectorSize++)
|
|
{
|
|
size_t workItemCount = blockCount / vectorSizes[vectorSize];
|
|
if (vectorSizes[vectorSize] * gTypeSizes[outType] < 4)
|
|
workItemCount /=
|
|
4 / (vectorSizes[vectorSize] * gTypeSizes[outType]);
|
|
|
|
double sum = 0.0;
|
|
double bestTime = INFINITY;
|
|
cl_uint k;
|
|
for (k = 0; k < PERF_LOOP_COUNT; k++)
|
|
{
|
|
uint64_t startTime = GetTime();
|
|
if ((error = RunKernel(
|
|
writeInputBufferInfo.calcInfo[vectorSize].kernel,
|
|
gInBuffer, gOutBuffers[vectorSize], workItemCount)))
|
|
{
|
|
gFailCount++;
|
|
goto exit;
|
|
}
|
|
|
|
// Make sure OpenCL is done
|
|
if ((error = clFinish(gQueue)))
|
|
{
|
|
vlog_error("Error %d at clFinish\n", error);
|
|
goto exit;
|
|
}
|
|
|
|
uint64_t endTime = GetTime();
|
|
double time = SubtractTime(endTime, startTime);
|
|
sum += time;
|
|
if (time < bestTime) bestTime = time;
|
|
}
|
|
|
|
if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
|
|
double clocksPerOp = bestTime * (double)gDeviceFrequency
|
|
* gComputeDevices * gSimdSize * 1e6
|
|
/ (workItemCount * vectorSizes[vectorSize]);
|
|
if (0 == vectorSize)
|
|
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element",
|
|
"implicit convert %s -> %s", gTypeNames[inType],
|
|
gTypeNames[outType]);
|
|
else
|
|
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element",
|
|
"convert_%s%s%s%s( %s%s )", gTypeNames[outType],
|
|
sizeNames[vectorSize], gSaturationNames[sat],
|
|
gRoundingModeNames[round], gTypeNames[inType],
|
|
sizeNames[vectorSize]);
|
|
}
|
|
}
|
|
|
|
if (gWimpyMode)
|
|
vlog("\tWimp pass");
|
|
else
|
|
vlog("\tpassed");
|
|
|
|
#ifdef __APPLE__
|
|
// record the run time
|
|
vlog("\t(%f s)", 1e-9 * (mach_absolute_time() - wall_start));
|
|
#endif
|
|
vlog("\n\n");
|
|
fflush(stdout);
|
|
|
|
|
|
exit:
|
|
// clean up
|
|
for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
|
|
{
|
|
clReleaseProgram(writeInputBufferInfo.calcInfo[vectorSize].program);
|
|
clReleaseKernel(writeInputBufferInfo.calcInfo[vectorSize].kernel);
|
|
}
|
|
|
|
if (init_info.d)
|
|
{
|
|
for (i = 0; i < threads; i++) free_mtdata(init_info.d[i]);
|
|
free(init_info.d);
|
|
}
|
|
|
|
return error;
|
|
}
|
|
|
|
void MapResultValuesComplete(void *data);
|
|
|
|
// Note: not called reentrantly
|
|
void WriteInputBufferComplete(void *data)
|
|
{
|
|
cl_int status;
|
|
WriteInputBufferInfo *info = (WriteInputBufferInfo *)data;
|
|
cl_uint count = info->count;
|
|
int vectorSize;
|
|
|
|
info->barrierCount = gMaxVectorSize - gMinVectorSize;
|
|
|
|
// now that we know that the write buffer is complete, enqueue callbacks to
|
|
// wait for the main thread to finish calculating the reference results.
|
|
for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
|
|
{
|
|
size_t workItemCount =
|
|
(count + vectorSizes[vectorSize] - 1) / (vectorSizes[vectorSize]);
|
|
|
|
if ((status = RunKernel(info->calcInfo[vectorSize].kernel, gInBuffer,
|
|
gOutBuffers[vectorSize], workItemCount)))
|
|
{
|
|
gFailCount++;
|
|
return;
|
|
}
|
|
|
|
info->calcInfo[vectorSize].p = clEnqueueMapBuffer(
|
|
gQueue, gOutBuffers[vectorSize], CL_TRUE,
|
|
CL_MAP_READ | CL_MAP_WRITE, 0, count * gTypeSizes[info->outType], 0,
|
|
NULL, NULL, &status);
|
|
{
|
|
if (status)
|
|
{
|
|
vlog_error("ERROR: WriteInputBufferComplete calback failed "
|
|
"with status: %d\n",
|
|
status);
|
|
gFailCount++;
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
|
|
{
|
|
MapResultValuesComplete(info->calcInfo + vectorSize);
|
|
}
|
|
|
|
// Make sure the work starts moving -- otherwise we may deadlock
|
|
if ((status = clFlush(gQueue)))
|
|
{
|
|
vlog_error(
|
|
"ERROR: WriteInputBufferComplete calback failed with status: %d\n",
|
|
status);
|
|
gFailCount++;
|
|
return;
|
|
}
|
|
|
|
// e was already released by the main thread. It should be destroyed
|
|
// automatically soon after we exit.
|
|
}
|
|
|
|
void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
|
|
void *data);
|
|
|
|
// Note: May be called reentrantly
|
|
void MapResultValuesComplete(void *data)
|
|
{
|
|
cl_int status;
|
|
CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo *)data;
|
|
cl_event calcReferenceValues = info->parent->calcReferenceValues;
|
|
|
|
// we know that the map is done, wait for the main thread to finish
|
|
// calculating the reference values
|
|
if ((status = clSetEventCallback(calcReferenceValues, CL_COMPLETE,
|
|
CalcReferenceValuesComplete, data)))
|
|
{
|
|
vlog_error("ERROR: clSetEventCallback failed in "
|
|
"MapResultValuesComplete with status: %d\n",
|
|
status);
|
|
gFailCount++; // not thread safe -- being lazy here
|
|
}
|
|
|
|
// this thread no longer needs its reference to info->calcReferenceValues,
|
|
// so release it
|
|
if ((status = clReleaseEvent(calcReferenceValues)))
|
|
{
|
|
vlog_error("ERROR: clReleaseEvent(info->calcReferenceValues) failed "
|
|
"with status: %d\n",
|
|
status);
|
|
gFailCount++; // not thread safe -- being lazy here
|
|
}
|
|
|
|
// no need to flush since we didn't enqueue anything
|
|
|
|
// e was already released by WriteInputBufferComplete. It should be
|
|
// destroyed automatically soon after we exit.
|
|
}
|
|
|
|
|
|
void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
|
|
void *data)
|
|
{
|
|
CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo *)data;
|
|
cl_uint vectorSize = info->vectorSize;
|
|
cl_uint count = info->parent->count;
|
|
Type outType =
|
|
info->parent->outType; // the data type of the conversion result
|
|
Type inType = info->parent->inType; // the data type of the conversion input
|
|
size_t j;
|
|
cl_int error;
|
|
cl_event doneBarrier = info->parent->doneBarrier;
|
|
|
|
// report spurious error condition
|
|
if (CL_SUCCESS != status)
|
|
{
|
|
vlog_error("ERROR: CalcReferenceValuesComplete did not succeed! (%d)\n",
|
|
status);
|
|
gFailCount++; // lazy about thread safety here
|
|
return;
|
|
}
|
|
|
|
// Now we know that both results have been mapped back from the device, and
|
|
// the main thread is done calculating the reference results. It is now time
|
|
// to check the results.
|
|
|
|
// verify results
|
|
void *mapped = info->p;
|
|
|
|
// Patch up NaNs conversions to integer to zero -- these can be converted to
|
|
// any integer
|
|
if (outType != kfloat && outType != kdouble)
|
|
{
|
|
if (inType == kfloat)
|
|
{
|
|
float *inp = (float *)gIn;
|
|
for (j = 0; j < count; j++)
|
|
{
|
|
if (isnan(inp[j]))
|
|
memset((char *)mapped + j * gTypeSizes[outType], 0,
|
|
gTypeSizes[outType]);
|
|
}
|
|
}
|
|
if (inType == kdouble)
|
|
{
|
|
double *inp = (double *)gIn;
|
|
for (j = 0; j < count; j++)
|
|
{
|
|
if (isnan(inp[j]))
|
|
memset((char *)mapped + j * gTypeSizes[outType], 0,
|
|
gTypeSizes[outType]);
|
|
}
|
|
}
|
|
}
|
|
else if (inType == kfloat || inType == kdouble)
|
|
{ // outtype and intype is float or double. NaN conversions for float <->
|
|
// double can be any NaN
|
|
if (inType == kfloat && outType == kdouble)
|
|
{
|
|
float *inp = (float *)gIn;
|
|
double *outp = (double *)mapped;
|
|
for (j = 0; j < count; j++)
|
|
{
|
|
if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN;
|
|
}
|
|
}
|
|
if (inType == kdouble && outType == kfloat)
|
|
{
|
|
double *inp = (double *)gIn;
|
|
float *outp = (float *)mapped;
|
|
for (j = 0; j < count; j++)
|
|
{
|
|
if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (memcmp(mapped, gRef, count * gTypeSizes[outType]))
|
|
info->result = gCheckResults[outType](mapped, gRef, gAllowZ, count,
|
|
vectorSizes[vectorSize]);
|
|
else
|
|
info->result = 0;
|
|
|
|
// Fill the output buffer with junk and release it
|
|
{
|
|
cl_uint pattern = 0xffffdead;
|
|
memset_pattern4(mapped, &pattern, count * gTypeSizes[outType]);
|
|
if ((error = clEnqueueUnmapMemObject(gQueue, gOutBuffers[vectorSize],
|
|
mapped, 0, NULL, NULL)))
|
|
{
|
|
vlog_error("ERROR: clEnqueueUnmapMemObject failed in "
|
|
"CalcReferenceValuesComplete (%d)\n",
|
|
error);
|
|
gFailCount++;
|
|
}
|
|
}
|
|
|
|
if (1 == ThreadPool_AtomicAdd(&info->parent->barrierCount, -1))
|
|
{
|
|
if ((status = clSetUserEventStatus(doneBarrier, CL_COMPLETE)))
|
|
{
|
|
vlog_error("ERROR: clSetUserEventStatus failed in "
|
|
"CalcReferenceValuesComplete (err: %d). We're probably "
|
|
"going to deadlock.\n",
|
|
status);
|
|
gFailCount++;
|
|
return;
|
|
}
|
|
|
|
if ((status = clReleaseEvent(doneBarrier)))
|
|
{
|
|
vlog_error("ERROR: clReleaseEvent failed in "
|
|
"CalcReferenceValuesComplete (err: %d).\n",
|
|
status);
|
|
gFailCount++;
|
|
return;
|
|
}
|
|
}
|
|
// e was already released by WriteInputBufferComplete. It should be
|
|
// destroyed automatically soon after all the calls to
|
|
// CalcReferenceValuesComplete exit.
|
|
}
|
|
|
|
static cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
|
|
RoundingMode round, int vectorSize,
|
|
cl_kernel *outKernel)
|
|
{
|
|
cl_program program;
|
|
char testName[256];
|
|
int error = 0;
|
|
|
|
std::ostringstream source;
|
|
if (outType == kdouble || inType == kdouble)
|
|
source << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
|
|
|
|
// Create the program. This is a bit complicated because we are trying to
|
|
// avoid byte and short stores.
|
|
if (0 == vectorSize)
|
|
{
|
|
// Create the type names.
|
|
char inName[32];
|
|
char outName[32];
|
|
strncpy(inName, gTypeNames[inType], sizeof(inName));
|
|
strncpy(outName, gTypeNames[outType], sizeof(outName));
|
|
sprintf(testName, "test_implicit_%s_%s", outName, inName);
|
|
|
|
source << "__kernel void " << testName << "( __global " << inName
|
|
<< " *src, __global " << outName << " *dest )\n";
|
|
source << "{\n";
|
|
source << " size_t i = get_global_id(0);\n";
|
|
source << " dest[i] = src[i];\n";
|
|
source << "}\n";
|
|
|
|
vlog("Building implicit %s -> %s conversion test\n", gTypeNames[inType],
|
|
gTypeNames[outType]);
|
|
fflush(stdout);
|
|
}
|
|
else
|
|
{
|
|
int vectorSizetmp = vectorSizes[vectorSize];
|
|
|
|
// Create the type names.
|
|
char convertString[128];
|
|
char inName[32];
|
|
char outName[32];
|
|
switch (vectorSizetmp)
|
|
{
|
|
case 1:
|
|
strncpy(inName, gTypeNames[inType], sizeof(inName));
|
|
strncpy(outName, gTypeNames[outType], sizeof(outName));
|
|
snprintf(convertString, sizeof(convertString), "convert_%s%s%s",
|
|
outName, gSaturationNames[sat],
|
|
gRoundingModeNames[round]);
|
|
snprintf(testName, 256, "test_%s_%s", convertString, inName);
|
|
vlog("Building %s( %s ) test\n", convertString, inName);
|
|
break;
|
|
case 3:
|
|
strncpy(inName, gTypeNames[inType], sizeof(inName));
|
|
strncpy(outName, gTypeNames[outType], sizeof(outName));
|
|
snprintf(convertString, sizeof(convertString),
|
|
"convert_%s3%s%s", outName, gSaturationNames[sat],
|
|
gRoundingModeNames[round]);
|
|
snprintf(testName, 256, "test_%s_%s3", convertString, inName);
|
|
vlog("Building %s( %s3 ) test\n", convertString, inName);
|
|
break;
|
|
default:
|
|
snprintf(inName, sizeof(inName), "%s%d", gTypeNames[inType],
|
|
vectorSizetmp);
|
|
snprintf(outName, sizeof(outName), "%s%d", gTypeNames[outType],
|
|
vectorSizetmp);
|
|
snprintf(convertString, sizeof(convertString), "convert_%s%s%s",
|
|
outName, gSaturationNames[sat],
|
|
gRoundingModeNames[round]);
|
|
snprintf(testName, 256, "test_%s_%s", convertString, inName);
|
|
vlog("Building %s( %s ) test\n", convertString, inName);
|
|
break;
|
|
}
|
|
fflush(stdout);
|
|
|
|
if (vectorSizetmp == 3)
|
|
{
|
|
source << "__kernel void " << testName << "( __global " << inName
|
|
<< " *src, __global " << outName << " *dest )\n";
|
|
source << "{\n";
|
|
source << " size_t i = get_global_id(0);\n";
|
|
source << " if( i + 1 < get_global_size(0))\n";
|
|
source << " vstore3( " << convertString
|
|
<< "( vload3( i, src)), i, dest );\n";
|
|
source << " else\n";
|
|
source << " {\n";
|
|
source << " " << inName << "3 in;\n";
|
|
source << " " << outName << "3 out;\n";
|
|
source << " if( 0 == (i & 1) )\n";
|
|
source << " in.y = src[3*i+1];\n";
|
|
source << " in.x = src[3*i];\n";
|
|
source << " out = " << convertString << "( in ); \n";
|
|
source << " dest[3*i] = out.x;\n";
|
|
source << " if( 0 == (i & 1) )\n";
|
|
source << " dest[3*i+1] = out.y;\n";
|
|
source << " }\n";
|
|
source << "}\n";
|
|
}
|
|
else
|
|
{
|
|
source << "__kernel void " << testName << "( __global " << inName
|
|
<< " *src, __global " << outName << " *dest )\n";
|
|
source << "{\n";
|
|
source << " size_t i = get_global_id(0);\n";
|
|
source << " dest[i] = " << convertString << "( src[i] );\n";
|
|
source << "}\n";
|
|
}
|
|
}
|
|
*outKernel = NULL;
|
|
|
|
const char *flags = NULL;
|
|
if (gForceFTZ) flags = "-cl-denorms-are-zero";
|
|
|
|
// build it
|
|
std::string sourceString = source.str();
|
|
const char *programSource = sourceString.c_str();
|
|
error = create_single_kernel_helper(gContext, &program, outKernel, 1,
|
|
&programSource, testName, flags);
|
|
if (error)
|
|
{
|
|
vlog_error("Failed to build kernel/program (err = %d).\n", error);
|
|
clReleaseProgram(program);
|
|
return NULL;
|
|
}
|
|
|
|
return program;
|
|
}
|