mirror of
https://github.com/KhronosGroup/OpenCL-CTS.git
synced 2026-03-19 22:19:02 +00:00
The maintenance of the conformance tests is moving to Github. This commit contains all the changes that have been done in Gitlab since the first public release of the conformance tests. Signed-off-by: Kevin Petit <kevin.petit@arm.com>
1860 lines
65 KiB
C
1860 lines
65 KiB
C
//
|
|
// Copyright (c) 2017 The Khronos Group Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
#include "../../test_common/harness/compat.h"
|
|
#include "../../test_common/harness/rounding_mode.h"
|
|
#include "../../test_common/harness/ThreadPool.h"
|
|
#include "../../test_common/harness/parseParameters.h"
|
|
#if defined (_WIN32)
|
|
// Two-way max/min helpers.  Every argument and the whole expansion are
// parenthesized, and there is no trailing ';', so the macros can be used
// inside larger expressions.  Arguments may be evaluated twice.
#define MAX(x,y) (((x)>(y))?(x):(y))
#define MIN(x,y) (((x)<(y))?(x):(y))
|
|
#endif
|
|
#if !defined(_WIN32)
|
|
#include <sys/sysctl.h>
|
|
#endif
|
|
|
|
#if defined( __linux__ )
|
|
#include <unistd.h>
|
|
#include <sys/syscall.h>
|
|
#include <linux/sysctl.h>
|
|
#endif
|
|
#if defined(__linux__)
|
|
#include <sys/param.h>
|
|
#include <libgen.h>
|
|
#endif
|
|
|
|
#include "mingw_compat.h"
|
|
#if defined(__MINGW32__)
|
|
#include <sys/param.h>
|
|
#endif
|
|
|
|
#include <stdarg.h>
|
|
#include <stdio.h>
|
|
#if !defined(_WIN32)
|
|
#include <stdint.h>
|
|
#endif
|
|
|
|
#include <stdlib.h>
|
|
#include <math.h>
|
|
#include <string.h>
|
|
#if !defined(_WIN32)
|
|
#include <libgen.h>
|
|
#include <sys/mman.h>
|
|
#endif
|
|
#include <time.h>
|
|
|
|
#include "Sleep.h"
|
|
#include "basic_test_conversions.h"
|
|
|
|
#pragma STDC FENV_ACCESS on
|
|
#if !(defined(_WIN32) && defined(_MSC_VER))
|
|
#include <fenv.h>
|
|
#endif
|
|
|
|
#if (defined(_WIN32) && defined (_MSC_VER))
|
|
// needed for _controlfp_s and rounding modes in RoundingMode
|
|
#include <float.h>
|
|
#include "../../test_common/harness/testHarness.h"
|
|
#endif
|
|
|
|
#pragma mark -
|
|
#pragma mark globals
|
|
|
|
#define BUFFER_SIZE (1024*1024)
|
|
#define kPageSize 4096
|
|
#define PERF_LOOP_COUNT 100
|
|
|
|
#define kCallStyleCount (kVectorSizeCount + 1 /* for implicit scalar */)
|
|
|
|
const char ** argList = NULL;
|
|
int argCount = 0;
|
|
cl_device_type gDeviceType = CL_DEVICE_TYPE_DEFAULT;
|
|
cl_device_id gDevice = NULL;
|
|
cl_context gContext = NULL;
|
|
cl_command_queue gQueue = NULL;
|
|
char appName[64] = "ctest";
|
|
int gTestCount = 0;
|
|
int gFailCount = 0;
|
|
int gStartTestNumber = -1;
|
|
int gEndTestNumber = 0;
|
|
#if defined( __APPLE__ )
|
|
int gTimeResults = 1;
|
|
#else
|
|
int gTimeResults = 0;
|
|
#endif
|
|
int gReportAverageTimes = 0;
|
|
void *gIn = NULL;
|
|
void *gRef = NULL;
|
|
void *gAllowZ = NULL;
|
|
void *gOut[ kCallStyleCount ] = { NULL };
|
|
cl_mem gInBuffer;
|
|
cl_mem gOutBuffers[ kCallStyleCount ];
|
|
size_t gComputeDevices = 0;
|
|
uint32_t gDeviceFrequency = 0;
|
|
int gWimpyMode = 0;
|
|
int gWimpyReductionFactor = 128;
|
|
int gSkipTesting = 0;
|
|
int gForceFTZ = 0;
|
|
int gMultithread = 1;
|
|
int gIsRTZ = 0;
|
|
uint32_t gSimdSize = 1;
|
|
int gHasDouble = 0;
|
|
int gIsEmbedded = 0;
|
|
int gHasLong = 1;
|
|
int gTestDouble = 1;
|
|
cl_uint choosen_device_index = 0;
|
|
const char * sizeNames[] = { "", "", "2", "3", "4", "8", "16" };
|
|
const int vectorSizes[] = { 1, 1, 2, 3, 4, 8, 16 };
|
|
int gMinVectorSize = 0;
|
|
int gMaxVectorSize = sizeof(vectorSizes) / sizeof( vectorSizes[0] );
|
|
|
|
#pragma mark -
|
|
#pragma mark Declarations
|
|
|
|
static int ParseArgs( int argc, const char **argv );
|
|
static void PrintUsage( void );
|
|
static void PrintArch(void);
|
|
static int InitCL( void );
|
|
static int GetTestCase( const char *name, Type *outType, Type *inType, SaturationMode *sat, RoundingMode *round );
|
|
static int DoTest( Type outType, Type inType, SaturationMode sat, RoundingMode round, MTdata d );
|
|
static cl_program MakeProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, cl_kernel *outKernel );
|
|
static int RunKernel( cl_kernel kernel, void *inBuf, void *outBuf, size_t blockCount );
|
|
|
|
void *FlushToZero( void );
|
|
void UnFlushToZero( void *);
|
|
|
|
static cl_program CreateImplicitConvertProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, char testName[256], cl_int *error );
|
|
static cl_program CreateStandardProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, char testName[256], cl_int *error );
|
|
|
|
|
|
// On Windows the x87 unit runs at 53-bit precision by default (long double
// is deprecated there).  Conversions of long/ulong to float/double, and any
// other computation that needs more than 53 bits, then produce wrong
// reference results, so this helper raises the x87 precision to 64 bits.
// It is a no-op on every platform other than MinGW.
static inline void Force64BitFPUPrecision(void)
{
#if __MINGW32__
    // _controlfp(_PC_64, _MCW_PC) from <float.h> would be the usual way to do
    // this, but it exists only on MinGW32 and not on MinGW64.  Inline
    // assembly works on both, so it is used instead of divergent code.
    unsigned short int savedControlWord = 0;
    unsigned short int widenedControlWord = 0;

    __asm__ __volatile__ ("fstcw %0":"=m" (savedControlWord));
    widenedControlWord = savedControlWord | 0x0300;   // precision-control field = 64-bit
    __asm__ __volatile__ ("fldcw %0"::"m" (widenedControlWord));
#else
    /* Implement for other platforms if needed */
#endif
}
|
|
|
|
|
|
#pragma mark -
|
|
|
|
int main (int argc, const char **argv )
|
|
{
|
|
int error, i, testNumber = -1;
|
|
Type inType, outType;
|
|
RoundingMode round;
|
|
SaturationMode sat;
|
|
MTdata d = NULL;
|
|
cl_uint seed = (cl_uint) clock();
|
|
|
|
test_start();
|
|
|
|
if( (error = ParseArgs( argc, argv )) )
|
|
return error;
|
|
|
|
//Turn off sleep so our tests run to completion
|
|
PreventSleep();
|
|
atexit( ResumeSleep );
|
|
|
|
// Init CL data structures
|
|
if( (error = InitCL()) )
|
|
return error;
|
|
|
|
#if defined(_MSC_VER) && defined(_M_IX86)
|
|
// VS2005 (and probably others, since long double got deprecated) sets
|
|
// the x87 to 53-bit precision. This causes problems with the tests
|
|
// that convert long and ulong to float and double, since they deal
|
|
// with values that need more precision than that. So, set the x87
|
|
// to 64-bit precision.
|
|
unsigned int ignored;
|
|
_controlfp_s(&ignored, _PC_64, _MCW_PC);
|
|
#endif
|
|
|
|
vlog( "===========================================================\n" );
|
|
vlog( "Random seed: %u\n", seed );
|
|
d = init_genrand( seed );
|
|
int startMinVectorSize = gMinVectorSize;
|
|
if( argCount )
|
|
{
|
|
for( i = 0; i < argCount; i++ )
|
|
{
|
|
if( GetTestCase( argList[i], &outType, &inType, &sat, &round ) )
|
|
{
|
|
vlog_error( "\n\t\t**** ERROR: Unable to parse function name %s. Skipping.... *****\n\n", argList[i] );
|
|
continue;
|
|
}
|
|
|
|
// skip double if we don't have it
|
|
if( !gTestDouble && (inType == kdouble || outType == kdouble ) )
|
|
{
|
|
if( gHasDouble )
|
|
{
|
|
vlog_error( "\t *** convert_%sn%s%s( %sn ) FAILED ** \n", gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
|
|
vlog( "\t\tcl_khr_fp64 enabled, but double testing turned off.\n" );
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
// skip longs on embedded
|
|
if( ! gHasLong &&
|
|
(inType == klong || outType == klong || inType == kulong || outType == kulong))
|
|
continue;
|
|
|
|
// Skip the implicit converts if the rounding mode is not default or test is saturated
|
|
if( 0 == startMinVectorSize )
|
|
{
|
|
if( sat || round != kDefaultRoundingMode )
|
|
gMinVectorSize = 1;
|
|
else
|
|
gMinVectorSize = 0;
|
|
}
|
|
|
|
if( (error = DoTest( outType, inType, sat, round, d )) )
|
|
vlog_error( "\t *** convert_%sn%s%s( %sn ) FAILED ** \n", gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
|
|
for( outType = (Type)0; outType < kTypeCount; outType = (Type)(outType+1) )
|
|
{
|
|
for( inType = (Type)0; inType < kTypeCount; inType = (Type)(inType+1) )
|
|
{
|
|
// skip longs on embedded
|
|
if( ! gHasLong &&
|
|
(inType == klong || outType == klong || inType == kulong || outType == kulong))
|
|
continue;
|
|
|
|
for( sat = (SaturationMode)0; sat < kSaturationModeCount; sat = (SaturationMode)(sat+1) )
|
|
{
|
|
//skip illegal saturated conversions to float type
|
|
if( kSaturated == sat && ( outType == kfloat || outType == kdouble ) )
|
|
continue;
|
|
|
|
for( round = (RoundingMode)0; round < kRoundingModeCount; round = (RoundingMode)(round+1) )
|
|
{
|
|
if( ++testNumber < gStartTestNumber )
|
|
{
|
|
// vlog( "%d) skipping convert_%sn%s%s( %sn )\n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
|
|
continue;
|
|
}
|
|
else
|
|
if( gEndTestNumber > 0 && testNumber >= gEndTestNumber )
|
|
goto exit;
|
|
|
|
vlog( "%d) Testing convert_%sn%s%s( %sn ):\n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
|
|
|
|
// skip double if we don't have it
|
|
if( ! gTestDouble && (inType == kdouble || outType == kdouble ) )
|
|
{
|
|
if( gHasDouble )
|
|
{
|
|
vlog_error( "\t *** %d) convert_%sn%s%s( %sn ) FAILED ** \n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
|
|
vlog( "\t\tcl_khr_fp64 enabled, but double testing turned off.\n" );
|
|
}
|
|
continue;
|
|
}
|
|
|
|
// Skip the implicit converts if the rounding mode is not default or test is saturated
|
|
if( 0 == startMinVectorSize )
|
|
{
|
|
if( sat || round != kDefaultRoundingMode )
|
|
gMinVectorSize = 1;
|
|
else
|
|
gMinVectorSize = 0;
|
|
}
|
|
|
|
if( (error = DoTest( outType, inType, sat, round, d) ) )
|
|
vlog_error( "\t *** %d) convert_%sn%s%s( %sn ) FAILED ** \n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
exit:
|
|
free_mtdata(d);
|
|
vlog( "\n\n" );
|
|
vlog( "Tests completed: %d\n", gTestCount );
|
|
|
|
error = clFinish(gQueue);
|
|
if (error)
|
|
vlog_error("clFinish failed: %d\n", error);
|
|
|
|
if (gFailCount == 0 && gTestCount > 0) {
|
|
vlog("PASSED %d of %d tests.\n", gTestCount, gTestCount);
|
|
} else if (gFailCount > 0) {
|
|
vlog_error("FAILED %d of %d tests.\n", gFailCount, gTestCount);
|
|
}
|
|
|
|
clReleaseMemObject(gInBuffer);
|
|
|
|
for( i = 0; i < kCallStyleCount; i++ ) {
|
|
clReleaseMemObject(gOutBuffers[i]);
|
|
}
|
|
clReleaseCommandQueue(gQueue);
|
|
clReleaseContext(gContext);
|
|
|
|
test_finish();
|
|
if (gFailCount > 0)
|
|
return -1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
#pragma mark -
|
|
#pragma mark setup
|
|
|
|
static int ParseArgs( int argc, const char **argv )
|
|
{
|
|
int i;
|
|
argList = (const char **)calloc( argc - 1, sizeof( char*) );
|
|
argCount = 0;
|
|
|
|
if( NULL == argList && argc > 1 )
|
|
return -1;
|
|
|
|
#if (defined( __APPLE__ ) || defined(__linux__) || defined (__MINGW32__))
|
|
{ // Extract the app name
|
|
char baseName[ MAXPATHLEN ];
|
|
strncpy( baseName, argv[0], MAXPATHLEN );
|
|
char *base = basename( baseName );
|
|
if( NULL != base )
|
|
{
|
|
strncpy( appName, base, sizeof( appName ) );
|
|
appName[ sizeof( appName ) -1 ] = '\0';
|
|
}
|
|
}
|
|
#elif defined (_WIN32)
|
|
{
|
|
char fname[_MAX_FNAME + _MAX_EXT + 1];
|
|
char ext[_MAX_EXT];
|
|
|
|
errno_t err = _splitpath_s( argv[0], NULL, 0, NULL, 0,
|
|
fname, _MAX_FNAME, ext, _MAX_EXT );
|
|
if (err == 0) { // no error
|
|
strcat (fname, ext); //just cat them, size of frame can keep both
|
|
strncpy (appName, fname, sizeof(appName));
|
|
appName[ sizeof( appName ) -1 ] = '\0';
|
|
}
|
|
}
|
|
#endif
|
|
|
|
/* Check for environment variable to set device type */
|
|
char *env_mode = getenv( "CL_DEVICE_TYPE" );
|
|
if( env_mode != NULL )
|
|
{
|
|
vlog( "CL_DEVICE_TYPE: %s\n", env_mode );
|
|
if( strcmp( env_mode, "gpu" ) == 0 || strcmp( env_mode, "CL_DEVICE_TYPE_GPU" ) == 0 )
|
|
gDeviceType = CL_DEVICE_TYPE_GPU;
|
|
else if( strcmp( env_mode, "cpu" ) == 0 || strcmp( env_mode, "CL_DEVICE_TYPE_CPU" ) == 0 )
|
|
gDeviceType = CL_DEVICE_TYPE_CPU;
|
|
else if( strcmp( env_mode, "accelerator" ) == 0 || strcmp( env_mode, "CL_DEVICE_TYPE_ACCELERATOR" ) == 0 )
|
|
gDeviceType = CL_DEVICE_TYPE_ACCELERATOR;
|
|
else if( strcmp( env_mode, "default" ) == 0 || strcmp( env_mode, "CL_DEVICE_TYPE_DEFAULT" ) == 0 )
|
|
gDeviceType = CL_DEVICE_TYPE_DEFAULT;
|
|
else
|
|
{
|
|
vlog_error( "Unknown CL_DEVICE_TYPE env variable setting: %s.\nAborting...\n", env_mode );
|
|
abort();
|
|
}
|
|
}
|
|
|
|
|
|
vlog( "\n%s", appName );
|
|
for( i = 1; i < argc; i++ )
|
|
{
|
|
const char *arg = argv[i];
|
|
if( NULL == arg )
|
|
break;
|
|
|
|
vlog( "\t%s", arg );
|
|
if( arg[0] == '-' )
|
|
{
|
|
arg++;
|
|
while( *arg != '\0' )
|
|
{
|
|
switch( *arg )
|
|
{
|
|
case 'd':
|
|
gTestDouble ^= 1;
|
|
break;
|
|
case 'l':
|
|
gSkipTesting ^= 1;
|
|
break;
|
|
case 'm':
|
|
gMultithread ^= 1;
|
|
break;
|
|
case 'w':
|
|
gWimpyMode ^= 1;
|
|
break;
|
|
case '[':
|
|
parseWimpyReductionFactor(arg, gWimpyReductionFactor);
|
|
break;
|
|
case 'z':
|
|
gForceFTZ ^= 1;
|
|
break;
|
|
case 't':
|
|
gTimeResults ^= 1;
|
|
break;
|
|
case 'a':
|
|
gReportAverageTimes ^= 1;
|
|
break;
|
|
case '1':
|
|
if( arg[1] == '6' )
|
|
{
|
|
gMinVectorSize = 6;
|
|
gMaxVectorSize = 7;
|
|
arg++;
|
|
}
|
|
else
|
|
{
|
|
gMinVectorSize = 0;
|
|
gMaxVectorSize = 2;
|
|
}
|
|
break;
|
|
|
|
case '2':
|
|
gMinVectorSize = 2;
|
|
gMaxVectorSize = 3;
|
|
break;
|
|
|
|
case '3':
|
|
gMinVectorSize = 3;
|
|
gMaxVectorSize = 4;
|
|
break;
|
|
|
|
case '4':
|
|
gMinVectorSize = 4;
|
|
gMaxVectorSize = 5;
|
|
break;
|
|
|
|
case '8':
|
|
gMinVectorSize = 5;
|
|
gMaxVectorSize = 6;
|
|
break;
|
|
|
|
default:
|
|
vlog( " <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg );
|
|
PrintUsage();
|
|
return -1;
|
|
}
|
|
arg++;
|
|
}
|
|
}
|
|
// Check if a particular device id was requested
|
|
else if (strlen(argv[i]) >= 3 && argv[i][0] == 'i' && argv[i][1] =='d')
|
|
{
|
|
choosen_device_index = atoi(&(argv[i][2]));
|
|
}
|
|
else
|
|
{
|
|
char *t = NULL;
|
|
long number = strtol( arg, &t, 0 );
|
|
if( t != arg )
|
|
{
|
|
if( gStartTestNumber != -1 )
|
|
gEndTestNumber = gStartTestNumber + (int) number;
|
|
else
|
|
gStartTestNumber = (int) number;
|
|
}
|
|
else if( 0 == strcmp( arg, "CL_DEVICE_TYPE_CPU"))
|
|
gDeviceType = CL_DEVICE_TYPE_CPU;
|
|
else if( 0 == strcmp( arg, "CL_DEVICE_TYPE_GPU"))
|
|
gDeviceType = CL_DEVICE_TYPE_GPU;
|
|
else if( 0 == strcmp( arg, "CL_DEVICE_TYPE_ACCELERATOR"))
|
|
gDeviceType = CL_DEVICE_TYPE_ACCELERATOR;
|
|
else if( 0 == strcmp( arg, "CL_DEVICE_TYPE_DEFAULT"))
|
|
gDeviceType = CL_DEVICE_TYPE_DEFAULT;
|
|
else
|
|
{
|
|
argList[ argCount ] = arg;
|
|
argCount++;
|
|
}
|
|
}
|
|
}
|
|
|
|
vlog( "\n" );
|
|
|
|
vlog( "Test binary built %s %s\n", __DATE__, __TIME__ );
|
|
|
|
PrintArch();
|
|
|
|
if( gWimpyMode )
|
|
{
|
|
vlog( "\n" );
|
|
vlog( "*** WARNING: Testing in Wimpy mode! ***\n" );
|
|
vlog( "*** Wimpy mode is not sufficient to verify correctness. ***\n" );
|
|
vlog( "*** It gives warm fuzzy feelings and then nevers calls. ***\n\n" );
|
|
vlog("*** Wimpy Reduction Factor: %-27u ***\n\n", gWimpyReductionFactor);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void PrintUsage( void )
|
|
{
|
|
int i;
|
|
vlog( "%s [-wz#]: <optional: test names>\n", appName );
|
|
vlog( "\ttest names:\n" );
|
|
vlog( "\t\tdestFormat<_sat><_round>_sourceFormat\n" );
|
|
vlog( "\t\t\tPossible format types are:\n\t\t\t\t" );
|
|
for( i = 0; i < kTypeCount; i++ )
|
|
vlog( "%s, ", gTypeNames[i] );
|
|
vlog( "\n\n\t\t\tPossible saturation values are: (empty) and _sat\n" );
|
|
vlog( "\t\t\tPossible rounding values are:\n\t\t\t\t(empty), " );
|
|
for( i = 1; i < kRoundingModeCount; i++ )
|
|
vlog( "%s, ", gRoundingModeNames[i] );
|
|
vlog( "\n\t\t\tExamples:\n" );
|
|
vlog( "\t\t\t\tulong_short converts short to ulong\n" );
|
|
vlog( "\t\t\t\tchar_sat_rte_float converts float to char with saturated clipping in round to nearest rounding mode\n\n" );
|
|
vlog( "\toptions:\n" );
|
|
vlog( "\t\t-d\tToggle testing of double precision. On by default if cl_khr_fp64 is enabled, ignored otherwise.\n" );
|
|
vlog( "\t\t-l\tToggle link check mode. When on, testing is skipped, and we just check to see that the kernels build. (Off by default.)\n" );
|
|
vlog( "\t\t-m\tToggle Multithreading. (On by default.)\n" );
|
|
vlog( "\t\t-w\tToggle wimpy mode. When wimpy mode is on, we run a very small subset of the tests for each fn. NOT A VALID TEST! (Off by default.)\n" );
|
|
vlog(" \t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is 1-12, default factor(%u)\n", gWimpyReductionFactor);
|
|
vlog( "\t\t-z\tToggle flush to zero mode (Default: per device)\n" );
|
|
vlog( "\t\t-#\tTest just vector size given by #, where # is an element of the set {1,2,3,4,8,16}\n" );
|
|
vlog( "\n" );
|
|
vlog( "You may also pass the number of the test on which to start.\nA second number can be then passed to indicate how many tests to run\n\n" );
|
|
}
|
|
|
|
static void PrintArch( void )
|
|
{
|
|
vlog( "sizeof( void*) = %ld\n", sizeof( void *) );
|
|
#if defined( __ppc__ )
|
|
vlog( "ARCH:\tppc\n" );
|
|
#elif defined( __ppc64__ )
|
|
vlog( "ARCH:\tppc64\n" );
|
|
#elif defined( __PPC__ )
|
|
vlog( "ARCH:\tppc\n" );
|
|
#elif defined( __i386__ )
|
|
vlog( "ARCH:\ti386\n" );
|
|
#elif defined( __x86_64__ )
|
|
vlog( "ARCH:\tx86_64\n" );
|
|
#elif defined( __arm__ )
|
|
vlog( "ARCH:\tarm\n" );
|
|
#elif defined( __aarch64__ )
|
|
vlog( "ARCH:\taarch64\n" );
|
|
#elif defined (_WIN32)
|
|
vlog( "ARCH:\tWindows\n" );
|
|
#else
|
|
#error unknown arch
|
|
#endif
|
|
|
|
#if defined( __APPLE__ )
|
|
|
|
int type = 0;
|
|
size_t typeSize = sizeof( type );
|
|
sysctlbyname( "hw.cputype", &type, &typeSize, NULL, 0 );
|
|
vlog( "cpu type:\t%d\n", type );
|
|
typeSize = sizeof( type );
|
|
sysctlbyname( "hw.cpusubtype", &type, &typeSize, NULL, 0 );
|
|
vlog( "cpu subtype:\t%d\n", type );
|
|
|
|
#elif defined( __linux__ )
|
|
#define OSNAMESZ 100
|
|
int _sysctl(struct __sysctl_args *args );
|
|
|
|
struct __sysctl_args args;
|
|
char osname[OSNAMESZ];
|
|
size_t osnamelth;
|
|
int name[] = { CTL_KERN, KERN_OSTYPE };
|
|
memset(&args, 0, sizeof(struct __sysctl_args));
|
|
args.name = name;
|
|
args.nlen = sizeof(name)/sizeof(name[0]);
|
|
args.oldval = osname;
|
|
args.oldlenp = &osnamelth;
|
|
|
|
osnamelth = sizeof(osname);
|
|
|
|
if (syscall(SYS__sysctl, &args) == -1) {
|
|
vlog( "_sysctl error\n" );
|
|
}
|
|
else {
|
|
vlog("this machine is running %*s\n", osnamelth, osname);
|
|
}
|
|
|
|
#endif
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Decode a test name of the form  destType[_sat][_round]_srcType  into its
// four components.
//
// Returns 0 on success, or:
//   -1  the name does not start with a known type name
//   -2  the '_' separating the destination part from the source type is missing
//   -3  the source type is not a known type name
//   -4  there are trailing characters after the source type
// The outputs decoded before a failure is detected have already been written.
static int GetTestCase( const char *name, Type *outType, Type *inType, SaturationMode *sat, RoundingMode *round )
{
    const char *cursor = name;      // next unparsed character
    size_t tokenLength = 0;
    int index;

    // Destination type: the first table entry that is a prefix of the name.
    for( index = 0; index < kTypeCount; index++ )
    {
        tokenLength = strlen( gTypeNames[index] );
        if( 0 == strncmp( cursor, gTypeNames[index], tokenLength ) )
            break;
    }
    if( index == kTypeCount )
        return -1;
    *outType = (Type) index;
    cursor += tokenLength;

    // Optional saturation suffix; entry 0 is the default when none matches.
    *sat = (SaturationMode)0;
    for( index = 1; index < kSaturationModeCount; index++ )
    {
        tokenLength = strlen( gSaturationNames[index] );
        if( 0 == strncmp( cursor, gSaturationNames[index], tokenLength ) )
        {
            *sat = (SaturationMode) index;
            cursor += tokenLength;
            break;
        }
    }

    // Optional rounding mode suffix; entry 0 is the default when none matches.
    *round = (RoundingMode)0;
    for( index = 1; index < kRoundingModeCount; index++ )
    {
        tokenLength = strlen( gRoundingModeNames[index] );
        if( 0 == strncmp( cursor, gRoundingModeNames[index], tokenLength ) )
        {
            *round = (RoundingMode) index;
            cursor += tokenLength;
            break;
        }
    }

    // Separator between the destination part and the source type.
    if( '_' != *cursor )
        return -2;
    cursor++;

    // Source type.
    for( index = 0; index < kTypeCount; index++ )
    {
        tokenLength = strlen( gTypeNames[index] );
        if( 0 == strncmp( cursor, gTypeNames[index], tokenLength ) )
            break;
    }
    if( index == kTypeCount )
        return -3;
    *inType = (Type) index;
    cursor += tokenLength;

    // Nothing may follow the source type.
    return ( '\0' == *cursor ) ? 0 : -4;
}
|
|
|
|
#pragma mark -
|
|
#pragma mark OpenCL
|
|
|
|
// Context error callback handed to clCreateContext(): logs the message text.
static void CL_CALLBACK notify_callback(const char *errinfo, const void *private_info, size_t cb, void *user_data)
{
    // Only the message is reported; the remaining arguments are unused.
    (void) private_info;
    (void) cb;
    (void) user_data;

    vlog( "%s\n", errinfo );
}
|
|
|
|
static int InitCL( void )
|
|
{
|
|
int error, i;
|
|
size_t configSize = sizeof( gComputeDevices );
|
|
|
|
cl_platform_id platform = NULL;
|
|
cl_uint num_devices = 0;
|
|
cl_device_id *devices = NULL;
|
|
|
|
/* Get the platform */
|
|
error = clGetPlatformIDs(1, &platform, NULL);
|
|
if (error) {
|
|
vlog_error( "clGetPlatformIDs failed: %d\n", error );
|
|
return error;
|
|
}
|
|
|
|
/* Get the number of requested devices */
|
|
error = clGetDeviceIDs(platform, gDeviceType, 0, NULL, &num_devices );
|
|
if (error) {
|
|
vlog_error( "clGetDeviceIDs failed: %d\n", error );
|
|
return error;
|
|
}
|
|
|
|
devices = (cl_device_id *) malloc( num_devices * sizeof( cl_device_id ) );
|
|
if (!devices || choosen_device_index >= num_devices) {
|
|
vlog_error( "device index out of range -- choosen_device_index (%d) >= num_devices (%d)\n", choosen_device_index, num_devices );
|
|
return -1;
|
|
}
|
|
|
|
/* Get the requested device */
|
|
error = clGetDeviceIDs(platform, gDeviceType, num_devices, devices, NULL );
|
|
if (error) {
|
|
vlog_error( "clGetDeviceIDs failed: %d\n", error );
|
|
return error;
|
|
}
|
|
|
|
gDevice = devices[choosen_device_index];
|
|
free(devices);
|
|
devices = NULL;
|
|
|
|
if( (error = clGetDeviceInfo( gDevice, CL_DEVICE_MAX_COMPUTE_UNITS, configSize, &gComputeDevices, NULL )) )
|
|
gComputeDevices = 1;
|
|
|
|
configSize = sizeof( gDeviceFrequency );
|
|
if( (error = clGetDeviceInfo( gDevice, CL_DEVICE_MAX_CLOCK_FREQUENCY, configSize, &gDeviceFrequency, NULL )) )
|
|
gDeviceFrequency = 0;
|
|
|
|
cl_device_fp_config floatCapabilities = 0;
|
|
if( (error = clGetDeviceInfo(gDevice, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(floatCapabilities), &floatCapabilities, NULL)))
|
|
floatCapabilities = 0;
|
|
if(0 == (CL_FP_DENORM & floatCapabilities) )
|
|
gForceFTZ ^= 1;
|
|
|
|
if( 0 == (floatCapabilities & CL_FP_ROUND_TO_NEAREST ) )
|
|
{
|
|
char profileStr[128] = "";
|
|
// Verify that we are an embedded profile device
|
|
if( (error = clGetDeviceInfo( gDevice, CL_DEVICE_PROFILE, sizeof( profileStr ), profileStr, NULL ) ) )
|
|
{
|
|
vlog_error( "FAILURE: Could not get device profile: error %d\n", error );
|
|
return -1;
|
|
}
|
|
|
|
if( strcmp( profileStr, "EMBEDDED_PROFILE" ) )
|
|
{
|
|
vlog_error( "FAILURE: non-embedded profile device does not support CL_FP_ROUND_TO_NEAREST\n" );
|
|
return -1;
|
|
}
|
|
|
|
if( 0 == (floatCapabilities & CL_FP_ROUND_TO_ZERO ) )
|
|
{
|
|
vlog_error( "FAILURE: embedded profile device supports neither CL_FP_ROUND_TO_NEAREST or CL_FP_ROUND_TO_ZERO\n" );
|
|
return -1;
|
|
}
|
|
|
|
gIsRTZ = 1;
|
|
}
|
|
|
|
char extensions[2048] = "";
|
|
if( (error = clGetDeviceInfo( gDevice, CL_DEVICE_EXTENSIONS, sizeof( extensions ), extensions, NULL ) ) )
|
|
{
|
|
vlog_error( "FAILURE: unable to get device info for CL_DEVICE_EXTENSIONS!" );
|
|
return -1;
|
|
}
|
|
else if( strstr( extensions, "cl_khr_fp64" ) )
|
|
{
|
|
gHasDouble = 1;
|
|
}
|
|
gTestDouble &= gHasDouble;
|
|
|
|
//detect whether profile of the device is embedded
|
|
char profile[1024] = "";
|
|
if( (error = clGetDeviceInfo( gDevice, CL_DEVICE_PROFILE, sizeof(profile), profile, NULL ) ) ){}
|
|
else if( strstr(profile, "EMBEDDED_PROFILE" ) )
|
|
{
|
|
gIsEmbedded = 1;
|
|
if( !strstr( extensions, "cles_khr_int64" ) )
|
|
gHasLong = 0;
|
|
}
|
|
|
|
|
|
gContext = clCreateContext( NULL, 1, &gDevice, notify_callback, NULL, &error );
|
|
if( NULL == gDevice || error )
|
|
{
|
|
vlog_error( "clCreateContext failed. (%d)\n", error );
|
|
return error;
|
|
}
|
|
|
|
gQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
|
|
if( NULL == gQueue || error )
|
|
{
|
|
vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
|
|
return error;
|
|
}
|
|
|
|
//Allocate buffers
|
|
//FIXME: use clProtectedArray for guarded allocations?
|
|
gIn = malloc( BUFFER_SIZE + 2 * kPageSize );
|
|
gAllowZ = malloc( BUFFER_SIZE + 2 * kPageSize );
|
|
gRef = malloc( BUFFER_SIZE + 2 * kPageSize );
|
|
for( i = 0; i < kCallStyleCount; i++ )
|
|
{
|
|
gOut[i] = malloc( BUFFER_SIZE + 2 * kPageSize );
|
|
if( NULL == gOut[i] )
|
|
return -3;
|
|
}
|
|
|
|
// setup input buffers
|
|
gInBuffer = clCreateBuffer(gContext, CL_MEM_READ_ONLY, BUFFER_SIZE, NULL, &error);
|
|
if( gInBuffer == NULL || error)
|
|
{
|
|
vlog_error( "clCreateBuffer failed for input (%d)\n", error );
|
|
return error;
|
|
}
|
|
|
|
// setup output buffers
|
|
for( i = 0; i < kCallStyleCount; i++ )
|
|
{
|
|
gOutBuffers[i] = clCreateBuffer( gContext, CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, &error );
|
|
if( gOutBuffers[i] == NULL || error )
|
|
{
|
|
vlog_error( "clCreateArray failed for output (%d)\n", error );
|
|
return error;
|
|
}
|
|
}
|
|
|
|
#if defined( __APPLE__ )
|
|
|
|
#if defined( __i386__ ) || defined( __x86_64__ )
|
|
#define kHasSSE3 0x00000008
|
|
#define kHasSupplementalSSE3 0x00000100
|
|
#define kHasSSE4_1 0x00000400
|
|
#define kHasSSE4_2 0x00000800
|
|
/* check our environment for a hint to disable SSE variants */
|
|
{
|
|
const char *env = getenv( "CL_MAX_SSE" );
|
|
if( env )
|
|
{
|
|
extern int _cpu_capabilities;
|
|
int mask = 0;
|
|
if( 0 == strcasecmp( env, "SSE4.1" ) )
|
|
mask = kHasSSE4_2;
|
|
else if( 0 == strcasecmp( env, "SSSE3" ) )
|
|
mask = kHasSSE4_2 | kHasSSE4_1;
|
|
else if( 0 == strcasecmp( env, "SSE3" ) )
|
|
mask = kHasSSE4_2 | kHasSSE4_1 | kHasSupplementalSSE3;
|
|
else if( 0 == strcasecmp( env, "SSE2" ) )
|
|
mask = kHasSSE4_2 | kHasSSE4_1 | kHasSupplementalSSE3 | kHasSSE3;
|
|
else
|
|
{
|
|
vlog_error( "Error: Unknown CL_MAX_SSE setting: %s\n", env );
|
|
return -2;
|
|
}
|
|
|
|
vlog( "*** Environment: CL_MAX_SSE = %s ***\n", env );
|
|
_cpu_capabilities &= ~mask;
|
|
}
|
|
}
|
|
#endif
|
|
#endif
|
|
|
|
|
|
char c[1024];
|
|
static const char *no_yes[] = { "NO", "YES" };
|
|
vlog( "\nCompute Device info:\n" );
|
|
clGetDeviceInfo(gDevice, CL_DEVICE_NAME, sizeof(c), c, NULL);
|
|
vlog( "\tDevice Name: %s\n", c );
|
|
clGetDeviceInfo(gDevice, CL_DEVICE_VENDOR, sizeof(c), c, NULL);
|
|
vlog( "\tVendor: %s\n", c );
|
|
clGetDeviceInfo(gDevice, CL_DEVICE_VERSION, sizeof(c), c, NULL);
|
|
vlog( "\tDevice Version: %s\n", c );
|
|
clGetDeviceInfo(gDevice, CL_DEVICE_OPENCL_C_VERSION, sizeof(c), &c, NULL);
|
|
vlog( "\tCL C Version: %s\n", c );
|
|
clGetDeviceInfo(gDevice, CL_DRIVER_VERSION, sizeof(c), c, NULL);
|
|
vlog( "\tDriver Version: %s\n", c );
|
|
vlog( "\tProcessing with %ld devices\n", gComputeDevices );
|
|
vlog( "\tDevice Frequency: %d MHz\n", gDeviceFrequency );
|
|
vlog( "\tSubnormal values supported for floats? %s\n", no_yes[0 != (CL_FP_DENORM & floatCapabilities)] );
|
|
vlog( "\tTesting with FTZ mode ON for floats? %s\n", no_yes[0 != gForceFTZ] );
|
|
vlog( "\tTesting with default RTZ mode for floats? %s\n", no_yes[0 != gIsRTZ] );
|
|
vlog( "\tHas Double? %s\n", no_yes[0 != gHasDouble] );
|
|
if( gHasDouble )
|
|
vlog( "\tTest Double? %s\n", no_yes[0 != gTestDouble] );
|
|
vlog( "\tHas Long? %s\n", no_yes[0 != gHasLong] );
|
|
vlog( "\tTesting vector sizes: " );
|
|
for( i = gMinVectorSize; i < gMaxVectorSize; i++ )
|
|
vlog("\t%d", vectorSizes[i]);
|
|
vlog( "\n" );
|
|
return 0;
|
|
}
|
|
|
|
// Bind the input and output buffers to kernel arguments 0 and 1 and enqueue
// the kernel on gQueue as a 1-D range of blockCount work-items.
//
// inBuf/outBuf are passed by address with pointer size, so callers are
// expected to pass cl_mem handles here -- confirm against the call sites.
// The call does not wait for the kernel to finish.
// Returns 0 on success or the OpenCL error code of the first call that failed.
static int RunKernel( cl_kernel kernel, void *inBuf, void *outBuf, size_t blockCount )
{
    // The global dimensions are just the blockCount to execute since we haven't set up multiple queues for multiple devices.
    int error;

    // Check each call on its own: OR-ing two error codes together would
    // produce a value that matches neither.
    if( (error = clSetKernelArg(kernel, 0, sizeof( inBuf ), &inBuf)) ||
        (error = clSetKernelArg(kernel, 1, sizeof( outBuf ), &outBuf)) )
    {
        vlog_error( "FAILED -- could not set kernel args (%d)\n", error );
        return error;
    }

    if( (error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &blockCount, NULL, 0, NULL, NULL)))
    {
        vlog_error( "FAILED -- could not execute kernel (%d)\n", error );
        return error;
    }

    return 0;
}
|
|
|
|
#if ! defined( __APPLE__ )
static void memset_pattern4(void *dest, const void *src_pattern, size_t bytes );

// Stand-in for the memset_pattern4() that Apple platforms already provide:
// fills 'bytes' bytes at 'dest' with repeated copies of the 4-byte pattern
// at 'src_pattern'.  When 'bytes' is not a multiple of 4, the tail receives
// the leading bytes of the pattern.
//
// All accesses go through memcpy, so neither pointer needs to be 4-byte
// aligned and the pattern is never read through a non-const pointer.
static void memset_pattern4(void *dest, const void *src_pattern, size_t bytes )
{
    unsigned char *out = (unsigned char *) dest;
    size_t offset = 0;

    // whole 4-byte copies
    for( ; bytes - offset >= 4; offset += 4 )
        memcpy( out + offset, src_pattern, 4 );

    // 0..3 trailing bytes
    if( offset < bytes )
        memcpy( out + offset, src_pattern, bytes - offset );
}

#endif
|
|
|
|
#if defined( __APPLE__ )
|
|
#include <mach/mach_time.h>
|
|
#endif
|
|
|
|
uint64_t GetTime( void );

// Return the current value of the platform tick counter.  The unit is
// platform specific; SubtractTime() turns a pair of readings into seconds.
// Platforms other than Apple and MSVC have no implementation and get 0.
uint64_t GetTime( void )
{
    uint64_t ticks;

#if defined( __APPLE__ )
    ticks = mach_absolute_time();
#elif defined(_MSC_VER)
    ticks = ReadTime();
#else
    //mach_absolute_time is a high precision timer with precision < 1 microsecond.
#warning need accurate clock here.  Times are invalid.
    ticks = 0;
#endif

    return ticks;
}
|
|
|
|
|
|
#if ! defined (_MSC_VER)    /* for MSVC the function is defined in "compat.h" */
double SubtractTime( uint64_t endTime, uint64_t startTime );

// Convert the interval between two GetTime() readings to seconds.
// The ticks-to-seconds factor is looked up on first use and cached; only
// Apple platforms provide one, so elsewhere the result is always 0.0.
double SubtractTime( uint64_t endTime, uint64_t startTime )
{
    static double secondsPerTick = 0.0;
    const uint64_t elapsedTicks = endTime - startTime;

    if( secondsPerTick == 0.0 )
    {
#if defined( __APPLE__ )
        mach_timebase_info_data_t timebase = {0,0};
        if( 0 == mach_timebase_info( &timebase ) )
            secondsPerTick = 1e-9 * (double) timebase.numer / (double) timebase.denom;
#else
    // This function consumes output from GetTime() above, and converts the time to seconds.
#warning need accurate ticks to seconds conversion factor here. Times are invalid.
#endif
    }

    // strictly speaking we should also be subtracting out timer latency here
    return secondsPerTick * (double) elapsedTicks;
}
#endif
|
|
|
|
typedef struct CalcReferenceValuesInfo
|
|
{
|
|
struct WriteInputBufferInfo *parent; // pointer back to the parent WriteInputBufferInfo struct
|
|
cl_kernel kernel; // the kernel for this vector size
|
|
cl_program program; // the program for this vector size
|
|
cl_uint vectorSize; // the vector size for this callback chain
|
|
void *p; // the pointer to mapped result data for this vector size
|
|
cl_int result;
|
|
}CalcReferenceValuesInfo;
|
|
|
|
typedef struct WriteInputBufferInfo
|
|
{
|
|
volatile cl_event calcReferenceValues; // user event which signals when main thread is done calculating reference values
|
|
volatile cl_event doneBarrier; // user event which signals when worker threads are done
|
|
cl_uint count; // the number of elements in the array
|
|
Type outType; // the data type of the conversion result
|
|
Type inType; // the data type of the conversion input
|
|
volatile int barrierCount;
|
|
CalcReferenceValuesInfo calcInfo[kCallStyleCount];
|
|
}WriteInputBufferInfo;
|
|
|
|
cl_uint RoundUpToNextPowerOfTwo( cl_uint x );

// Return x unchanged when it is zero or already a power of two, otherwise
// the next larger power of two.  Values above 2^31 wrap around to 0.
cl_uint RoundUpToNextPowerOfTwo( cl_uint x )
{
    // Copy the highest set bit of (x - 1) into every lower position, then
    // add one: the carry ripples up to the next power of two.  (For x == 0
    // the subtraction wraps to all ones and the final increment wraps back
    // to 0.)
    cl_uint smeared = x - 1;

    smeared |= smeared >> 1;
    smeared |= smeared >> 2;
    smeared |= smeared >> 4;
    smeared |= smeared >> 8;
    smeared |= smeared >> 16;

    return smeared + 1;
}
|
|
|
|
void CL_CALLBACK WriteInputBufferComplete( cl_event, cl_int, void * );
|
|
|
|
typedef struct DataInitInfo
|
|
{
|
|
cl_ulong start;
|
|
cl_uint size;
|
|
Type outType;
|
|
Type inType;
|
|
SaturationMode sat;
|
|
RoundingMode round;
|
|
MTdata *d;
|
|
}DataInitInfo;
|
|
|
|
cl_int InitData( cl_uint job_id, cl_uint thread_id, void *p );

// Worker callback: fill job number job_id's slice of gIn with input values
// for the conversion described by the DataInitInfo in p, using the calling
// thread's random generator.  Always returns CL_SUCCESS.
cl_int InitData( cl_uint job_id, cl_uint thread_id, void *p )
{
    DataInitInfo *job = (DataInitInfo*) p;

    // Each job owns job->size consecutive elements of the input buffer.
    char *slice = (char*)gIn + job_id * job->size * gTypeSizes[job->inType];

    gInitFunctions[ job->inType ]( slice,
                                   job->sat,
                                   job->round,
                                   job->outType,
                                   job->start + job_id * job->size,
                                   job->size,
                                   job->d[thread_id] );

    return CL_SUCCESS;
}
|
|
|
|
// Sets allow[k] to non-zero for every 32-bit pattern x[k] whose bits under the
// mask 0x7f800000 are all clear. Existing non-zero flags are left set.
static void setAllowZ(uint8_t *allow, uint32_t *x, cl_uint count)
{
    const uint32_t exponentMask = 0x7f800000U;
    cl_uint k;

    for (k = 0; k != count; ++k)
    {
        if ((x[k] & exponentMask) == 0)
            allow[k] |= (uint8_t)1;
    }
}
|
|
|
|
cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p );

// Thread-pool job: computes the reference results for the job_id'th slice of gIn
// into gRef (and, for real conversions, the allow-zero flags into gAllowZ).
// p points at the DataInitInfo describing the conversion. Always returns CL_SUCCESS.
cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p )
{
    DataInitInfo *job = (DataInitInfo*) p;
    const cl_uint count = job->size;
    const Type inType = job->inType;
    const Type outType = job->outType;
    const RoundingMode round = job->round;
    size_t j;

    Force64BitFPUPrecision();

    // Slices of the global buffers that belong to this job
    void *src = (cl_uchar*) gIn + job_id * count * gTypeSizes[ inType ];
    void *allowZero = (cl_uchar*) gAllowZ + job_id * count;
    void *ref = (cl_uchar*) gRef + job_id * count * gTypeSizes[ outType ];

    if( outType == inType )
    {
        // Same type on both sides: the reference is simply the input
        memcpy( ref, src, count * gTypeSizes[ inType ] );
    }
    else
    {
        // Pick the plain or saturated reference conversion
        Convert convert = gConversions[ outType ][ inType ];
        if( job->sat )
            convert = gSaturatedConversions[ outType ][ inType ];

        // Run the conversion under the requested rounding mode, then restore the previous one
        RoundingMode savedRound = set_round( round, outType );
        convert( ref, src, count );
        set_round( savedRound, outType );

        // Decide if we allow a zero result in addition to the correctly rounded one
        memset( allowZero, 0, count );
        if( gForceFTZ )
        {
            if( inType == kfloat )
                setAllowZ( (uint8_t*) allowZero, (uint32_t*) src, count );
            if( outType == kfloat )
                setAllowZ( (uint8_t*) allowZero, (uint32_t*) ref, count );
        }
    }

    if( outType != kfloat && outType != kdouble )
    {
        // NaN -> integer may produce any integer, so normalize those reference entries to zero
        const size_t outSize = gTypeSizes[ outType ];

        if( inType == kfloat )
        {
            const float *in = (float*) src;
            for( j = 0; j < count; j++ )
                if( isnan( in[j] ) )
                    memset( (char*) ref + j * outSize, 0, outSize );
        }
        else if( inType == kdouble )
        {
            const double *in = (double*) src;
            for( j = 0; j < count; j++ )
                if( isnan( in[j] ) )
                    memset( (char*) ref + j * outSize, 0, outSize );
        }
    }
    else if( inType == kfloat && outType == kdouble )
    {
        // NaN conversions for float <-> double can be any NaN, so use the canonical one
        const float *in = (float*) src;
        for( j = 0; j < count; j++ )
            if( isnan( in[j] ) )
                ((double*) ref)[j] = NAN;
    }
    else if( inType == kdouble && outType == kfloat )
    {
        const double *in = (double*) src;
        for( j = 0; j < count; j++ )
            if( isnan( in[j] ) )
                ((float*) ref)[j] = NAN;
    }

    return CL_SUCCESS;
}
|
|
|
|
// Runs the conversion test for one (outType, inType, sat, round) combination.
//
// Builds one program/kernel per vector size in [gMinVectorSize, gMaxVectorSize), then
// walks the input space block by block. For each block the input data is generated on
// the thread pool, written to the device, converted by every kernel (driven from event
// callbacks) and compared against reference values computed on the host. When
// gTimeResults is set, the kernels are timed afterwards.
//
// d is the random number generator used to seed the per-thread generators.
//
// Returns 0 on success. Returns -1 if the random number generators or a program could
// not be created, -2 if a kernel is missing, and otherwise the OpenCL error code or the
// non-zero verification result that stopped the test.
// All programs, kernels and generators created here are released on every return path.
static int DoTest( Type outType, Type inType, SaturationMode sat, RoundingMode round, MTdata d )
{
#ifdef __APPLE__
    cl_ulong wall_start = mach_absolute_time();
#endif

    DataInitInfo init_info = { 0, 0, outType, inType, sat, round, NULL };
    WriteInputBufferInfo writeInputBufferInfo;
    int vectorSize;
    int error = 0;
    cl_uint threads = GetThreadCount();
    uint64_t i;

    gTestCount++;
    size_t blockCount = BUFFER_SIZE / MAX( gTypeSizes[ inType ], gTypeSizes[ outType ] );
    size_t step = blockCount;
    // Number of input cases to walk. Types wider than 32 bits are handled a bit differently:
    // they use a 2^32-case space. The width is tested first because shifting a 64-bit value
    // by 64 is undefined behavior.
    uint64_t lastCase = ( 8*gTypeSizes[ inType ] > 32 ) ? 0x100000000ULL
                                                       : ( 1ULL << (8*gTypeSizes[ inType ]) );
    cl_event writeInputBuffer = NULL;

    memset( &writeInputBufferInfo, 0, sizeof( writeInputBufferInfo ) );

    // Zero-initialized so that the cleanup code can tell which generators were actually created
    init_info.d = (MTdata*)calloc( threads, sizeof( MTdata ) );
    if( NULL == init_info.d )
    {
        vlog_error( "ERROR: Unable to allocate storage for random number generator!\n" );
        error = -1;
        goto exit;
    }
    for( i = 0; i < threads; i++ )
    {
        init_info.d[i] = init_genrand( genrand_int32( d ) );
        if( NULL == init_info.d[i] )
        {
            vlog_error( "ERROR: Unable to allocate storage for random number generator!\n" );
            error = -1;
            goto exit;
        }
    }

    writeInputBufferInfo.outType = outType;
    writeInputBufferInfo.inType = inType;

    // Build one program and kernel per vector size
    for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
    {
        writeInputBufferInfo.calcInfo[vectorSize].program = MakeProgram( outType, inType, sat, round, vectorSize,
                                                                         &writeInputBufferInfo.calcInfo[vectorSize].kernel );
        if( NULL == writeInputBufferInfo.calcInfo[vectorSize].program )
        {
            gFailCount++;
            error = -1;
            goto exit;
        }
        if( NULL == writeInputBufferInfo.calcInfo[vectorSize].kernel )
        {
            gFailCount++;
            vlog_error( "\t\tFAILED -- Failed to create kernel.\n" );
            error = -2;
            goto exit;
        }

        writeInputBufferInfo.calcInfo[vectorSize].parent = &writeInputBufferInfo;
        writeInputBufferInfo.calcInfo[vectorSize].vectorSize = vectorSize;
        writeInputBufferInfo.calcInfo[vectorSize].result = -1;
    }

    if( gSkipTesting )
        goto exit;

    // Patch up rounding mode if default is RTZ
    // We leave the part above in default rounding mode so that the right kernel is compiled.
    if( round == kDefaultRoundingMode && gIsRTZ && (outType == kfloat) )
        init_info.round = round = kRoundTowardZero;

    // In wimpy mode only every gWimpyReductionFactor'th block is tested
    if ( gWimpyMode )
        step = (size_t)blockCount * (size_t)gWimpyReductionFactor;

    vlog( "Testing... " );
    fflush(stdout);
    for( i = 0; i < (uint64_t)lastCase; i += step )
    {
        // progress indicator
        if( 0 == ( i & ((lastCase >> 3) -1))) {
            vlog(".");
            fflush(stdout);
        }

        cl_uint count = (uint32_t) MIN( blockCount, lastCase - i );
        writeInputBufferInfo.count = count;

        // Create a user event to represent the status of the reference value computation completion
        writeInputBufferInfo.calcReferenceValues = clCreateUserEvent( gContext, &error);
        if( error || NULL == writeInputBufferInfo.calcReferenceValues )
        {
            vlog_error( "ERROR: Unable to create user event. (%d)\n", error );
            gFailCount++;
            goto exit;
        }

        // retain for consumption by MapOutputBufferComplete
        for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
        {
            if( (error = clRetainEvent(writeInputBufferInfo.calcReferenceValues) ))
            {
                vlog_error( "ERROR: Unable to retain user event. (%d)\n", error );
                gFailCount++;
                goto exit;
            }
        }

        // Create a user event to represent when the callbacks are done verifying correctness
        writeInputBufferInfo.doneBarrier = clCreateUserEvent( gContext, &error);
        if( error || NULL == writeInputBufferInfo.doneBarrier )
        {
            vlog_error( "ERROR: Unable to create user event for barrier. (%d)\n", error );
            gFailCount++;
            goto exit;
        }

        // retain for use by the callback that calls this
        if( (error = clRetainEvent(writeInputBufferInfo.doneBarrier) ))
        {
            vlog_error( "ERROR: Unable to retain user event doneBarrier. (%d)\n", error );
            gFailCount++;
            goto exit;
        }

        // Generate the input data in a multithreaded manner. Split the block into chunks,
        // falling back to fewer chunks when they would hold fewer than 16384 elements.
        cl_uint chunks = RoundUpToNextPowerOfTwo(threads) * 2;
        init_info.start = i;
        init_info.size = count / chunks;
        if( init_info.size < 16384 )
        {
            chunks = RoundUpToNextPowerOfTwo(threads);
            init_info.size = count / chunks;
            if( init_info.size < 16384 )
            {
                init_info.size = count;
                chunks = 1;
            }
        }
        ThreadPool_Do(InitData, chunks, &init_info);

        // Copy the results to the device
        writeInputBuffer = NULL;
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, count * gTypeSizes[inType], gIn, 0, NULL, &writeInputBuffer )))
        {
            vlog_error( "ERROR: clEnqueueWriteBuffer failed. (%d)\n", error );
            gFailCount++;
            goto exit;
        }

        // Setup completion callback for the write, which will enqueue the rest of the work
        // This is somewhat gratuitous.  Because this is an in order queue, we didn't really need to
        // do this work in a callback. We could have done it from the main thread.  Here we are
        // verifying that the implementation can enqueue work from a callback, while at the same time
        // also checking to make sure that the conversions work.
        //
        // Because the verification code is also moved to a callback, it is hoped that implementations will
        // achieve a test performance improvement because they can verify the results in parallel.  If the
        // implementation serializes callbacks however, that won't happen.   Consider it some motivation
        // to do the right thing! :-)
        if( (error = clSetEventCallback( writeInputBuffer, CL_COMPLETE, WriteInputBufferComplete, &writeInputBufferInfo)) )
        {
            vlog_error( "ERROR: clSetEventCallback failed. (%d)\n", error );
            gFailCount++;
            goto exit;
        }

        // The event can't be destroyed until the callback is called, so we can release it now.
        if( (error = clReleaseEvent(writeInputBuffer) ))
        {
            vlog_error( "ERROR: clReleaseEvent failed. (%d)\n", error );
            gFailCount++;
            goto exit;
        }

        // Make sure the work is actually running, so we don't deadlock
        if( (error = clFlush( gQueue ) ) )
        {
            vlog_error( "clFlush failed with error %d\n", error );
            gFailCount++;
            goto exit;
        }

        // Compute the reference values on the host while the device works
        ThreadPool_Do(PrepareReference, chunks, &init_info);

        // signal we are done calculating the reference results
        if( (error = clSetUserEventStatus( writeInputBufferInfo.calcReferenceValues, CL_COMPLETE ) ) )
        {
            vlog_error( "Error:  Failed to set user event status to CL_COMPLETE:  %d\n", error );
            gFailCount++;
            goto exit;
        }

        // Wait for the event callbacks to finish verifying correctness.
        if( (error = clWaitForEvents( 1, (cl_event*) &writeInputBufferInfo.doneBarrier ) ))
        {
            vlog_error( "Error:  Failed to wait for barrier:  %d\n", error );
            gFailCount++;
            goto exit;
        }

        if( (error = clReleaseEvent(writeInputBufferInfo.calcReferenceValues ) ))
        {
            vlog_error( "Error:  Failed to release calcReferenceValues:  %d\n", error );
            gFailCount++;
            goto exit;
        }

        if( (error = clReleaseEvent(writeInputBufferInfo.doneBarrier ) ))
        {
            vlog_error( "Error:  Failed to release done barrier:  %d\n", error );
            gFailCount++;
            goto exit;
        }

        // Report the first failure. A non-zero result is used as 1 + the index of the offending input.
        for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
        {
            if( ( error = writeInputBufferInfo.calcInfo[ vectorSize ].result ))
            {
                switch( inType )
                {
                    case kuchar:
                    case kchar:
                        vlog( "Input value: 0x%2.2x ", ((unsigned char*)gIn)[error - 1] );
                        break;
                    case kushort:
                    case kshort:
                        vlog( "Input value: 0x%4.4x ", ((unsigned short*)gIn)[error - 1] );
                        break;
                    case kuint:
                    case kint:
                        vlog( "Input value: 0x%8.8x ", ((unsigned int*)gIn)[error - 1] );
                        break;
                    case kfloat:
                        vlog( "Input value: %a ", ((float*)gIn)[error - 1] );
                        break;
                    case kulong:
                    case klong:
                        vlog( "Input value: 0x%16.16llx ", ((unsigned long long*)gIn)[error - 1] );
                        break;
                    case kdouble:
                        vlog( "Input value: %a ", ((double*)gIn)[error - 1]);
                        break;
                    default:
                        vlog_error( "Internal error at %s: %d\n", __FILE__, __LINE__ );
                        abort();
                        break;
                }

                // tell the user which conversion it was.
                if( 0 == vectorSize )
                    vlog( " (implicit scalar conversion from %s to %s)\n", gTypeNames[ inType ], gTypeNames[ outType ] );
                else
                    vlog( " (convert_%s%s%s%s( %s%s ))\n", gTypeNames[outType], sizeNames[vectorSize], gSaturationNames[ sat ],
                                                            gRoundingModeNames[ round ], gTypeNames[inType], sizeNames[vectorSize] );

                gFailCount++;
                goto exit;
            }
        }
    }

    log_info( "done.\n" );

    if( gTimeResults )
    {
        //Kick off tests for the various vector lengths
        for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
        {
            size_t workItemCount = blockCount / vectorSizes[vectorSize];
            if( vectorSizes[vectorSize] * gTypeSizes[outType] < 4 )
                workItemCount /= 4 / (vectorSizes[vectorSize] * gTypeSizes[outType]);

            double sum = 0.0;
            double bestTime = INFINITY;
            cl_uint k;
            for( k = 0; k < PERF_LOOP_COUNT; k++ )
            {
                uint64_t startTime = GetTime();
                if( (error = RunKernel( writeInputBufferInfo.calcInfo[vectorSize].kernel, gInBuffer, gOutBuffers[ vectorSize ], workItemCount )) )
                {
                    gFailCount++;
                    goto exit;
                }

                // Make sure OpenCL is done
                if( (error = clFinish(gQueue) ) )
                {
                    vlog_error( "Error %d at clFinish\n", error );
                    gFailCount++;   // count this failure like every other error path
                    goto exit;
                }

                uint64_t endTime = GetTime();
                double time = SubtractTime( endTime, startTime );
                sum += time;
                if( time < bestTime )
                    bestTime = time;
            }

            if( gReportAverageTimes )
                bestTime = sum / PERF_LOOP_COUNT;
            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (workItemCount * vectorSizes[vectorSize]);
            if( 0 == vectorSize )
                vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "implicit convert %s -> %s", gTypeNames[ inType ], gTypeNames[ outType ] );
            else
                vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "convert_%s%s%s%s( %s%s )", gTypeNames[ outType ], sizeNames[vectorSize], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType], sizeNames[vectorSize] );
        }
    }

    if( gWimpyMode )
        vlog( "\tWimp pass" );
    else
        vlog( "\tpassed" );

#ifdef __APPLE__
    // record the run time
    vlog( "\t(%f s)", 1e-9 * ( mach_absolute_time() - wall_start ) );
#endif
    vlog( "\n\n" );
    fflush( stdout );

exit:
    //clean up -- entries that were never created are still NULL from the memset/calloc above
    for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
    {
        if( writeInputBufferInfo.calcInfo[vectorSize].program )
            clReleaseProgram( writeInputBufferInfo.calcInfo[vectorSize].program );
        if( writeInputBufferInfo.calcInfo[vectorSize].kernel )
            clReleaseKernel( writeInputBufferInfo.calcInfo[vectorSize].kernel );
    }

    if( init_info.d )
    {
        for( i = 0; i < threads; i++ )
        {
            if( init_info.d[i] )
                free_mtdata(init_info.d[i]);
        }
        free(init_info.d);
    }

    return error;
}
|
|
|
|
void CL_CALLBACK MapResultValuesComplete( cl_event e, cl_int status, void *data );
|
|
|
|
// Note: not called reentrantly
|
|
void CL_CALLBACK WriteInputBufferComplete( cl_event e, cl_int status, void *data )
|
|
{
|
|
WriteInputBufferInfo *info = (WriteInputBufferInfo*) data;
|
|
cl_uint count = info->count;
|
|
int vectorSize;
|
|
|
|
if( CL_SUCCESS != status )
|
|
{
|
|
vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status );
|
|
gFailCount++;
|
|
return;
|
|
}
|
|
|
|
info->barrierCount = gMaxVectorSize - gMinVectorSize;
|
|
|
|
// now that we know that the write buffer is complete, enqueue callbacks to wait for the main thread to
|
|
// finish calculating the reference results.
|
|
for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
|
|
{
|
|
size_t workItemCount = (count + vectorSizes[vectorSize] - 1) / ( vectorSizes[vectorSize]);
|
|
cl_event mapComplete = NULL;
|
|
|
|
if( (status = RunKernel( info->calcInfo[ vectorSize ].kernel, gInBuffer, gOutBuffers[ vectorSize ], workItemCount )) )
|
|
{
|
|
gFailCount++;
|
|
return;
|
|
}
|
|
|
|
info->calcInfo[vectorSize].p = clEnqueueMapBuffer( gQueue, gOutBuffers[ vectorSize ], CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
|
|
0, count * gTypeSizes[ info->outType ], 0, NULL, &mapComplete, &status);
|
|
{
|
|
if( status )
|
|
{
|
|
vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status );
|
|
gFailCount++;
|
|
return;
|
|
}
|
|
}
|
|
|
|
if( (status = clSetEventCallback( mapComplete, CL_COMPLETE, MapResultValuesComplete, info->calcInfo + vectorSize)))
|
|
{
|
|
vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status );
|
|
gFailCount++;
|
|
return;
|
|
}
|
|
|
|
if( (status = clReleaseEvent(mapComplete)))
|
|
{
|
|
vlog_error( "ERROR: clReleaseEvent calback failed in WriteInputBufferComplete for vector size %d with status: %d\n", vectorSize, status );
|
|
gFailCount++;
|
|
return;
|
|
}
|
|
}
|
|
|
|
// Make sure the work starts moving -- otherwise we may deadlock
|
|
if( (status = clFlush(gQueue)))
|
|
{
|
|
vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status );
|
|
gFailCount++;
|
|
return;
|
|
}
|
|
|
|
// e was already released by the main thread. It should be destroyed automatically soon after we exit.
|
|
}
|
|
|
|
void CL_CALLBACK CalcReferenceValuesComplete( cl_event e, cl_int status, void *data );
|
|
|
|
// Note: May be called reentrantly
|
|
void CL_CALLBACK MapResultValuesComplete( cl_event e, cl_int status, void *data )
|
|
{
|
|
CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo*) data;
|
|
cl_event calcReferenceValues = info->parent->calcReferenceValues;
|
|
|
|
if( CL_SUCCESS != status )
|
|
{
|
|
vlog_error( "ERROR: MapResultValuesComplete calback failed with status: %d\n", status );
|
|
gFailCount++; // not thread safe -- being lazy here
|
|
clReleaseEvent(calcReferenceValues);
|
|
return;
|
|
}
|
|
|
|
// we know that the map is done, wait for the main thread to finish calculating the reference values
|
|
if( (status = clSetEventCallback( calcReferenceValues, CL_COMPLETE, CalcReferenceValuesComplete, data )))
|
|
{
|
|
vlog_error( "ERROR: clSetEventCallback failed in MapResultValuesComplete with status: %d\n", status );
|
|
gFailCount++; // not thread safe -- being lazy here
|
|
}
|
|
|
|
// this thread no longer needs its reference to info->calcReferenceValues, so release it
|
|
if( (status = clReleaseEvent(calcReferenceValues) ))
|
|
{
|
|
vlog_error( "ERROR: clReleaseEvent(info->calcReferenceValues) failed with status: %d\n", status );
|
|
gFailCount++; // not thread safe -- being lazy here
|
|
}
|
|
|
|
// no need to flush since we didn't enqueue anything
|
|
|
|
// e was already released by WriteInputBufferComplete. It should be destroyed automatically soon after we exit.
|
|
}
|
|
|
|
|
|
// Event callback registered on the calcReferenceValues user event (data is a
// CalcReferenceValuesInfo). Compares the mapped device results for one vector size
// against gRef, stores the outcome in info->result, overwrites and unmaps the output
// buffer, and signals the parent's doneBarrier when the last outstanding callback finishes.
void CL_CALLBACK CalcReferenceValuesComplete( cl_event e, cl_int status, void *data )
{
    CalcReferenceValuesInfo *calc = (CalcReferenceValuesInfo*) data;
    WriteInputBufferInfo *shared = calc->parent;
    const cl_uint vectorSize = calc->vectorSize;
    const cl_uint count = shared->count;
    const Type outType = shared->outType;   // the data type of the conversion result
    const Type inType = shared->inType;     // the data type of the conversion input
    cl_event doneBarrier = shared->doneBarrier;
    size_t j;
    cl_int error;

    // report spurious error condition
    if( status != CL_SUCCESS )
    {
        vlog_error( "ERROR: CalcReferenceValuesComplete did not succeed! (%d)\n", status );
        gFailCount++;       // lazy about thread safety here
        return;
    }

    // Now we know that both results have been mapped back from the device, and the
    // main thread is done calculating the reference results. It is now time to check
    // the results.
    void *mapped = calc->p;

    if( outType != kfloat && outType != kdouble )
    {
        // NaN -> integer may produce any integer, so zero those results before comparing
        const size_t outSize = gTypeSizes[ outType ];

        if( inType == kfloat )
        {
            const float *in = (float*) gIn;
            for( j = 0; j < count; j++ )
                if( isnan( in[j] ) )
                    memset( (char*) mapped + j * outSize, 0, outSize );
        }
        else if( inType == kdouble )
        {
            const double *in = (double*) gIn;
            for( j = 0; j < count; j++ )
                if( isnan( in[j] ) )
                    memset( (char*) mapped + j * outSize, 0, outSize );
        }
    }
    else if( inType == kfloat && outType == kdouble )
    {
        // NaN conversions for float <-> double can be any NaN, so canonicalize NaN results
        const float *in = (float*) gIn;
        double *out = (double*) mapped;
        for( j = 0; j < count; j++ )
            if( isnan( in[j] ) && isnan( out[j] ) )
                out[j] = NAN;
    }
    else if( inType == kdouble && outType == kfloat )
    {
        const double *in = (double*) gIn;
        float *out = (float*) mapped;
        for( j = 0; j < count; j++ )
            if( isnan( in[j] ) && isnan( out[j] ) )
                out[j] = NAN;
    }

    // Bit-exact match passes right away; otherwise the per-type checker gets the final say
    if( 0 == memcmp( mapped, gRef, count * gTypeSizes[ outType ] ) )
        calc->result = 0;
    else
        calc->result = gCheckResults[outType]( mapped, gRef, gAllowZ, count, vectorSizes[vectorSize] );

    // Fill the output buffer with junk and release it
    {
        cl_uint junk = 0xffffdead;
        memset_pattern4( mapped, &junk, count * gTypeSizes[outType] );
        error = clEnqueueUnmapMemObject( gQueue, gOutBuffers[ vectorSize ], mapped, 0, NULL, NULL );
        if( error )
        {
            vlog_error( "ERROR: clEnqueueUnmapMemObject failed in CalcReferenceValuesComplete  (%d)\n", error );
            gFailCount++;
        }
    }

    // The callback that takes the counter from 1 to 0 is the last one: it signals the barrier
    if( 1 == ThreadPool_AtomicAdd( &shared->barrierCount, -1) )
    {
        status = clSetUserEventStatus( doneBarrier, CL_COMPLETE );
        if( status )
        {
            vlog_error( "ERROR: clSetUserEventStatus failed in CalcReferenceValuesComplete (err: %d). We're probably going to deadlock.\n", status );
            gFailCount++;
            return;
        }

        status = clReleaseEvent( doneBarrier );
        if( status )
        {
            vlog_error( "ERROR: clReleaseEvent failed in CalcReferenceValuesComplete (err: %d).\n", status );
            gFailCount++;
            return;
        }
    }

    // e was already released by WriteInputBufferComplete. It should be destroyed automatically soon after
    // all the calls to CalcReferenceValuesComplete exit.
}
|
|
|
|
// Creates (but does not build) the program that tests the implicit scalar conversion
// "dest[i] = src[i]" from inType to outType.
//
// testName receives the kernel name "test_implicit_<out>_<in>" (buffer of 256 chars).
// *error receives the status of clCreateProgramWithSource.
// sat, round and vectorSize are not used here; the signature parallels CreateStandardProgram.
// Returns the program, or NULL after logging if creation failed.
static cl_program CreateImplicitConvertProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, char testName[256], cl_int *error )
{
    char inName[32];
    char outName[32];
    // The source is assembled from these pieces by clCreateProgramWithSource; the pieces
    // point at testName/inName/outName, which are filled in below.
    const char *programSource[] =
    {
        "", // optional pragma
        "__kernel void ", testName, "( __global ", inName, " *src, __global ", outName, " *dest )\n"
        "{\n"
        "   size_t i = get_global_id(0);\n"
        "   dest[i] =  src[i];\n"
        "}\n"
    };
    size_t stringCount = sizeof( programSource ) / sizeof( programSource[0] );
    const char **strings = programSource;

    if (outType == kdouble || inType == kdouble)
        programSource[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";

    //create the type name
    // snprintf always NUL-terminates, unlike strncpy, and bounds the write into testName
    snprintf( inName, sizeof( inName ), "%s", gTypeNames[ inType ] );
    snprintf( outName, sizeof( outName ), "%s", gTypeNames[ outType ] );
    snprintf( testName, 256, "test_implicit_%s_%s", outName, inName );
    vlog( "Building implicit %s -> %s conversion test\n", gTypeNames[ inType ], gTypeNames[ outType ] );
    fflush(stdout);

    //create the program
    cl_program program = clCreateProgramWithSource(gContext, (cl_uint) stringCount, strings, NULL, error);
    if( NULL == program || *error )
    {
        vlog_error( "\t\tFAILED -- Failed to create program. (%d)\n", *error );
        return NULL;
    }

    return program;
}
|
|
|
|
|
|
// Creates (but does not build) the program that tests the explicit conversion
// convert_<out><N><sat><round>() for the vector size selected by vectorSize
// (an index into vectorSizes[]).
//
// testName receives the kernel name (buffer of 256 chars).
// *error receives the status of clCreateProgramWithSource.
// Returns the program, or NULL after logging if creation failed.
static cl_program CreateStandardProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, char testName[256], cl_int *error )
{
    // From here on vectorSize is the actual element count, not the table index
    vectorSize = vectorSizes[ vectorSize ];

    char convertString[128];
    char inName[32];
    char outName[32];
    // The sources are assembled from these pieces by clCreateProgramWithSource; the pieces
    // point at testName/inName/outName/convertString, which are filled in below.
    const char *programSource[] =
    {
        "", // optional pragma
        "__kernel void ", testName, "( __global ", inName, " *src, __global ", outName, " *dest )\n"
        "{\n"
        "   size_t i = get_global_id(0);\n"
        "   dest[i] = ", convertString, "( src[i] );\n"
        "}\n"
    };
    // 3-element vectors use vload3/vstore3, with a separate path for the last work item
    const char *programSourceV3[] =
    {
        "", // optional pragma
        "__kernel void ", testName, "( __global ", inName, " *src, __global ", outName, " *dest )\n"
        "{\n"
        "   size_t i = get_global_id(0);\n"
        "   if( i + 1 < get_global_size(0))\n"
        "       vstore3( ", convertString, "( vload3( i, src)), i, dest );\n"
        "   else\n"
        "   {\n"
        "       ", inName, "3 in;\n"
        "       ", outName, "3 out;\n"
        "       if( 0 == (i & 1) )\n"
        "           in.y = src[3*i+1];\n"
        "       in.x = src[3*i];\n"
        "       out = ", convertString, "( in ); \n"
        "       dest[3*i] = out.x;\n"
        "       if( 0 == (i & 1) )\n"
        "           dest[3*i+1] = out.y;\n"
        "   }\n"
        "}\n"
    };
    size_t stringCount = 3 == vectorSize ?  sizeof( programSourceV3 ) / sizeof( programSourceV3[0] ) :
                                            sizeof( programSource ) / sizeof( programSource[0] );
    const char **strings = 3 == vectorSize ? programSourceV3 : programSource;

    if (outType == kdouble || inType == kdouble) {
        programSource[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
        programSourceV3[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
    }

    //create the type name
    // snprintf is used throughout because, unlike strncpy, it always NUL-terminates
    switch (vectorSize)
    {
        case 1:
            snprintf( inName, sizeof( inName ), "%s", gTypeNames[ inType ] );
            snprintf( outName, sizeof( outName ), "%s", gTypeNames[ outType ] );
            snprintf( convertString, sizeof(convertString), "convert_%s%s%s", outName, gSaturationNames[ sat ], gRoundingModeNames[ round ] );
            snprintf( testName, 256, "test_%s_%s", convertString, inName );
            vlog( "Building %s( %s ) test\n", convertString, inName );
            break;
        case 3:
            // the V3 source appends the "3" to the base type names itself
            snprintf( inName, sizeof( inName ), "%s", gTypeNames[ inType ] );
            snprintf( outName, sizeof( outName ), "%s", gTypeNames[ outType ] );
            snprintf( convertString, sizeof(convertString), "convert_%s3%s%s", outName, gSaturationNames[ sat ], gRoundingModeNames[ round ] );
            snprintf( testName, 256, "test_%s_%s3", convertString, inName );
            vlog( "Building %s( %s3 ) test\n", convertString, inName );
            break;
        default:
            snprintf( inName, sizeof( inName ), "%s%d", gTypeNames[ inType ], vectorSize );
            snprintf( outName, sizeof( outName ), "%s%d", gTypeNames[ outType ], vectorSize );
            snprintf( convertString, sizeof(convertString), "convert_%s%s%s", outName, gSaturationNames[ sat ], gRoundingModeNames[ round ] );
            snprintf( testName, 256, "test_%s_%s", convertString, inName );
            vlog( "Building %s( %s ) test\n", convertString, inName );
            break;
    }

    fflush(stdout);

    //create the program
    cl_program program = clCreateProgramWithSource(gContext, (cl_uint) stringCount, strings, NULL, error);
    if( NULL == program || *error )
    {
        vlog_error( "\t\tFAILED -- Failed to create program. (%d)\n", *error );
        return NULL;
    }

    return program;
}
|
|
|
|
|
|
// Creates and builds the conversion test program for one vector size and creates its kernel.
//
// vectorSize 0 selects the implicit scalar conversion test; any other value selects the
// explicit convert_* test for that vectorSizes[] entry.
// *outKernel receives the kernel on success and is NULL on every failure path.
// Returns the built program, or NULL after logging if any step failed; nothing is left
// for the caller to release in that case.
static cl_program MakeProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, cl_kernel *outKernel )
{
    cl_program program;
    char testName[256];
    int error = 0;

    *outKernel = NULL;

    // Create the program.
    if( 0 == vectorSize )
        program = CreateImplicitConvertProgram( outType, inType, sat, round, vectorSize, testName, &error );
    else
        program = CreateStandardProgram( outType, inType, sat, round, vectorSize, testName, &error );

    // The helpers have already logged the failure; do not hand a NULL program to the build calls
    if( NULL == program )
        return NULL;

    const char *flags = NULL;
    if( gForceFTZ )
        flags = "-cl-denorms-are-zero";

    // build it
    if( (error = clBuildProgram( program, 1, &gDevice, flags, NULL, NULL )))
    {
        char buffer[2048] = "";

        vlog_error("\t\tFAILED -- clBuildProgramExecutable() failed: %d\n", error);
        clGetProgramBuildInfo(program, gDevice, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
        vlog_error("Log: %s\n", buffer);

        clReleaseProgram( program );
        return NULL;
    }

    *outKernel = clCreateKernel(program, testName, &error);
    if( NULL == *outKernel || error)
    {
        char buffer[2048] = "";

        vlog_error("\t\tFAILED -- clCreateKernel() failed (%d):\n", error);
        clGetProgramBuildInfo(program, gDevice, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
        vlog_error("Log: %s\n", buffer);

        // Never leave the caller with a kernel whose program is gone
        if( NULL != *outKernel )
        {
            clReleaseKernel( *outKernel );
            *outKernel = NULL;
        }
        clReleaseProgram( program );
        return NULL;
    }

    return program;
}
|
|
|