From ea49084976274951d6108dc54638981837f7cce1 Mon Sep 17 00:00:00 2001 From: Steven Winston Date: Tue, 24 Jan 2023 08:53:18 -0800 Subject: [PATCH] Conversions (#1555) * grab latest from upstream OpenCL * Removed events for host to device data transfers * grab latest from upstream OpenCL * 1.) revert changes to CMakeLists.txt and run_batch script in hopes this will solve the CI issues for the PR. 2.) resolve the merge conflict in test_conversions.cpp * 1.) resolve 2 additional merge conflicts * 1.) resolve 1 additional merge conflicts * locally this fails clang-format but CI version seems to require it. * fix the warning. * Remove now-unused event Also, cut down a comment that is no longer accurate. Co-authored-by: Joshua Luceno Co-authored-by: Chip Davis Co-authored-by: Chip Davis --- .../conversions/test_conversions.cpp | 1422 +++++++++-------- 1 file changed, 753 insertions(+), 669 deletions(-) diff --git a/test_conformance/conversions/test_conversions.cpp b/test_conformance/conversions/test_conversions.cpp index f7a21f21..63940455 100644 --- a/test_conformance/conversions/test_conversions.cpp +++ b/test_conformance/conversions/test_conversions.cpp @@ -23,7 +23,7 @@ #include #endif -#if defined( __linux__ ) +#if defined(__linux__) #include #include #include @@ -53,7 +53,7 @@ #include "Sleep.h" #include "basic_test_conversions.h" -#if (defined(_WIN32) && defined (_MSC_VER)) +#if (defined(_WIN32) && defined(_MSC_VER)) // need for _controlfp_s and rouinding modes in RoundingMode #include "harness/testHarness.h" #endif @@ -61,68 +61,73 @@ #pragma mark - #pragma mark globals -#define BUFFER_SIZE (1024*1024) -#define kPageSize 4096 +#define BUFFER_SIZE (1024 * 1024) +#define kPageSize 4096 #define EMBEDDED_REDUCTION_FACTOR 16 #define PERF_LOOP_COUNT 100 -#define kCallStyleCount (kVectorSizeCount + 1 /* for implicit scalar */) +#define kCallStyleCount (kVectorSizeCount + 1 /* for implicit scalar */) #if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__) #include "fplib.h" - extern bool qcom_sat; - extern roundingMode qcom_rm; +extern bool qcom_sat; +extern roundingMode qcom_rm; #endif -const char ** argList = NULL; -int argCount = 0; -cl_context gContext = NULL; -cl_command_queue gQueue = NULL; -char appName[64] = "ctest"; -int gStartTestNumber = -1; -int gEndTestNumber = 0; -#if defined( __APPLE__ ) -int gTimeResults = 1; +const char **argList = NULL; +int argCount = 0; +cl_context gContext = NULL; +cl_command_queue gQueue = NULL; +char appName[64] = "ctest"; +int gStartTestNumber = -1; +int gEndTestNumber = 0; +#if defined(__APPLE__) +int gTimeResults = 1; #else -int gTimeResults = 0; +int gTimeResults = 0; #endif -int gReportAverageTimes = 0; -void *gIn = NULL; -void *gRef = NULL; -void *gAllowZ = NULL; -void *gOut[ kCallStyleCount ] = { NULL }; -cl_mem gInBuffer; -cl_mem gOutBuffers[ kCallStyleCount ]; -size_t gComputeDevices = 0; -uint32_t gDeviceFrequency = 0; -int gWimpyMode = 0; -int gWimpyReductionFactor = 128; -int gSkipTesting = 0; -int gForceFTZ = 0; -int gMultithread = 1; -int gIsRTZ = 0; -uint32_t gSimdSize = 1; -int gHasDouble = 0; -int gTestDouble = 1; -const char * sizeNames[] = { "", "", "2", "3", "4", "8", "16" }; -const int vectorSizes[] = { 1, 1, 2, 3, 4, 8, 16 }; -int gMinVectorSize = 0; -int gMaxVectorSize = sizeof(vectorSizes) / sizeof( vectorSizes[0] ); -static MTdata gMTdata; +int gReportAverageTimes = 0; +void *gIn = NULL; +void *gRef = NULL; +void *gAllowZ = NULL; +void *gOut[kCallStyleCount] = { NULL }; +cl_mem gInBuffer; +cl_mem gOutBuffers[kCallStyleCount]; +size_t gComputeDevices = 0; +uint32_t gDeviceFrequency = 0; +int gWimpyMode = 0; +int gWimpyReductionFactor = 128; +int gSkipTesting = 0; +int gForceFTZ = 0; +int gMultithread = 1; +int gIsRTZ = 0; +uint32_t gSimdSize = 1; +int gHasDouble = 0; +int gTestDouble = 1; +const char *sizeNames[] = { "", "", "2", "3", "4", "8", "16" }; +const int vectorSizes[] = { 1, 1, 2, 3, 4, 8, 16 }; +int gMinVectorSize = 0; +int gMaxVectorSize = sizeof(vectorSizes) / sizeof(vectorSizes[0]); +static MTdata gMTdata; #pragma mark - #pragma mark Declarations -static int ParseArgs( int argc, const char **argv ); -static void PrintUsage( void ); -test_status InitCL( cl_device_id device ); -static int GetTestCase( const char *name, Type *outType, Type *inType, SaturationMode *sat, RoundingMode *round ); -static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMode sat, RoundingMode round, MTdata d ); -static cl_program MakeProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, cl_kernel *outKernel ); -static int RunKernel( cl_kernel kernel, void *inBuf, void *outBuf, size_t blockCount ); +static int ParseArgs(int argc, const char **argv); +static void PrintUsage(void); +test_status InitCL(cl_device_id device); +static int GetTestCase(const char *name, Type *outType, Type *inType, + SaturationMode *sat, RoundingMode *round); +static int DoTest(cl_device_id device, Type outType, Type inType, + SaturationMode sat, RoundingMode round, MTdata d); +static cl_program MakeProgram(Type outType, Type inType, SaturationMode sat, + RoundingMode round, int vectorSize, + cl_kernel *outKernel); +static int RunKernel(cl_kernel kernel, void *inBuf, void *outBuf, + size_t blockCount); -void *FlushToZero( void ); -void UnFlushToZero( void *); +void *FlushToZero(void); +void UnFlushToZero(void *); // Windows (since long double got deprecated) sets the x87 to 53-bit precision // (that's x87 default state). This causes problems with the tests that @@ -139,15 +144,16 @@ static inline void Force64BitFPUPrecision(void) // divergent code just use inline assembly which works for both. unsigned short int orig_cw = 0; unsigned short int new_cw = 0; - __asm__ __volatile__ ("fstcw %0":"=m" (orig_cw)); - new_cw = orig_cw | 0x0300; // set precision to 64-bit - __asm__ __volatile__ ("fldcw %0"::"m" (new_cw)); + __asm__ __volatile__("fstcw %0" : "=m"(orig_cw)); + new_cw = orig_cw | 0x0300; // set precision to 64-bit + __asm__ __volatile__("fldcw %0" ::"m"(new_cw)); #else /* Implement for other platforms if needed */ #endif } -int test_conversions( cl_device_id device, cl_context context, cl_command_queue queue, int num_elements ) +int test_conversions(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements) { int error, i, testNumber = -1; int startMinVectorSize = gMinVectorSize; @@ -155,109 +161,148 @@ int test_conversions( cl_device_id device, cl_context context, cl_command_queue RoundingMode round; SaturationMode sat; - if( argCount ) + if (argCount) { - for( i = 0; i < argCount; i++ ) + for (i = 0; i < argCount; i++) { - if( GetTestCase( argList[i], &outType, &inType, &sat, &round ) ) + if (GetTestCase(argList[i], &outType, &inType, &sat, &round)) { - vlog_error( "\n\t\t**** ERROR: Unable to parse function name %s. Skipping.... *****\n\n", argList[i] ); + vlog_error("\n\t\t**** ERROR: Unable to parse function name " + "%s. Skipping.... *****\n\n", + argList[i]); continue; } // skip double if we don't have it - if( !gTestDouble && (inType == kdouble || outType == kdouble ) ) + if (!gTestDouble && (inType == kdouble || outType == kdouble)) { - if( gHasDouble ) + if (gHasDouble) { - vlog_error( "\t *** convert_%sn%s%s( %sn ) FAILED ** \n", gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] ); - vlog( "\t\tcl_khr_fp64 enabled, but double testing turned off.\n" ); + vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n", + gTypeNames[outType], gSaturationNames[sat], + gRoundingModeNames[round], gTypeNames[inType]); + vlog("\t\tcl_khr_fp64 enabled, but double testing turned " + "off.\n"); } continue; } // skip longs on embedded - if( !gHasLong && (inType == klong || outType == klong || inType == kulong || outType == kulong) ) + if (!gHasLong + && (inType == klong || outType == klong || inType == kulong + || outType == kulong)) { continue; } - // Skip the implicit converts if the rounding mode is not default or test is saturated - if( 0 == startMinVectorSize ) + // Skip the implicit converts if the rounding mode is not default or + // test is saturated + if (0 == startMinVectorSize) { - if( sat || round != kDefaultRoundingMode ) + if (sat || round != kDefaultRoundingMode) gMinVectorSize = 1; else gMinVectorSize = 0; } - if( ( error = DoTest( device, outType, inType, sat, round, gMTdata ) ) ) + if ((error = DoTest(device, outType, inType, sat, round, gMTdata))) { - vlog_error( "\t *** convert_%sn%s%s( %sn ) FAILED ** \n", gTypeNames[outType], gSaturationNames[sat], gRoundingModeNames[round], gTypeNames[inType] ); + vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n", + gTypeNames[outType], gSaturationNames[sat], + gRoundingModeNames[round], gTypeNames[inType]); } } } else { - for( outType = (Type)0; outType < kTypeCount; outType = (Type)(outType+1) ) + for (outType = (Type)0; outType < kTypeCount; + outType = (Type)(outType + 1)) { - for( inType = (Type)0; inType < kTypeCount; inType = (Type)(inType+1) ) + for (inType = (Type)0; inType < kTypeCount; + inType = (Type)(inType + 1)) { // skip longs on embedded - if( !gHasLong && (inType == klong || outType == klong || inType == kulong || outType == kulong) ) + if (!gHasLong + && (inType == klong || outType == klong || inType == kulong + || outType == kulong)) { continue; } - for( sat = (SaturationMode)0; sat < kSaturationModeCount; sat = (SaturationMode)(sat+1) ) + for (sat = (SaturationMode)0; sat < kSaturationModeCount; + sat = (SaturationMode)(sat + 1)) { - //skip illegal saturated conversions to float type - if( kSaturated == sat && ( outType == kfloat || outType == kdouble ) ) + // skip illegal saturated conversions to float type + if (kSaturated == sat + && (outType == kfloat || outType == kdouble)) { continue; } - for( round = (RoundingMode)0; round < kRoundingModeCount; round = (RoundingMode)(round+1) ) + for (round = (RoundingMode)0; round < kRoundingModeCount; + round = (RoundingMode)(round + 1)) { - if( ++testNumber < gStartTestNumber ) + if (++testNumber < gStartTestNumber) { - // vlog( "%d) skipping convert_%sn%s%s( %sn )\n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] ); + // vlog( "%d) skipping convert_%sn%s%s( %sn + // )\n", testNumber, gTypeNames[ outType ], + // gSaturationNames[ sat ], + // gRoundingModeNames[round], gTypeNames[inType] + // ); continue; } else { - if( gEndTestNumber > 0 && testNumber >= gEndTestNumber ) + if (gEndTestNumber > 0 + && testNumber >= gEndTestNumber) { goto exit; } } - vlog( "%d) Testing convert_%sn%s%s( %sn ):\n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] ); + vlog("%d) Testing convert_%sn%s%s( %sn ):\n", + testNumber, gTypeNames[outType], + gSaturationNames[sat], gRoundingModeNames[round], + gTypeNames[inType]); // skip double if we don't have it - if( ! gTestDouble && (inType == kdouble || outType == kdouble ) ) + if (!gTestDouble + && (inType == kdouble || outType == kdouble)) { - if( gHasDouble ) + if (gHasDouble) { - vlog_error( "\t *** %d) convert_%sn%s%s( %sn ) FAILED ** \n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] ); - vlog( "\t\tcl_khr_fp64 enabled, but double testing turned off.\n" ); + vlog_error("\t *** %d) convert_%sn%s%s( %sn ) " + "FAILED ** \n", + testNumber, gTypeNames[outType], + gSaturationNames[sat], + gRoundingModeNames[round], + gTypeNames[inType]); + vlog("\t\tcl_khr_fp64 enabled, but double " + "testing turned off.\n"); } continue; } - // Skip the implicit converts if the rounding mode is not default or test is saturated - if( 0 == startMinVectorSize ) + // Skip the implicit converts if the rounding mode is + // not default or test is saturated + if (0 == startMinVectorSize) { - if( sat || round != kDefaultRoundingMode ) + if (sat || round != kDefaultRoundingMode) gMinVectorSize = 1; else gMinVectorSize = 0; } - if( ( error = DoTest( device, outType, inType, sat, round, gMTdata ) ) ) + if ((error = DoTest(device, outType, inType, sat, round, + gMTdata))) { - vlog_error( "\t *** %d) convert_%sn%s%s( %sn ) FAILED ** \n", testNumber, gTypeNames[outType], gSaturationNames[sat], gRoundingModeNames[round], gTypeNames[inType] ); + vlog_error("\t *** %d) convert_%sn%s%s( %sn ) " + "FAILED ** \n", + testNumber, gTypeNames[outType], + gSaturationNames[sat], + gRoundingModeNames[round], + gTypeNames[inType]); } } } @@ -270,17 +315,17 @@ exit: } test_definition test_list[] = { - ADD_TEST( conversions ), + ADD_TEST(conversions), }; -const int test_num = ARRAY_SIZE( test_list ); +const int test_num = ARRAY_SIZE(test_list); #pragma mark - -int main (int argc, const char **argv ) +int main(int argc, const char **argv) { int error; - cl_uint seed = (cl_uint) time( NULL ); + cl_uint seed = (cl_uint)time(NULL); argc = parseCustomParam(argc, argv); if (argc == -1) @@ -288,15 +333,13 @@ int main (int argc, const char **argv ) return 1; } - if( (error = ParseArgs( argc, argv )) ) - return error; + if ((error = ParseArgs(argc, argv))) return error; - //Turn off sleep so our tests run to completion + // Turn off sleep so our tests run to completion PreventSleep(); - atexit( ResumeSleep ); + atexit(ResumeSleep); - if(!gMultithread) - SetThreadCount(1); + if (!gMultithread) SetThreadCount(1); #if defined(_MSC_VER) && defined(_M_IX86) // VS2005 (and probably others, since long double got deprecated) sets @@ -308,14 +351,15 @@ int main (int argc, const char **argv ) _controlfp_s(&ignored, _PC_64, _MCW_PC); #endif - vlog( "===========================================================\n" ); - vlog( "Random seed: %u\n", seed ); - gMTdata = init_genrand( seed ); + vlog("===========================================================\n"); + vlog("Random seed: %u\n", seed); + gMTdata = init_genrand(seed); - const char* arg[] = {argv[0]}; - int ret = runTestHarnessWithCheck( 1, arg, test_num, test_list, true, 0, InitCL ); + const char *arg[] = { argv[0] }; + int ret = + runTestHarnessWithCheck(1, arg, test_num, test_list, true, 0, InitCL); - free_mtdata( gMTdata ); + free_mtdata(gMTdata); if (gQueue) { error = clFinish(gQueue); @@ -324,7 +368,8 @@ int main (int argc, const char **argv ) clReleaseMemObject(gInBuffer); - for( int i = 0; i < kCallStyleCount; i++ ) { + for (int i = 0; i < kCallStyleCount; i++) + { clReleaseMemObject(gOutBuffers[i]); } clReleaseCommandQueue(gQueue); @@ -336,82 +381,67 @@ int main (int argc, const char **argv ) #pragma mark - #pragma mark setup -static int ParseArgs( int argc, const char **argv ) +static int ParseArgs(int argc, const char **argv) { int i; argList = (const char **)calloc(argc, sizeof(char *)); argCount = 0; - if( NULL == argList && argc > 1 ) - return -1; + if (NULL == argList && argc > 1) return -1; -#if (defined( __APPLE__ ) || defined(__linux__) || defined (__MINGW32__)) +#if (defined(__APPLE__) || defined(__linux__) || defined(__MINGW32__)) { // Extract the app name - char baseName[ MAXPATHLEN ]; - strncpy( baseName, argv[0], MAXPATHLEN ); - char *base = basename( baseName ); - if( NULL != base ) + char baseName[MAXPATHLEN]; + strncpy(baseName, argv[0], MAXPATHLEN); + char *base = basename(baseName); + if (NULL != base) { - strncpy( appName, base, sizeof( appName ) ); - appName[ sizeof( appName ) -1 ] = '\0'; + strncpy(appName, base, sizeof(appName)); + appName[sizeof(appName) - 1] = '\0'; } } -#elif defined (_WIN32) +#elif defined(_WIN32) { char fname[_MAX_FNAME + _MAX_EXT + 1]; char ext[_MAX_EXT]; - errno_t err = _splitpath_s( argv[0], NULL, 0, NULL, 0, - fname, _MAX_FNAME, ext, _MAX_EXT ); - if (err == 0) { // no error - strcat (fname, ext); //just cat them, size of frame can keep both - strncpy (appName, fname, sizeof(appName)); - appName[ sizeof( appName ) -1 ] = '\0'; + errno_t err = _splitpath_s(argv[0], NULL, 0, NULL, 0, fname, _MAX_FNAME, + ext, _MAX_EXT); + if (err == 0) + { // no error + strcat(fname, ext); // just cat them, size of frame can keep both + strncpy(appName, fname, sizeof(appName)); + appName[sizeof(appName) - 1] = '\0'; } } #endif - vlog( "\n%s", appName ); - for( i = 1; i < argc; i++ ) + vlog("\n%s", appName); + for (i = 1; i < argc; i++) { const char *arg = argv[i]; - if( NULL == arg ) - break; + if (NULL == arg) break; - vlog( "\t%s", arg ); - if( arg[0] == '-' ) + vlog("\t%s", arg); + if (arg[0] == '-') { arg++; - while( *arg != '\0' ) + while (*arg != '\0') { - switch( *arg ) + switch (*arg) { - case 'd': - gTestDouble ^= 1; - break; - case 'l': - gSkipTesting ^= 1; - break; - case 'm': - gMultithread ^= 1; - break; - case 'w': - gWimpyMode ^= 1; - break; + case 'd': gTestDouble ^= 1; break; + case 'l': gSkipTesting ^= 1; break; + case 'm': gMultithread ^= 1; break; + case 'w': gWimpyMode ^= 1; break; case '[': parseWimpyReductionFactor(arg, gWimpyReductionFactor); break; - case 'z': - gForceFTZ ^= 1; - break; - case 't': - gTimeResults ^= 1; - break; - case 'a': - gReportAverageTimes ^= 1; - break; + case 'z': gForceFTZ ^= 1; break; + case 't': gTimeResults ^= 1; break; + case 'a': gReportAverageTimes ^= 1; break; case '1': - if( arg[1] == '6' ) + if (arg[1] == '6') { gMinVectorSize = 6; gMaxVectorSize = 7; @@ -445,7 +475,7 @@ static int ParseArgs( int argc, const char **argv ) break; default: - vlog( " <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg ); + vlog(" <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg); PrintUsage(); return -1; } @@ -455,128 +485,136 @@ static int ParseArgs( int argc, const char **argv ) else { char *t = NULL; - long number = strtol( arg, &t, 0 ); - if( t != arg ) + long number = strtol(arg, &t, 0); + if (t != arg) { - if( gStartTestNumber != -1 ) - gEndTestNumber = gStartTestNumber + (int) number; + if (gStartTestNumber != -1) + gEndTestNumber = gStartTestNumber + (int)number; else - gStartTestNumber = (int) number; + gStartTestNumber = (int)number; } else { - argList[ argCount ] = arg; + argList[argCount] = arg; argCount++; } } } // Check for the wimpy mode environment variable - if (getenv("CL_WIMPY_MODE")) { - vlog( "\n" ); - vlog( "*** Detected CL_WIMPY_MODE env ***\n" ); - gWimpyMode = 1; + if (getenv("CL_WIMPY_MODE")) + { + vlog("\n"); + vlog("*** Detected CL_WIMPY_MODE env ***\n"); + gWimpyMode = 1; } vlog( "\n" ); PrintArch(); - if( gWimpyMode ) + if (gWimpyMode) { - vlog( "\n" ); - vlog( "*** WARNING: Testing in Wimpy mode! ***\n" ); - vlog( "*** Wimpy mode is not sufficient to verify correctness. ***\n" ); - vlog( "*** It gives warm fuzzy feelings and then nevers calls. ***\n\n" ); - vlog("*** Wimpy Reduction Factor: %-27u ***\n\n", gWimpyReductionFactor); + vlog("\n"); + vlog("*** WARNING: Testing in Wimpy mode! ***\n"); + vlog("*** Wimpy mode is not sufficient to verify correctness. ***\n"); + vlog("*** It gives warm fuzzy feelings and then nevers calls. ***\n\n"); + vlog("*** Wimpy Reduction Factor: %-27u ***\n\n", + gWimpyReductionFactor); } return 0; } -static void PrintUsage( void ) +static void PrintUsage(void) { int i; - vlog( "%s [-wz#]: \n", appName ); - vlog( "\ttest names:\n" ); - vlog( "\t\tdestFormat<_sat><_round>_sourceFormat\n" ); - vlog( "\t\t\tPossible format types are:\n\t\t\t\t" ); - for( i = 0; i < kTypeCount; i++ ) - vlog( "%s, ", gTypeNames[i] ); - vlog( "\n\n\t\t\tPossible saturation values are: (empty) and _sat\n" ); - vlog( "\t\t\tPossible rounding values are:\n\t\t\t\t(empty), " ); - for( i = 1; i < kRoundingModeCount; i++ ) - vlog( "%s, ", gRoundingModeNames[i] ); - vlog( "\n\t\t\tExamples:\n" ); - vlog( "\t\t\t\tulong_short converts short to ulong\n" ); - vlog( "\t\t\t\tchar_sat_rte_float converts float to char with saturated clipping in round to nearest rounding mode\n\n" ); - vlog( "\toptions:\n" ); - vlog( "\t\t-d\tToggle testing of double precision. On by default if cl_khr_fp64 is enabled, ignored otherwise.\n" ); - vlog( "\t\t-l\tToggle link check mode. When on, testing is skipped, and we just check to see that the kernels build. (Off by default.)\n" ); - vlog( "\t\t-m\tToggle Multithreading. (On by default.)\n" ); - vlog( "\t\t-w\tToggle wimpy mode. When wimpy mode is on, we run a very small subset of the tests for each fn. NOT A VALID TEST! (Off by default.)\n" ); - vlog(" \t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is 1-12, default factor(%u)\n", gWimpyReductionFactor); - vlog( "\t\t-z\tToggle flush to zero mode (Default: per device)\n" ); - vlog( "\t\t-#\tTest just vector size given by #, where # is an element of the set {1,2,3,4,8,16}\n" ); - vlog( "\n" ); - vlog( "You may also pass the number of the test on which to start.\nA second number can be then passed to indicate how many tests to run\n\n" ); + vlog("%s [-wz#]: \n", appName); + vlog("\ttest names:\n"); + vlog("\t\tdestFormat<_sat><_round>_sourceFormat\n"); + vlog("\t\t\tPossible format types are:\n\t\t\t\t"); + for (i = 0; i < kTypeCount; i++) vlog("%s, ", gTypeNames[i]); + vlog("\n\n\t\t\tPossible saturation values are: (empty) and _sat\n"); + vlog("\t\t\tPossible rounding values are:\n\t\t\t\t(empty), "); + for (i = 1; i < kRoundingModeCount; i++) + vlog("%s, ", gRoundingModeNames[i]); + vlog("\n\t\t\tExamples:\n"); + vlog("\t\t\t\tulong_short converts short to ulong\n"); + vlog("\t\t\t\tchar_sat_rte_float converts float to char with saturated " + "clipping in round to nearest rounding mode\n\n"); + vlog("\toptions:\n"); + vlog("\t\t-d\tToggle testing of double precision. On by default if " + "cl_khr_fp64 is enabled, ignored otherwise.\n"); + vlog("\t\t-l\tToggle link check mode. When on, testing is skipped, and we " + "just check to see that the kernels build. (Off by default.)\n"); + vlog("\t\t-m\tToggle Multithreading. (On by default.)\n"); + vlog("\t\t-w\tToggle wimpy mode. When wimpy mode is on, we run a very " + "small subset of the tests for each fn. NOT A VALID TEST! (Off by " + "default.)\n"); + vlog(" \t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is " + "1-12, default factor(%u)\n", + gWimpyReductionFactor); + vlog("\t\t-z\tToggle flush to zero mode (Default: per device)\n"); + vlog("\t\t-#\tTest just vector size given by #, where # is an element of " + "the set {1,2,3,4,8,16}\n"); + vlog("\n"); + vlog( + "You may also pass the number of the test on which to start.\nA second " + "number can be then passed to indicate how many tests to run\n\n"); } -static int GetTestCase( const char *name, Type *outType, Type *inType, SaturationMode *sat, RoundingMode *round ) +static int GetTestCase(const char *name, Type *outType, Type *inType, + SaturationMode *sat, RoundingMode *round) { int i; - //Find the return type - for( i = 0; i < kTypeCount; i++ ) - if( name == strstr( name, gTypeNames[i] ) ) + // Find the return type + for (i = 0; i < kTypeCount; i++) + if (name == strstr(name, gTypeNames[i])) { *outType = (Type)i; - name += strlen( gTypeNames[i] ); + name += strlen(gTypeNames[i]); break; } - if( i == kTypeCount ) - return -1; + if (i == kTypeCount) return -1; // Check to see if _sat appears next *sat = (SaturationMode)0; - for( i = 1; i < kSaturationModeCount; i++ ) - if( name == strstr( name, gSaturationNames[i] ) ) + for (i = 1; i < kSaturationModeCount; i++) + if (name == strstr(name, gSaturationNames[i])) { *sat = (SaturationMode)i; - name += strlen( gSaturationNames[i] ); + name += strlen(gSaturationNames[i]); break; } *round = (RoundingMode)0; - for( i = 1; i < kRoundingModeCount; i++ ) - if( name == strstr( name, gRoundingModeNames[i] ) ) + for (i = 1; i < kRoundingModeCount; i++) + if (name == strstr(name, gRoundingModeNames[i])) { *round = (RoundingMode)i; - name += strlen( gRoundingModeNames[i] ); + name += strlen(gRoundingModeNames[i]); break; } - if( *name != '_' ) - return -2; + if (*name != '_') return -2; name++; - for( i = 0; i < kTypeCount; i++ ) - if( name == strstr( name, gTypeNames[i] ) ) + for (i = 0; i < kTypeCount; i++) + if (name == strstr(name, gTypeNames[i])) { *inType = (Type)i; - name += strlen( gTypeNames[i] ); + name += strlen(gTypeNames[i]); break; } - if( i == kTypeCount ) - return -3; + if (i == kTypeCount) return -3; - if( *name != '\0' ) - return -4; + if (*name != '\0') return -4; return 0; } @@ -584,270 +622,292 @@ static int GetTestCase( const char *name, Type *outType, Type *inType, Saturatio #pragma mark - #pragma mark OpenCL -test_status InitCL( cl_device_id device ) +test_status InitCL(cl_device_id device) { int error, i; - size_t configSize = sizeof( gComputeDevices ); + size_t configSize = sizeof(gComputeDevices); - if( (error = clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, configSize, &gComputeDevices, NULL )) ) + if ((error = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, + configSize, &gComputeDevices, NULL))) gComputeDevices = 1; - configSize = sizeof( gDeviceFrequency ); - if( (error = clGetDeviceInfo( device, CL_DEVICE_MAX_CLOCK_FREQUENCY, configSize, &gDeviceFrequency, NULL )) ) + configSize = sizeof(gDeviceFrequency); + if ((error = clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, + configSize, &gDeviceFrequency, NULL))) gDeviceFrequency = 0; cl_device_fp_config floatCapabilities = 0; - if( (error = clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(floatCapabilities), &floatCapabilities, NULL))) + if ((error = clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, + sizeof(floatCapabilities), &floatCapabilities, + NULL))) floatCapabilities = 0; - if(0 == (CL_FP_DENORM & floatCapabilities) ) - gForceFTZ ^= 1; + if (0 == (CL_FP_DENORM & floatCapabilities)) gForceFTZ ^= 1; - if( 0 == (floatCapabilities & CL_FP_ROUND_TO_NEAREST ) ) + if (0 == (floatCapabilities & CL_FP_ROUND_TO_NEAREST)) { char profileStr[128] = ""; // Verify that we are an embedded profile device - if( (error = clGetDeviceInfo( device, CL_DEVICE_PROFILE, sizeof( profileStr ), profileStr, NULL ) ) ) + if ((error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, + sizeof(profileStr), profileStr, NULL))) { - vlog_error( "FAILURE: Could not get device profile: error %d\n", error ); + vlog_error("FAILURE: Could not get device profile: error %d\n", + error); return TEST_FAIL; } - if( strcmp( profileStr, "EMBEDDED_PROFILE" ) ) + if (strcmp(profileStr, "EMBEDDED_PROFILE")) { - vlog_error( "FAILURE: non-embedded profile device does not support CL_FP_ROUND_TO_NEAREST\n" ); + vlog_error("FAILURE: non-embedded profile device does not support " + "CL_FP_ROUND_TO_NEAREST\n"); return TEST_FAIL; } - if( 0 == (floatCapabilities & CL_FP_ROUND_TO_ZERO ) ) + if (0 == (floatCapabilities & CL_FP_ROUND_TO_ZERO)) { - vlog_error( "FAILURE: embedded profile device supports neither CL_FP_ROUND_TO_NEAREST or CL_FP_ROUND_TO_ZERO\n" ); + vlog_error("FAILURE: embedded profile device supports neither " + "CL_FP_ROUND_TO_NEAREST or CL_FP_ROUND_TO_ZERO\n"); return TEST_FAIL; } gIsRTZ = 1; } - else if(is_extension_available(device, "cl_khr_fp64")) + else if (is_extension_available(device, "cl_khr_fp64")) { gHasDouble = 1; } gTestDouble &= gHasDouble; - //detect whether profile of the device is embedded + // detect whether profile of the device is embedded char profile[1024] = ""; - if( (error = clGetDeviceInfo( device, CL_DEVICE_PROFILE, sizeof(profile), profile, NULL ) ) ){} - else if( strstr(profile, "EMBEDDED_PROFILE" ) ) + if ((error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(profile), + profile, NULL))) + { + } + else if (strstr(profile, "EMBEDDED_PROFILE")) { gIsEmbedded = 1; - if( !is_extension_available(device, "cles_khr_int64" ) ) - gHasLong = 0; + if (!is_extension_available(device, "cles_khr_int64")) gHasLong = 0; } - gContext = clCreateContext( NULL, 1, &device, notify_callback, NULL, &error ); - if( NULL == gContext || error ) + gContext = clCreateContext(NULL, 1, &device, notify_callback, NULL, &error); + if (NULL == gContext || error) { - vlog_error( "clCreateContext failed. (%d)\n", error ); + vlog_error("clCreateContext failed. (%d)\n", error); return TEST_FAIL; } gQueue = clCreateCommandQueue(gContext, device, 0, &error); - if( NULL == gQueue || error ) + if (NULL == gQueue || error) { - vlog_error( "clCreateCommandQueue failed. (%d)\n", error ); + vlog_error("clCreateCommandQueue failed. (%d)\n", error); return TEST_FAIL; } - //Allocate buffers - //FIXME: use clProtectedArray for guarded allocations? - gIn = malloc( BUFFER_SIZE + 2 * kPageSize ); - gAllowZ = malloc( BUFFER_SIZE + 2 * kPageSize ); - gRef = malloc( BUFFER_SIZE + 2 * kPageSize ); - for( i = 0; i < kCallStyleCount; i++ ) + // Allocate buffers + // FIXME: use clProtectedArray for guarded allocations? + gIn = malloc(BUFFER_SIZE + 2 * kPageSize); + gAllowZ = malloc(BUFFER_SIZE + 2 * kPageSize); + gRef = malloc(BUFFER_SIZE + 2 * kPageSize); + for (i = 0; i < kCallStyleCount; i++) { - gOut[i] = malloc( BUFFER_SIZE + 2 * kPageSize ); - if( NULL == gOut[i] ) - return TEST_FAIL; + gOut[i] = malloc(BUFFER_SIZE + 2 * kPageSize); + if (NULL == gOut[i]) return TEST_FAIL; } // setup input buffers - gInBuffer = clCreateBuffer(gContext, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, BUFFER_SIZE, NULL, &error); - if( gInBuffer == NULL || error) + gInBuffer = + clCreateBuffer(gContext, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, + BUFFER_SIZE, NULL, &error); + if (gInBuffer == NULL || error) { - vlog_error( "clCreateBuffer failed for input (%d)\n", error ); + vlog_error("clCreateBuffer failed for input (%d)\n", error); return TEST_FAIL; } // setup output buffers - for( i = 0; i < kCallStyleCount; i++ ) + for (i = 0; i < kCallStyleCount; i++) { - gOutBuffers[i] = clCreateBuffer( gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, BUFFER_SIZE, NULL, &error ); - if( gOutBuffers[i] == NULL || error ) + gOutBuffers[i] = + clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, + BUFFER_SIZE, NULL, &error); + if (gOutBuffers[i] == NULL || error) { - vlog_error( "clCreateArray failed for output (%d)\n", error ); + vlog_error("clCreateArray failed for output (%d)\n", error); return TEST_FAIL; } } - gMTdata = init_genrand( gRandomSeed ); + gMTdata = init_genrand(gRandomSeed); char c[1024]; static const char *no_yes[] = { "NO", "YES" }; - vlog( "\nCompute Device info:\n" ); + vlog("\nCompute Device info:\n"); clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(c), c, NULL); - vlog( "\tDevice Name: %s\n", c ); + vlog("\tDevice Name: %s\n", c); clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(c), c, NULL); - vlog( "\tVendor: %s\n", c ); + vlog("\tVendor: %s\n", c); clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(c), c, NULL); - vlog( "\tDevice Version: %s\n", c ); + vlog("\tDevice Version: %s\n", c); clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(c), &c, NULL); - vlog( "\tCL C Version: %s\n", c ); + vlog("\tCL C Version: %s\n", c); clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(c), c, NULL); - vlog( "\tDriver Version: %s\n", c ); - vlog( "\tProcessing with %ld devices\n", gComputeDevices ); - vlog( "\tDevice Frequency: %d MHz\n", gDeviceFrequency ); - vlog( "\tSubnormal values supported for floats? %s\n", no_yes[0 != (CL_FP_DENORM & floatCapabilities)] ); - vlog( "\tTesting with FTZ mode ON for floats? %s\n", no_yes[0 != gForceFTZ] ); - vlog( "\tTesting with default RTZ mode for floats? %s\n", no_yes[0 != gIsRTZ] ); - vlog( "\tHas Double? %s\n", no_yes[0 != gHasDouble] ); - if( gHasDouble ) - vlog( "\tTest Double? %s\n", no_yes[0 != gTestDouble] ); - vlog( "\tHas Long? %s\n", no_yes[0 != gHasLong] ); - vlog( "\tTesting vector sizes: " ); - for( i = gMinVectorSize; i < gMaxVectorSize; i++ ) + vlog("\tDriver Version: %s\n", c); + vlog("\tProcessing with %ld devices\n", gComputeDevices); + vlog("\tDevice Frequency: %d MHz\n", gDeviceFrequency); + vlog("\tSubnormal values supported for floats? %s\n", + no_yes[0 != (CL_FP_DENORM & floatCapabilities)]); + vlog("\tTesting with FTZ mode ON for floats? %s\n", no_yes[0 != gForceFTZ]); + vlog("\tTesting with default RTZ mode for floats? %s\n", + no_yes[0 != gIsRTZ]); + vlog("\tHas Double? %s\n", no_yes[0 != gHasDouble]); + if (gHasDouble) vlog("\tTest Double? %s\n", no_yes[0 != gTestDouble]); + vlog("\tHas Long? %s\n", no_yes[0 != gHasLong]); + vlog("\tTesting vector sizes: "); + for (i = gMinVectorSize; i < gMaxVectorSize; i++) vlog("\t%d", vectorSizes[i]); - vlog( "\n" ); + vlog("\n"); return TEST_PASS; } -static int RunKernel( cl_kernel kernel, void *inBuf, void *outBuf, size_t blockCount ) +static int RunKernel(cl_kernel kernel, void *inBuf, void *outBuf, + size_t blockCount) { - // The global dimensions are just the blockCount to execute since we haven't set up multiple queues for multiple devices. + // The global dimensions are just the blockCount to execute since we haven't + // set up multiple queues for multiple devices. int error; - error = clSetKernelArg(kernel, 0, sizeof( inBuf ), &inBuf); + error = clSetKernelArg(kernel, 0, sizeof(inBuf), &inBuf); error |= clSetKernelArg(kernel, 1, sizeof(outBuf), &outBuf); - if( error ) + if (error) { - vlog_error( "FAILED -- could not set kernel args (%d)\n", error ); + vlog_error("FAILED -- could not set kernel args (%d)\n", error); return error; } - if( (error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &blockCount, NULL, 0, NULL, NULL))) + if ((error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &blockCount, + NULL, 0, NULL, NULL))) { - vlog_error( "FAILED -- could not execute kernel (%d)\n", error ); + vlog_error("FAILED -- could not execute kernel (%d)\n", error); return error; } return 0; } -#if ! defined( __APPLE__ ) -void memset_pattern4(void *dest, const void *src_pattern, size_t bytes ); +#if !defined(__APPLE__) +void memset_pattern4(void *dest, const void *src_pattern, size_t bytes); #endif -#if defined( __APPLE__ ) +#if defined(__APPLE__) #include #endif -uint64_t GetTime( void ); -uint64_t GetTime( void ) +uint64_t GetTime(void); +uint64_t GetTime(void) { -#if defined( __APPLE__ ) +#if defined(__APPLE__) return mach_absolute_time(); #elif defined(_MSC_VER) - return ReadTime(); + return ReadTime(); #else - //mach_absolute_time is a high precision timer with precision < 1 microsecond. + // mach_absolute_time is a high precision timer with precision < 1 + // microsecond. #warning need accurate clock here. Times are invalid. return 0; #endif } -#if defined (_MSC_VER) +#if defined(_MSC_VER) /* function is defined in "compat.h" */ #else -double SubtractTime( uint64_t endTime, uint64_t startTime ); -double SubtractTime( uint64_t endTime, uint64_t startTime ) +double SubtractTime(uint64_t endTime, uint64_t startTime); +double SubtractTime(uint64_t endTime, uint64_t startTime) { uint64_t diff = endTime - startTime; static double conversion = 0.0; - if( 0.0 == conversion ) + if (0.0 == conversion) { -#if defined( __APPLE__ ) - mach_timebase_info_data_t info = {0,0}; - kern_return_t err = mach_timebase_info( &info ); - if( 0 == err ) - conversion = 1e-9 * (double) info.numer / (double) info.denom; +#if defined(__APPLE__) + mach_timebase_info_data_t info = { 0, 0 }; + kern_return_t err = mach_timebase_info(&info); + if (0 == err) + conversion = 1e-9 * (double)info.numer / (double)info.denom; #else - // This function consumes output from GetTime() above, and converts the time to secionds. + // This function consumes output from GetTime() above, and converts the + // time to secionds. #warning need accurate ticks to seconds conversion factor here. Times are invalid. #endif } // strictly speaking we should also be subtracting out timer latency here - return conversion * (double) diff; + return conversion * (double)diff; } #endif typedef struct CalcReferenceValuesInfo { - struct WriteInputBufferInfo *parent; // pointer back to the parent WriteInputBufferInfo struct - cl_kernel kernel; // the kernel for this vector size - cl_program program; // the program for this vector size - cl_uint vectorSize; // the vector size for this callback chain - void *p; // the pointer to mapped result data for this vector size - cl_int result; -}CalcReferenceValuesInfo; + struct WriteInputBufferInfo + *parent; // pointer back to the parent WriteInputBufferInfo struct + cl_kernel kernel; // the kernel for this vector size + cl_program program; // the program for this vector size + cl_uint vectorSize; // the vector size for this callback chain + void *p; // the pointer to mapped result data for this vector size + cl_int result; +} CalcReferenceValuesInfo; typedef struct WriteInputBufferInfo { - volatile cl_event calcReferenceValues; // user event which signals when main thread is done calculating reference values - volatile cl_event doneBarrier; // user event which signals when worker threads are done - cl_uint count; // the number of elements in the array - Type outType; // the data type of the conversion result - Type inType; // the data type of the conversion input - volatile int barrierCount; - CalcReferenceValuesInfo calcInfo[kCallStyleCount]; -}WriteInputBufferInfo; + volatile cl_event + calcReferenceValues; // user event which signals when main thread is + // done calculating reference values + volatile cl_event + doneBarrier; // user event which signals when worker threads are done + cl_uint count; // the number of elements in the array + Type outType; // the data type of the conversion result + Type inType; // the data type of the conversion input + volatile int barrierCount; + CalcReferenceValuesInfo calcInfo[kCallStyleCount]; +} WriteInputBufferInfo; -cl_uint RoundUpToNextPowerOfTwo( cl_uint x ); -cl_uint RoundUpToNextPowerOfTwo( cl_uint x ) +cl_uint RoundUpToNextPowerOfTwo(cl_uint x); +cl_uint RoundUpToNextPowerOfTwo(cl_uint x) { - if( 0 == (x & (x-1))) - return x; + if (0 == (x & (x - 1))) return x; - while( x & (x-1) ) - x &= x-1; + while (x & (x - 1)) x &= x - 1; return x + x; } -void CL_CALLBACK WriteInputBufferComplete( cl_event, cl_int, void * ); +void WriteInputBufferComplete(void *); typedef struct DataInitInfo { - cl_ulong start; - cl_uint size; - Type outType; - Type inType; - SaturationMode sat; - RoundingMode round; - MTdata *d; -}DataInitInfo; + cl_ulong start; + cl_uint size; + Type outType; + Type inType; + SaturationMode sat; + RoundingMode round; + MTdata *d; +} DataInitInfo; -cl_int InitData( cl_uint job_id, cl_uint thread_id, void *p ); -cl_int InitData( cl_uint job_id, cl_uint thread_id, void *p ) +cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p); +cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p) { - DataInitInfo *info = (DataInitInfo*) p; + DataInitInfo *info = (DataInitInfo *)p; - gInitFunctions[ info->inType ]( (char*)gIn + job_id * info->size * gTypeSizes[info->inType], info->sat, info->round, - info->outType, info->start + job_id * info->size, info->size, info->d[thread_id] ); + gInitFunctions[info->inType]( + (char *)gIn + job_id * info->size * gTypeSizes[info->inType], info->sat, + info->round, info->outType, info->start + job_id * info->size, + info->size, info->d[thread_id]); return CL_SUCCESS; } @@ -855,13 +915,13 @@ static void setAllowZ(uint8_t *allow, uint32_t *x, cl_uint count) { cl_uint i; for (i = 0; i < count; ++i) - allow[i] |= (uint8_t)((x[i] & 0x7f800000U) == 0); + allow[i] |= (uint8_t)((x[i] & 0x7f800000U) == 0); } -cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p ); -cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p ) +cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p); +cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p) { - DataInitInfo *info = (DataInitInfo*) p; + DataInitInfo *info = (DataInitInfo *)p; cl_uint count = info->size; Type inType = info->inType; Type outType = info->outType; @@ -870,16 +930,15 @@ cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p ) Force64BitFPUPrecision(); - void *s = (cl_uchar*) gIn + job_id * count * gTypeSizes[info->inType]; - void *a = (cl_uchar*) gAllowZ + job_id * count; - void *d = (cl_uchar*) gRef + job_id * count * gTypeSizes[info->outType]; + void *s = (cl_uchar *)gIn + job_id * count * gTypeSizes[info->inType]; + void *a = (cl_uchar *)gAllowZ + job_id * count; + void *d = (cl_uchar *)gRef + job_id * count * gTypeSizes[info->outType]; if (outType != inType) { - //create the reference while we wait - Convert f = gConversions[ outType ][ inType ]; - if( info->sat ) - f = gSaturatedConversions[ outType ][ inType ]; + // create the reference while we wait + Convert f = gConversions[outType][inType]; + if (info->sat) f = gSaturatedConversions[outType][inType]; #if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__) /* ARM VFP doesn't have hardware instruction for converting from 64-bit @@ -896,42 +955,34 @@ cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p ) switch (round) { /* conversions to floating-point type use the current rounding mode. - * The only default floating-point rounding mode supported is round to nearest even - * i.e the current rounding mode will be _rte for floating-point types. */ - case kDefaultRoundingMode: - qcom_rm = qcomRTE; - break; - case kRoundToNearestEven: - qcom_rm = qcomRTE; - break; - case kRoundUp: - qcom_rm = qcomRTP; - break; - case kRoundDown: - qcom_rm = qcomRTN; - break; - case kRoundTowardZero: - qcom_rm = qcomRTZ; - break; + * The only default floating-point rounding mode supported is round + * to nearest even i.e the current rounding mode will be _rte for + * floating-point types. */ + case kDefaultRoundingMode: qcom_rm = qcomRTE; break; + case kRoundToNearestEven: qcom_rm = qcomRTE; break; + case kRoundUp: qcom_rm = qcomRTP; break; + case kRoundDown: qcom_rm = qcomRTN; break; + case kRoundTowardZero: qcom_rm = qcomRTZ; break; default: - vlog_error("ERROR: undefined rounding mode %d\n", round); - break; + vlog_error("ERROR: undefined rounding mode %d\n", round); + break; } - qcom_sat = info->sat; + qcom_sat = info->sat; #endif - RoundingMode oldRound = set_round( round, outType ); - f( d, s, count ); - set_round( oldRound, outType ); + RoundingMode oldRound = set_round(round, outType); + f(d, s, count); + set_round(oldRound, outType); - // Decide if we allow a zero result in addition to the correctly rounded one + // Decide if we allow a zero result in addition to the correctly rounded + // one memset(a, 0, count); - if (gForceFTZ) { - if (inType == kfloat) - setAllowZ((uint8_t*)a, (uint32_t*)s, count); - if (outType == kfloat) - setAllowZ((uint8_t*)a, (uint32_t*)d, count); - } + if (gForceFTZ) + { + if (inType == kfloat) setAllowZ((uint8_t *)a, (uint32_t *)s, count); + if (outType == kfloat) + setAllowZ((uint8_t *)a, (uint32_t *)d, count); + } } else { @@ -939,46 +990,48 @@ cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p ) memcpy(d, s, info->size * gTypeSizes[inType]); } - //Patch up NaNs conversions to integer to zero -- these can be converted to any integer - if( info->outType != kfloat && info->outType != kdouble ) + // Patch up NaNs conversions to integer to zero -- these can be converted to + // any integer + if (info->outType != kfloat && info->outType != kdouble) { - if( inType == kfloat ) + if (inType == kfloat) { - float *inp = (float*) s; - for( j = 0; j < count; j++ ) + float *inp = (float *)s; + for (j = 0; j < count; j++) { - if( isnan( inp[j] ) ) - memset( (char*) d + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] ); + if (isnan(inp[j])) + memset((char *)d + j * gTypeSizes[outType], 0, + gTypeSizes[outType]); } } - if( inType == kdouble ) + if (inType == kdouble) { - double *inp = (double*) s; - for( j = 0; j < count; j++ ) + double *inp = (double *)s; + for (j = 0; j < count; j++) { - if( isnan( inp[j] ) ) - memset( (char*) d + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] ); + if (isnan(inp[j])) + memset((char *)d + j * gTypeSizes[outType], 0, + gTypeSizes[outType]); } } } - else if( inType == kfloat || inType == kdouble ) - { // outtype and intype is float or double. NaN conversions for float <-> double can be any NaN - if( inType == kfloat && outType == kdouble ) + else if (inType == kfloat || inType == kdouble) + { // outtype and intype is float or double. NaN conversions for float <-> + // double can be any NaN + if (inType == kfloat && outType == kdouble) { - float *inp = (float*) s; - for( j = 0; j < count; j++ ) + float *inp = (float *)s; + for (j = 0; j < count; j++) { - if( isnan( inp[j] ) ) - ((double*) d)[j] = NAN; + if (isnan(inp[j])) ((double *)d)[j] = NAN; } } - if( inType == kdouble && outType == kfloat ) + if (inType == kdouble && outType == kfloat) { - double *inp = (double*) s; - for( j = 0; j < count; j++ ) + double *inp = (double *)s; + for (j = 0; j < count; j++) { - if( isnan( inp[j] ) ) - ((float*) d)[j] = NAN; + if (isnan(inp[j])) ((float *)d)[j] = NAN; } } } @@ -986,13 +1039,14 @@ cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p ) return CL_SUCCESS; } -static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMode sat, RoundingMode round, MTdata d ) +static int DoTest(cl_device_id device, Type outType, Type inType, + SaturationMode sat, RoundingMode round, MTdata d) { #ifdef __APPLE__ cl_ulong wall_start = mach_absolute_time(); #endif - DataInitInfo init_info = { 0, 0, outType, inType, sat, round, NULL }; + DataInitInfo init_info = { 0, 0, outType, inType, sat, round, NULL }; WriteInputBufferInfo writeInputBufferInfo; int vectorSize; int error = 0; @@ -1003,22 +1057,23 @@ static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMod size_t blockCount = BUFFER_SIZE / std::max(gTypeSizes[inType], gTypeSizes[outType]); size_t step = blockCount; - uint64_t lastCase = 1ULL << (8*gTypeSizes[ inType ]); - cl_event writeInputBuffer = NULL; + uint64_t lastCase = 1ULL << (8 * gTypeSizes[inType]); - memset( &writeInputBufferInfo, 0, sizeof( writeInputBufferInfo ) ); - init_info.d = (MTdata*)malloc( threads * sizeof( MTdata ) ); - if( NULL == init_info.d ) + memset(&writeInputBufferInfo, 0, sizeof(writeInputBufferInfo)); + init_info.d = (MTdata *)malloc(threads * sizeof(MTdata)); + if (NULL == init_info.d) { - vlog_error( "ERROR: Unable to allocate storage for random number generator!\n" ); + vlog_error( + "ERROR: Unable to allocate storage for random number generator!\n"); return -1; } - for( i = 0; i < threads; i++ ) + for (i = 0; i < threads; i++) { - init_info.d[i] = init_genrand( genrand_int32( d ) ); - if( NULL == init_info.d[i] ) + init_info.d[i] = init_genrand(genrand_int32(d)); + if (NULL == init_info.d[i]) { - vlog_error( "ERROR: Unable to allocate storage for random number generator!\n" ); + vlog_error("ERROR: Unable to allocate storage for random number " + "generator!\n"); return -1; } } @@ -1026,52 +1081,53 @@ static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMod writeInputBufferInfo.outType = outType; writeInputBufferInfo.inType = inType; - for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++) + for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++) { - writeInputBufferInfo.calcInfo[vectorSize].program = MakeProgram( outType, inType, sat, round, vectorSize, - &writeInputBufferInfo.calcInfo[vectorSize].kernel ); - if( NULL == writeInputBufferInfo.calcInfo[vectorSize].program ) + writeInputBufferInfo.calcInfo[vectorSize].program = + MakeProgram(outType, inType, sat, round, vectorSize, + &writeInputBufferInfo.calcInfo[vectorSize].kernel); + if (NULL == writeInputBufferInfo.calcInfo[vectorSize].program) { gFailCount++; return -1; } - if( NULL == writeInputBufferInfo.calcInfo[vectorSize].kernel ) + if (NULL == writeInputBufferInfo.calcInfo[vectorSize].kernel) { gFailCount++; - vlog_error( "\t\tFAILED -- Failed to create kernel.\n" ); + vlog_error("\t\tFAILED -- Failed to create kernel.\n"); return -2; } - writeInputBufferInfo.calcInfo[vectorSize].parent = &writeInputBufferInfo; + writeInputBufferInfo.calcInfo[vectorSize].parent = + &writeInputBufferInfo; writeInputBufferInfo.calcInfo[vectorSize].vectorSize = vectorSize; writeInputBufferInfo.calcInfo[vectorSize].result = -1; } - if( gSkipTesting ) - goto exit; + if (gSkipTesting) goto exit; // Patch up rounding mode if default is RTZ - // We leave the part above in default rounding mode so that the right kernel is compiled. - if( round == kDefaultRoundingMode && gIsRTZ && (outType == kfloat) ) + // We leave the part above in default rounding mode so that the right kernel + // is compiled. + if (round == kDefaultRoundingMode && gIsRTZ && (outType == kfloat)) init_info.round = round = kRoundTowardZero; // Figure out how many elements are in a work block // we handle 64-bit types a bit differently. - if( 8*gTypeSizes[ inType ] > 32 ) - lastCase = 0x100000000ULL; + if (8 * gTypeSizes[inType] > 32) lastCase = 0x100000000ULL; - if ( !gWimpyMode && gIsEmbedded ) - step = blockCount * EMBEDDED_REDUCTION_FACTOR; + if (!gWimpyMode && gIsEmbedded) + step = blockCount * EMBEDDED_REDUCTION_FACTOR; - if ( gWimpyMode ) - step = (size_t)blockCount * (size_t)gWimpyReductionFactor; - vlog( "Testing... " ); + if (gWimpyMode) step = (size_t)blockCount * (size_t)gWimpyReductionFactor; + vlog("Testing... "); fflush(stdout); - for( i = 0; i < (uint64_t)lastCase; i += step ) + for (i = 0; i < (uint64_t)lastCase; i += step) { - if( 0 == ( i & ((lastCase >> 3) -1))) { + if (0 == (i & ((lastCase >> 3) - 1))) + { vlog("."); fflush(stdout); } @@ -1079,53 +1135,61 @@ static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMod cl_uint count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i); writeInputBufferInfo.count = count; - // Crate a user event to represent the status of the reference value computation completion - writeInputBufferInfo.calcReferenceValues = clCreateUserEvent( gContext, &error); - if( error || NULL == writeInputBufferInfo.calcReferenceValues ) + // Crate a user event to represent the status of the reference value + // computation completion + writeInputBufferInfo.calcReferenceValues = + clCreateUserEvent(gContext, &error); + if (error || NULL == writeInputBufferInfo.calcReferenceValues) { - vlog_error( "ERROR: Unable to create user event. (%d)\n", error ); + vlog_error("ERROR: Unable to create user event. (%d)\n", error); gFailCount++; goto exit; } // retain for consumption by MapOutputBufferComplete - for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++) + for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; + vectorSize++) { - if( (error = clRetainEvent(writeInputBufferInfo.calcReferenceValues) )) + if ((error = + clRetainEvent(writeInputBufferInfo.calcReferenceValues))) { - vlog_error( "ERROR: Unable to retain user event. (%d)\n", error ); + vlog_error("ERROR: Unable to retain user event. (%d)\n", error); gFailCount++; goto exit; } } - // Crate a user event to represent when the callbacks are done verifying correctness - writeInputBufferInfo.doneBarrier = clCreateUserEvent( gContext, &error); - if( error || NULL == writeInputBufferInfo.calcReferenceValues ) + // Crate a user event to represent when the callbacks are done verifying + // correctness + writeInputBufferInfo.doneBarrier = clCreateUserEvent(gContext, &error); + if (error || NULL == writeInputBufferInfo.calcReferenceValues) { - vlog_error( "ERROR: Unable to create user event for barrier. (%d)\n", error ); + vlog_error("ERROR: Unable to create user event for barrier. (%d)\n", + error); gFailCount++; goto exit; } // retain for use by the callback that calls this - if( (error = clRetainEvent(writeInputBufferInfo.doneBarrier) )) + if ((error = clRetainEvent(writeInputBufferInfo.doneBarrier))) { - vlog_error( "ERROR: Unable to retain user event doneBarrier. (%d)\n", error ); + vlog_error("ERROR: Unable to retain user event doneBarrier. (%d)\n", + error); gFailCount++; goto exit; } // Call this in a multithreaded manner - // gInitFunctions[ inType ]( gIn, sat, round, outType, i, count, d ); + // gInitFunctions[ inType ]( gIn, sat, round, outType, i, count, d + // ); cl_uint chunks = RoundUpToNextPowerOfTwo(threads) * 2; init_info.start = i; init_info.size = count / chunks; - if( init_info.size < 16384 ) + if (init_info.size < 16384) { chunks = RoundUpToNextPowerOfTwo(threads); init_info.size = count / chunks; - if( init_info.size < 16384 ) + if (init_info.size < 16384) { init_info.size = count; chunks = 1; @@ -1134,43 +1198,23 @@ static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMod ThreadPool_Do(InitData, chunks, &init_info); // Copy the results to the device - writeInputBuffer = NULL; - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, count * gTypeSizes[inType], gIn, 0, NULL, &writeInputBuffer ))) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0, + count * gTypeSizes[inType], gIn, 0, + NULL, NULL))) { - vlog_error( "ERROR: clEnqueueWriteBuffer failed. (%d)\n", error ); + vlog_error("ERROR: clEnqueueWriteBuffer failed. (%d)\n", error); gFailCount++; goto exit; } - // Setup completion callback for the write, which will enqueue the rest of the work - // This is somewhat gratuitous. Because this is an in order queue, we didn't really need to - // do this work in a callback. We could have done it from the main thread. Here we are - // verifying that the implementation can enqueue work from a callback, while at the same time - // also checking to make sure that the conversions work. - // - // Because the verification code is also moved to a callback, it is hoped that implementations will - // achieve a test performance improvement because they can verify the results in parallel. If the - // implementation serializes callbacks however, that won't happen. Consider it some motivation - // to do the right thing! :-) - if( (error = clSetEventCallback( writeInputBuffer, CL_COMPLETE, WriteInputBufferComplete, &writeInputBufferInfo)) ) - { - vlog_error( "ERROR: clSetEventCallback failed. (%d)\n", error ); - gFailCount++; - goto exit; - } - - // The event can't be destroyed until the callback is called, so we can release it now. - if( (error = clReleaseEvent(writeInputBuffer) )) - { - vlog_error( "ERROR: clReleaseEvent failed. (%d)\n", error ); - gFailCount++; - goto exit; - } + // Call completion callback for the write, which will enqueue the rest + // of the work. + WriteInputBufferComplete((void *)&writeInputBufferInfo); // Make sure the work is actually running, so we don't deadlock - if( (error = clFlush( gQueue ) ) ) + if ((error = clFlush(gQueue))) { - vlog_error( "clFlush failed with error %d\n", error ); + vlog_error("clFlush failed with error %d\n", error); gFailCount++; goto exit; } @@ -1178,77 +1222,91 @@ static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMod ThreadPool_Do(PrepareReference, chunks, &init_info); // signal we are done calculating the reference results - if( (error = clSetUserEventStatus( writeInputBufferInfo.calcReferenceValues, CL_COMPLETE ) ) ) + if ((error = clSetUserEventStatus( + writeInputBufferInfo.calcReferenceValues, CL_COMPLETE))) { - vlog_error( "Error: Failed to set user event status to CL_COMPLETE: %d\n", error ); + vlog_error( + "Error: Failed to set user event status to CL_COMPLETE: %d\n", + error); gFailCount++; goto exit; } // Wait for the event callbacks to finish verifying correctness. - if( (error = clWaitForEvents( 1, (cl_event*) &writeInputBufferInfo.doneBarrier ) )) + if ((error = clWaitForEvents( + 1, (cl_event *)&writeInputBufferInfo.doneBarrier))) { - vlog_error( "Error: Failed to wait for barrier: %d\n", error ); + vlog_error("Error: Failed to wait for barrier: %d\n", error); gFailCount++; goto exit; } - if( (error = clReleaseEvent(writeInputBufferInfo.calcReferenceValues ) )) + if ((error = clReleaseEvent(writeInputBufferInfo.calcReferenceValues))) { - vlog_error( "Error: Failed to release calcReferenceValues: %d\n", error ); + vlog_error("Error: Failed to release calcReferenceValues: %d\n", + error); gFailCount++; goto exit; } - if( (error = clReleaseEvent(writeInputBufferInfo.doneBarrier ) )) + if ((error = clReleaseEvent(writeInputBufferInfo.doneBarrier))) { - vlog_error( "Error: Failed to release done barrier: %d\n", error ); + vlog_error("Error: Failed to release done barrier: %d\n", error); gFailCount++; goto exit; } - for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++) + for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; + vectorSize++) { - if( ( error = writeInputBufferInfo.calcInfo[ vectorSize ].result )) + if ((error = writeInputBufferInfo.calcInfo[vectorSize].result)) { - switch( inType ) + switch (inType) { case kuchar: case kchar: - vlog( "Input value: 0x%2.2x ", ((unsigned char*)gIn)[error - 1] ); + vlog("Input value: 0x%2.2x ", + ((unsigned char *)gIn)[error - 1]); break; case kushort: case kshort: - vlog( "Input value: 0x%4.4x ", ((unsigned short*)gIn)[error - 1] ); + vlog("Input value: 0x%4.4x ", + ((unsigned short *)gIn)[error - 1]); break; case kuint: case kint: - vlog( "Input value: 0x%8.8x ", ((unsigned int*)gIn)[error - 1] ); + vlog("Input value: 0x%8.8x ", + ((unsigned int *)gIn)[error - 1]); break; case kfloat: - vlog( "Input value: %a ", ((float*)gIn)[error - 1] ); + vlog("Input value: %a ", ((float *)gIn)[error - 1]); break; break; case kulong: case klong: - vlog( "Input value: 0x%16.16llx ", ((unsigned long long*)gIn)[error - 1] ); + vlog("Input value: 0x%16.16llx ", + ((unsigned long long *)gIn)[error - 1]); break; case kdouble: - vlog( "Input value: %a ", ((double*)gIn)[error - 1]); + vlog("Input value: %a ", ((double *)gIn)[error - 1]); break; default: - vlog_error( "Internal error at %s: %d\n", __FILE__, __LINE__ ); + vlog_error("Internal error at %s: %d\n", __FILE__, + __LINE__); abort(); break; } // tell the user which conversion it was. - if( 0 == vectorSize ) - vlog( " (implicit scalar conversion from %s to %s)\n", gTypeNames[ inType ], gTypeNames[ outType ] ); + if (0 == vectorSize) + vlog(" (implicit scalar conversion from %s to %s)\n", + gTypeNames[inType], gTypeNames[outType]); else - vlog( " (convert_%s%s%s%s( %s%s ))\n", gTypeNames[outType], sizeNames[vectorSize], gSaturationNames[ sat ], - gRoundingModeNames[ round ], gTypeNames[inType], sizeNames[vectorSize] ); + vlog(" (convert_%s%s%s%s( %s%s ))\n", gTypeNames[outType], + sizeNames[vectorSize], gSaturationNames[sat], + gRoundingModeNames[round], gTypeNames[inType], + sizeNames[vectorSize]); gFailCount++; goto exit; @@ -1256,300 +1314,318 @@ static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMod } } - log_info( "done.\n" ); + log_info("done.\n"); - if( gTimeResults ) + if (gTimeResults) { - //Kick off tests for the various vector lengths - for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++) + // Kick off tests for the various vector lengths + for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; + vectorSize++) { size_t workItemCount = blockCount / vectorSizes[vectorSize]; - if( vectorSizes[vectorSize] * gTypeSizes[outType] < 4 ) - workItemCount /= 4 / (vectorSizes[vectorSize] * gTypeSizes[outType]); + if (vectorSizes[vectorSize] * gTypeSizes[outType] < 4) + workItemCount /= + 4 / (vectorSizes[vectorSize] * gTypeSizes[outType]); double sum = 0.0; double bestTime = INFINITY; cl_uint k; - for( k = 0; k < PERF_LOOP_COUNT; k++ ) + for (k = 0; k < PERF_LOOP_COUNT; k++) { uint64_t startTime = GetTime(); - if( (error = RunKernel( writeInputBufferInfo.calcInfo[vectorSize].kernel, gInBuffer, gOutBuffers[ vectorSize ], workItemCount )) ) + if ((error = RunKernel( + writeInputBufferInfo.calcInfo[vectorSize].kernel, + gInBuffer, gOutBuffers[vectorSize], workItemCount))) { gFailCount++; goto exit; } // Make sure OpenCL is done - if( (error = clFinish(gQueue) ) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Error %d at clFinish\n", error ); + vlog_error("Error %d at clFinish\n", error); goto exit; } uint64_t endTime = GetTime(); - double time = SubtractTime( endTime, startTime ); + double time = SubtractTime(endTime, startTime); sum += time; - if( time < bestTime ) - bestTime = time; - + if (time < bestTime) bestTime = time; } - if( gReportAverageTimes ) - bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (workItemCount * vectorSizes[vectorSize]); - if( 0 == vectorSize ) - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "implicit convert %s -> %s", gTypeNames[ inType ], gTypeNames[ outType ] ); + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (workItemCount * vectorSizes[vectorSize]); + if (0 == vectorSize) + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", + "implicit convert %s -> %s", gTypeNames[inType], + gTypeNames[outType]); else - vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "convert_%s%s%s%s( %s%s )", gTypeNames[ outType ], sizeNames[vectorSize], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType], sizeNames[vectorSize] ); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", + "convert_%s%s%s%s( %s%s )", gTypeNames[outType], + sizeNames[vectorSize], gSaturationNames[sat], + gRoundingModeNames[round], gTypeNames[inType], + sizeNames[vectorSize]); } } - if( gWimpyMode ) - vlog( "\tWimp pass" ); + if (gWimpyMode) + vlog("\tWimp pass"); else - vlog( "\tpassed" ); + vlog("\tpassed"); #ifdef __APPLE__ // record the run time - vlog( "\t(%f s)", 1e-9 * ( mach_absolute_time() - wall_start ) ); + vlog("\t(%f s)", 1e-9 * (mach_absolute_time() - wall_start)); #endif - vlog( "\n\n" ); - fflush( stdout ); + vlog("\n\n"); + fflush(stdout); exit: - //clean up - for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++) + // clean up + for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++) { - clReleaseProgram( writeInputBufferInfo.calcInfo[vectorSize].program ); - clReleaseKernel( writeInputBufferInfo.calcInfo[vectorSize].kernel ); + clReleaseProgram(writeInputBufferInfo.calcInfo[vectorSize].program); + clReleaseKernel(writeInputBufferInfo.calcInfo[vectorSize].kernel); } - if( init_info.d ) + if (init_info.d) { - for( i = 0; i < threads; i++ ) - free_mtdata(init_info.d[i]); + for (i = 0; i < threads; i++) free_mtdata(init_info.d[i]); free(init_info.d); } return error; } -void CL_CALLBACK MapResultValuesComplete( cl_event e, cl_int status, void *data ); +void MapResultValuesComplete(void *data); // Note: not called reentrantly -void CL_CALLBACK WriteInputBufferComplete( cl_event e, cl_int status, void *data ) +void WriteInputBufferComplete(void *data) { - WriteInputBufferInfo *info = (WriteInputBufferInfo*) data; + cl_int status; + WriteInputBufferInfo *info = (WriteInputBufferInfo *)data; cl_uint count = info->count; int vectorSize; - if( CL_SUCCESS != status ) - { - vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status ); - gFailCount++; - return; - } - info->barrierCount = gMaxVectorSize - gMinVectorSize; - // now that we know that the write buffer is complete, enqueue callbacks to wait for the main thread to - // finish calculating the reference results. - for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++) + // now that we know that the write buffer is complete, enqueue callbacks to + // wait for the main thread to finish calculating the reference results. + for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++) { - size_t workItemCount = (count + vectorSizes[vectorSize] - 1) / ( vectorSizes[vectorSize]); - cl_event mapComplete = NULL; + size_t workItemCount = + (count + vectorSizes[vectorSize] - 1) / (vectorSizes[vectorSize]); - if( (status = RunKernel( info->calcInfo[ vectorSize ].kernel, gInBuffer, gOutBuffers[ vectorSize ], workItemCount )) ) + if ((status = RunKernel(info->calcInfo[vectorSize].kernel, gInBuffer, + gOutBuffers[vectorSize], workItemCount))) { gFailCount++; return; } - info->calcInfo[vectorSize].p = clEnqueueMapBuffer( gQueue, gOutBuffers[ vectorSize ], CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, - 0, count * gTypeSizes[ info->outType ], 0, NULL, &mapComplete, &status); + info->calcInfo[vectorSize].p = clEnqueueMapBuffer( + gQueue, gOutBuffers[vectorSize], CL_TRUE, + CL_MAP_READ | CL_MAP_WRITE, 0, count * gTypeSizes[info->outType], 0, + NULL, NULL, &status); { - if( status ) + if (status) { - vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status ); + vlog_error("ERROR: WriteInputBufferComplete calback failed " + "with status: %d\n", + status); gFailCount++; return; } } + } - if( (status = clSetEventCallback( mapComplete, CL_COMPLETE, MapResultValuesComplete, info->calcInfo + vectorSize))) - { - vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status ); - gFailCount++; - return; - } - - if( (status = clReleaseEvent(mapComplete))) - { - vlog_error( "ERROR: clReleaseEvent calback failed in WriteInputBufferComplete for vector size %d with status: %d\n", vectorSize, status ); - gFailCount++; - return; - } + for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++) + { + MapResultValuesComplete(info->calcInfo + vectorSize); } // Make sure the work starts moving -- otherwise we may deadlock - if( (status = clFlush(gQueue))) + if ((status = clFlush(gQueue))) { - vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status ); + vlog_error( + "ERROR: WriteInputBufferComplete calback failed with status: %d\n", + status); gFailCount++; return; } - // e was already released by the main thread. It should be destroyed automatically soon after we exit. + // e was already released by the main thread. It should be destroyed + // automatically soon after we exit. } -void CL_CALLBACK CalcReferenceValuesComplete( cl_event e, cl_int status, void *data ); +void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status, + void *data); // Note: May be called reentrantly -void CL_CALLBACK MapResultValuesComplete( cl_event e, cl_int status, void *data ) +void MapResultValuesComplete(void *data) { - CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo*) data; + cl_int status; + CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo *)data; cl_event calcReferenceValues = info->parent->calcReferenceValues; - if( CL_SUCCESS != status ) + // we know that the map is done, wait for the main thread to finish + // calculating the reference values + if ((status = clSetEventCallback(calcReferenceValues, CL_COMPLETE, + CalcReferenceValuesComplete, data))) { - vlog_error( "ERROR: MapResultValuesComplete calback failed with status: %d\n", status ); - gFailCount++; // not thread safe -- being lazy here - clReleaseEvent(calcReferenceValues); - return; + vlog_error("ERROR: clSetEventCallback failed in " + "MapResultValuesComplete with status: %d\n", + status); + gFailCount++; // not thread safe -- being lazy here } - // we know that the map is done, wait for the main thread to finish calculating the reference values - if( (status = clSetEventCallback( calcReferenceValues, CL_COMPLETE, CalcReferenceValuesComplete, data ))) + // this thread no longer needs its reference to info->calcReferenceValues, + // so release it + if ((status = clReleaseEvent(calcReferenceValues))) { - vlog_error( "ERROR: clSetEventCallback failed in MapResultValuesComplete with status: %d\n", status ); - gFailCount++; // not thread safe -- being lazy here - } - - // this thread no longer needs its reference to info->calcReferenceValues, so release it - if( (status = clReleaseEvent(calcReferenceValues) )) - { - vlog_error( "ERROR: clReleaseEvent(info->calcReferenceValues) failed with status: %d\n", status ); - gFailCount++; // not thread safe -- being lazy here + vlog_error("ERROR: clReleaseEvent(info->calcReferenceValues) failed " + "with status: %d\n", + status); + gFailCount++; // not thread safe -- being lazy here } // no need to flush since we didn't enqueue anything - // e was already released by WriteInputBufferComplete. It should be destroyed automatically soon after we exit. + // e was already released by WriteInputBufferComplete. It should be + // destroyed automatically soon after we exit. } -void CL_CALLBACK CalcReferenceValuesComplete( cl_event e, cl_int status, void *data ) +void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status, + void *data) { - CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo*) data; - cl_uint vectorSize = info->vectorSize; - cl_uint count = info->parent->count; - Type outType = info->parent->outType; // the data type of the conversion result - Type inType = info->parent->inType; // the data type of the conversion input - size_t j; - cl_int error; - cl_event doneBarrier = info->parent->doneBarrier; + CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo *)data; + cl_uint vectorSize = info->vectorSize; + cl_uint count = info->parent->count; + Type outType = + info->parent->outType; // the data type of the conversion result + Type inType = info->parent->inType; // the data type of the conversion input + size_t j; + cl_int error; + cl_event doneBarrier = info->parent->doneBarrier; // report spurious error condition - if( CL_SUCCESS != status ) + if (CL_SUCCESS != status) { - vlog_error( "ERROR: CalcReferenceValuesComplete did not succeed! (%d)\n", status ); - gFailCount++; // lazy about thread safety here + vlog_error("ERROR: CalcReferenceValuesComplete did not succeed! (%d)\n", + status); + gFailCount++; // lazy about thread safety here return; } - // Now we know that both results have been mapped back from the device, and the - // main thread is done calculating the reference results. It is now time to check - // the results. + // Now we know that both results have been mapped back from the device, and + // the main thread is done calculating the reference results. It is now time + // to check the results. // verify results void *mapped = info->p; - //Patch up NaNs conversions to integer to zero -- these can be converted to any integer - if( outType != kfloat && outType != kdouble ) + // Patch up NaNs conversions to integer to zero -- these can be converted to + // any integer + if (outType != kfloat && outType != kdouble) { - if( inType == kfloat ) + if (inType == kfloat) { - float *inp = (float*) gIn; - for( j = 0; j < count; j++ ) + float *inp = (float *)gIn; + for (j = 0; j < count; j++) { - if( isnan( inp[j] ) ) - memset( (char*) mapped + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] ); + if (isnan(inp[j])) + memset((char *)mapped + j * gTypeSizes[outType], 0, + gTypeSizes[outType]); } } - if( inType == kdouble ) + if (inType == kdouble) { - double *inp = (double*) gIn; - for( j = 0; j < count; j++ ) + double *inp = (double *)gIn; + for (j = 0; j < count; j++) { - if( isnan( inp[j] ) ) - memset( (char*) mapped + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] ); + if (isnan(inp[j])) + memset((char *)mapped + j * gTypeSizes[outType], 0, + gTypeSizes[outType]); } } } - else if( inType == kfloat || inType == kdouble ) - { // outtype and intype is float or double. NaN conversions for float <-> double can be any NaN - if( inType == kfloat && outType == kdouble ) + else if (inType == kfloat || inType == kdouble) + { // outtype and intype is float or double. NaN conversions for float <-> + // double can be any NaN + if (inType == kfloat && outType == kdouble) { - float *inp = (float*) gIn; - double *outp = (double*) mapped; - for( j = 0; j < count; j++ ) + float *inp = (float *)gIn; + double *outp = (double *)mapped; + for (j = 0; j < count; j++) { - if( isnan( inp[j] ) && isnan(outp[j]) ) - outp[j] = NAN; + if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN; } } - if( inType == kdouble && outType == kfloat ) + if (inType == kdouble && outType == kfloat) { - double *inp = (double*) gIn; - float *outp = (float*) mapped; - for( j = 0; j < count; j++ ) + double *inp = (double *)gIn; + float *outp = (float *)mapped; + for (j = 0; j < count; j++) { - if( isnan( inp[j] ) && isnan(outp[j]) ) - outp[j] = NAN; + if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN; } } } - if( memcmp( mapped, gRef, count * gTypeSizes[ outType ] ) ) - info->result = gCheckResults[outType]( mapped, gRef, gAllowZ, count, vectorSizes[vectorSize] ); + if (memcmp(mapped, gRef, count * gTypeSizes[outType])) + info->result = gCheckResults[outType](mapped, gRef, gAllowZ, count, + vectorSizes[vectorSize]); else info->result = 0; // Fill the output buffer with junk and release it { - cl_uint pattern = 0xffffdead; + cl_uint pattern = 0xffffdead; memset_pattern4(mapped, &pattern, count * gTypeSizes[outType]); - if((error = clEnqueueUnmapMemObject(gQueue, gOutBuffers[ vectorSize ], mapped, 0, NULL, NULL))) + if ((error = clEnqueueUnmapMemObject(gQueue, gOutBuffers[vectorSize], + mapped, 0, NULL, NULL))) { - vlog_error( "ERROR: clEnqueueUnmapMemObject failed in CalcReferenceValuesComplete (%d)\n", error ); + vlog_error("ERROR: clEnqueueUnmapMemObject failed in " + "CalcReferenceValuesComplete (%d)\n", + error); gFailCount++; } } - if( 1 == ThreadPool_AtomicAdd( &info->parent->barrierCount, -1) ) + if (1 == ThreadPool_AtomicAdd(&info->parent->barrierCount, -1)) { - if( (status = clSetUserEventStatus( doneBarrier, CL_COMPLETE) )) + if ((status = clSetUserEventStatus(doneBarrier, CL_COMPLETE))) { - vlog_error( "ERROR: clSetUserEventStatus failed in CalcReferenceValuesComplete (err: %d). We're probably going to deadlock.\n", status ); + vlog_error("ERROR: clSetUserEventStatus failed in " + "CalcReferenceValuesComplete (err: %d). We're probably " + "going to deadlock.\n", + status); gFailCount++; return; } - if( (status = clReleaseEvent( doneBarrier ) ) ) + if ((status = clReleaseEvent(doneBarrier))) { - vlog_error( "ERROR: clReleaseEvent failed in CalcReferenceValuesComplete (err: %d).\n", status ); + vlog_error("ERROR: clReleaseEvent failed in " + "CalcReferenceValuesComplete (err: %d).\n", + status); gFailCount++; return; } } - - - // e was already released by WriteInputBufferComplete. It should be destroyed automatically soon after - // all the calls to CalcReferenceValuesComplete exit. + // e was already released by WriteInputBufferComplete. It should be + // destroyed automatically soon after all the calls to + // CalcReferenceValuesComplete exit. } -static cl_program MakeProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, cl_kernel *outKernel ) +static cl_program MakeProgram(Type outType, Type inType, SaturationMode sat, + RoundingMode round, int vectorSize, + cl_kernel *outKernel) { cl_program program; char testName[256]; @@ -1559,7 +1635,8 @@ static cl_program MakeProgram( Type outType, Type inType, SaturationMode sat, if (outType == kdouble || inType == kdouble) source << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"; - // Create the program. This is a bit complicated because we are trying to avoid byte and short stores. + // Create the program. This is a bit complicated because we are trying to + // avoid byte and short stores. if (0 == vectorSize) { // Create the type names. @@ -1590,27 +1667,35 @@ static cl_program MakeProgram( Type outType, Type inType, SaturationMode sat, char outName[32]; switch (vectorSizetmp) { - case 1: - strncpy(inName, gTypeNames[inType], sizeof(inName)); - strncpy(outName, gTypeNames[outType], sizeof(outName)); - snprintf(convertString, sizeof(convertString), "convert_%s%s%s", outName, gSaturationNames[sat], gRoundingModeNames[round]); - snprintf(testName, 256, "test_%s_%s", convertString, inName); - vlog("Building %s( %s ) test\n", convertString, inName); - break; - case 3: - strncpy(inName, gTypeNames[inType], sizeof(inName)); - strncpy(outName, gTypeNames[outType], sizeof(outName)); - snprintf(convertString, sizeof(convertString), "convert_%s3%s%s", outName, gSaturationNames[sat], gRoundingModeNames[round]); - snprintf(testName, 256, "test_%s_%s3", convertString, inName); - vlog("Building %s( %s3 ) test\n", convertString, inName); - break; - default: - snprintf(inName, sizeof(inName), "%s%d", gTypeNames[inType], vectorSizetmp); - snprintf(outName, sizeof(outName), "%s%d", gTypeNames[outType], vectorSizetmp); - snprintf(convertString, sizeof(convertString), "convert_%s%s%s", outName, gSaturationNames[sat], gRoundingModeNames[round]); - snprintf(testName, 256, "test_%s_%s", convertString, inName); - vlog("Building %s( %s ) test\n", convertString, inName); - break; + case 1: + strncpy(inName, gTypeNames[inType], sizeof(inName)); + strncpy(outName, gTypeNames[outType], sizeof(outName)); + snprintf(convertString, sizeof(convertString), "convert_%s%s%s", + outName, gSaturationNames[sat], + gRoundingModeNames[round]); + snprintf(testName, 256, "test_%s_%s", convertString, inName); + vlog("Building %s( %s ) test\n", convertString, inName); + break; + case 3: + strncpy(inName, gTypeNames[inType], sizeof(inName)); + strncpy(outName, gTypeNames[outType], sizeof(outName)); + snprintf(convertString, sizeof(convertString), + "convert_%s3%s%s", outName, gSaturationNames[sat], + gRoundingModeNames[round]); + snprintf(testName, 256, "test_%s_%s3", convertString, inName); + vlog("Building %s( %s3 ) test\n", convertString, inName); + break; + default: + snprintf(inName, sizeof(inName), "%s%d", gTypeNames[inType], + vectorSizetmp); + snprintf(outName, sizeof(outName), "%s%d", gTypeNames[outType], + vectorSizetmp); + snprintf(convertString, sizeof(convertString), "convert_%s%s%s", + outName, gSaturationNames[sat], + gRoundingModeNames[round]); + snprintf(testName, 256, "test_%s_%s", convertString, inName); + vlog("Building %s( %s ) test\n", convertString, inName); + break; } fflush(stdout); @@ -1650,8 +1735,7 @@ static cl_program MakeProgram( Type outType, Type inType, SaturationMode sat, *outKernel = NULL; const char *flags = NULL; - if( gForceFTZ ) - flags = "-cl-denorms-are-zero"; + if (gForceFTZ) flags = "-cl-denorms-are-zero"; // build it std::string sourceString = source.str();