Remove almost duplicate compatibility common code

Use the non-compatibility version.

In each case the diff between the two copies was minimal and contained no
modifications that would invalidate compatibility testing, and it was clear
that the "latest/best" version was not the one in the compatibility copy.

Signed-off-by: Kevin Petit <kevin.petit@arm.com>
This commit is contained in:
Kevin Petit
2019-08-07 11:24:11 +01:00
committed by Kévin Petit
parent fba5b654e8
commit 4cb8fc49f8
12 changed files with 8 additions and 2966 deletions

View File

@@ -446,6 +446,7 @@ void ThreadPool_Init(void)
// Check for manual override of multithreading code. We add this for better debuggability.
if( getenv( "CL_TEST_SINGLE_THREADED" ) )
{
log_error("ERROR: CL_TEST_SINGLE_THREADED is set in the environment. Running single threaded.\n*** TEST IS INVALID! ***\n");
gThreadCount = 1;
return;
}

View File

@@ -1,899 +0,0 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "ThreadPool.h"
#include "errorHelpers.h"
#include "fpcontrol.h"
#include <stdio.h>
#include <stdlib.h>
#if defined( __APPLE__ ) || defined( __linux__ ) || defined( _WIN32 ) // or any other POSIX system
#if defined( _WIN32 )
#include <windows.h>
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#include "mingw_compat.h"
#include <process.h>
#else // !_WIN32
#include <pthread.h>
#include <unistd.h>
#include <sys/errno.h>
#endif // !_WIN32
// Forward declarations.
// The worker entry point has a different return type per platform: it is handed to
// _beginthread() on Windows and to pthread_create() elsewhere.
#ifdef _WIN32
void ThreadPool_WorkerFunc( void *p );
#else
void *ThreadPool_WorkerFunc( void *p );
#endif
void ThreadPool_Init(void);
void ThreadPool_Exit(void);
#if defined (__MINGW32__)
// Mutex for implementing super heavy atomic operations if you don't have GCC or MSVC
CRITICAL_SECTION gAtomicLock;
#elif defined( __GNUC__ ) || defined( _MSC_VER)
// GCC and MSVC use compiler intrinsics for atomics, so no lock object is declared here.
#else
// Any other compiler: atomics are emulated with a pthread mutex.
pthread_mutex_t gAtomicLock;
#endif
// Atomically adds b to *a and hands back the value *a held before the addition.
// A memory barrier is wanted here to protect state modified by the worker functions.
// The implementation is picked at compile time:
//   - MinGW32: gAtomicLock critical section around a plain read-modify-write
//   - GCC:     __sync_fetch_and_add builtin
//   - MSVC:    _InterlockedExchangeAdd intrinsic
//   - other:   gAtomicLock pthread mutex around a plain read-modify-write
cl_int ThreadPool_AtomicAdd( volatile cl_int *a, cl_int b )
{
#if defined (__MINGW32__)
    // No atomics on Mingw32
    cl_int previous;

    EnterCriticalSection(&gAtomicLock);
    previous = *a;
    *a = previous + b;
    LeaveCriticalSection(&gAtomicLock);

    return previous;
#elif defined( __GNUC__ )
    // GCC extension: http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
    // Open question kept from the original author: is an extra __sync_synchronize() needed,
    // or does __sync_fetch_and_add already synchronize?
    return __sync_fetch_and_add( a, b );
#elif defined( _MSC_VER )
    return (cl_int) _InterlockedExchangeAdd( (volatile LONG*) a, (LONG) b );
#else
    #warning Please add a atomic add implementation here, with memory barrier. Fallback code is slow.
    cl_int previous;

    if( pthread_mutex_lock(&gAtomicLock) )
    {
        log_error( "Atomic operation failed. pthread_mutex_lock(&gAtomicLock) returned an error\n");
    }

    previous = *a;
    *a = previous + b;

    if( pthread_mutex_unlock(&gAtomicLock) )
    {
        log_error( "Failed to release gAtomicLock. Further atomic operations may deadlock!\n");
    }

    return previous;
#endif
}
#if defined( _WIN32 )
// Uncomment the following line if Windows XP support is not required.
// #define HAS_INIT_ONCE_EXECUTE_ONCE 1
#if defined(HAS_INIT_ONCE_EXECUTE_ONCE)
// Native one-time initialization is available: map the private names straight onto the Win32 API.
#define _INIT_ONCE INIT_ONCE
#define _PINIT_ONCE PINIT_ONCE
#define _InitOnceExecuteOnce InitOnceExecuteOnce
#else // !HAS_INIT_ONCE_EXECUTE_ONCE
// Emulation: the control object is a single LONG that moves through the three states below.
typedef volatile LONG _INIT_ONCE;
typedef _INIT_ONCE *_PINIT_ONCE;
typedef BOOL (CALLBACK *_PINIT_ONCE_FN)(_PINIT_ONCE, PVOID, PVOID *);
#define _INIT_ONCE_UNINITIALIZED 0
#define _INIT_ONCE_IN_PROGRESS 1
#define _INIT_ONCE_DONE 2
// Runs InitFn once per control object.
//   InitOnce  - control word; expected to start at _INIT_ONCE_UNINITIALIZED (0).
//   InitFn    - callback invoked by the one thread that wins the compare-exchange.
//   Parameter - forwarded to InitFn.
//   Context   - forwarded to InitFn.
// Always returns TRUE. NOTE(review): the BOOL returned by InitFn is discarded, so a failed
// initialization is still recorded as done.
static BOOL _InitOnceExecuteOnce(
_PINIT_ONCE InitOnce,
_PINIT_ONCE_FN InitFn,
PVOID Parameter,
LPVOID *Context
)
{
// Loop until some thread has marked the control word as done.
while ( *InitOnce != _INIT_ONCE_DONE )
{
// Cheap read first; only attempt the interlocked UNINITIALIZED -> IN_PROGRESS transition
// when no other thread is already running the initializer.
if (*InitOnce != _INIT_ONCE_IN_PROGRESS && _InterlockedCompareExchange( InitOnce, _INIT_ONCE_IN_PROGRESS, _INIT_ONCE_UNINITIALIZED ) == _INIT_ONCE_UNINITIALIZED )
{
// This thread won the race: run the initializer and publish completion.
InitFn( InitOnce, Parameter, Context );
*InitOnce = _INIT_ONCE_DONE;
return TRUE;
}
// Another thread is initializing; back off briefly and re-check.
Sleep( 1 );
}
return TRUE;
}
#endif // !HAS_INIT_ONCE_EXECUTE_ONCE
// Uncomment the following line if Windows XP support is not required.
// #define HAS_CONDITION_VARIABLE 1
#if defined(HAS_CONDITION_VARIABLE)
// Native condition variables are available: map the private names straight onto the Win32 API.
#define _CONDITION_VARIABLE CONDITION_VARIABLE
#define _InitializeConditionVariable InitializeConditionVariable
#define _SleepConditionVariableCS SleepConditionVariableCS
#define _WakeAllConditionVariable WakeAllConditionVariable
#else // !HAS_CONDITION_VARIABLE
// Emulated condition variable built from an event handle and a critical section.
typedef struct
{
HANDLE mEvent; // Used to park the thread.
CRITICAL_SECTION mLock[1]; // Used to protect mWaiters, mGeneration and mReleaseCount.
volatile cl_int mWaiters; // Number of threads waiting on this cond var.
volatile cl_int mGeneration; // Wait generation count.
volatile cl_int mReleaseCount; // Number of releases to execute before resetting the event.
} _CONDITION_VARIABLE;
typedef _CONDITION_VARIABLE *_PCONDITION_VARIABLE;
// Prepares an emulated condition variable for use.
//   cond_var - object to initialize; its event handle, lock and all counters are set here.
// The event is created as manual-reset and initially non-signaled.
// NOTE(review): the result of CreateEvent is not checked.
static void _InitializeConditionVariable( _PCONDITION_VARIABLE cond_var )
{
    cond_var->mEvent = CreateEvent( NULL, TRUE, FALSE, NULL );
    InitializeCriticalSection( cond_var->mLock );
    cond_var->mWaiters = 0;
    cond_var->mGeneration = 0;
    // mReleaseCount is read by _SleepConditionVariableCS in every build configuration,
    // so it must be initialized unconditionally rather than only when NDEBUG is undefined.
    cond_var->mReleaseCount = 0;
}
// Emulated wait on a condition variable.
//   cond_var  - condition variable to wait on.
//   cond_lock - critical section held by the caller; released while waiting and
//               re-acquired before returning.
//   ignored   - timeout argument kept for signature parity with the native API; not used.
static void _SleepConditionVariableCS( _PCONDITION_VARIABLE cond_var, PCRITICAL_SECTION cond_lock, DWORD ignored)
{
// Register as a waiter and remember the generation that was current when the wait began.
EnterCriticalSection( cond_var->mLock );
cl_int generation = cond_var->mGeneration;
++cond_var->mWaiters;
LeaveCriticalSection( cond_var->mLock );
// Drop the caller's lock for the duration of the wait.
LeaveCriticalSection( cond_lock );
while ( TRUE )
{
WaitForSingleObject( cond_var->mEvent, INFINITE );
EnterCriticalSection( cond_var->mLock );
// Only accept the wake-up if releases are still pending and a wake happened after this
// thread started waiting (the generation has moved on).
BOOL done = cond_var->mReleaseCount > 0 && cond_var->mGeneration != generation;
LeaveCriticalSection( cond_var->mLock );
if ( done )
{
break;
}
}
// Re-acquire the caller's lock, then consume one release.
EnterCriticalSection( cond_lock );
EnterCriticalSection( cond_var->mLock );
// The thread consuming the final pending release resets the event.
if ( --cond_var->mReleaseCount == 0 )
{
ResetEvent( cond_var->mEvent );
}
--cond_var->mWaiters;
LeaveCriticalSection( cond_var->mLock );
}
// Emulated broadcast: releases every thread currently parked on cond_var.
// When nobody is waiting the call leaves the generation, release count and event untouched.
static void _WakeAllConditionVariable( _PCONDITION_VARIABLE cond_var )
{
    EnterCriticalSection( cond_var->mLock );
    {
        const cl_int parked = cond_var->mWaiters;

        if ( parked > 0 )
        {
            // Start a new wait generation and allow exactly the parked threads through.
            ++cond_var->mGeneration;
            cond_var->mReleaseCount = parked;
            SetEvent( cond_var->mEvent );
        }
    }
    LeaveCriticalSection( cond_var->mLock );
}
#endif // !HAS_CONDITION_VARIABLE
#endif // _WIN32
// Job counts must stay below this value; a run count at or above it tells workers to exit.
#define MAX_COUNT (1<<29)
// Global state to coordinate whether the threads have been launched successfully or not
#if defined( _MSC_VER ) && (_WIN32_WINNT >= 0x600)
static _INIT_ONCE threadpool_init_control;
#elif defined (_WIN32) // MinGW or XP
static int threadpool_init_control;
#else // Posix platforms
pthread_once_t threadpool_init_control = PTHREAD_ONCE_INIT;
#endif
cl_int threadPoolInitErr = -1; // set to CL_SUCCESS on successful thread launch
// critical region lock around ThreadPool_Do. We can only run one ThreadPool_Do at a time,
// because we are too lazy to set up a queue here, and don't expect to need one.
#if defined( _WIN32 )
CRITICAL_SECTION gThreadPoolLock[1];
#else // !_WIN32
pthread_mutex_t gThreadPoolLock;
#endif // !_WIN32
// Condition variable to park ThreadPool threads when not working
#if defined( _WIN32 )
CRITICAL_SECTION cond_lock[1];
_CONDITION_VARIABLE cond_var[1];
#else // !_WIN32
pthread_mutex_t cond_lock;
pthread_cond_t cond_var;
#endif // !_WIN32
volatile cl_int gRunCount = 0; // Condition variable state. How many iterations on the function left to run.
// set to CL_INT_MAX to cause worker threads to exit. Note: this value might go negative.
// State that only changes when the threadpool is not working.
volatile TPFuncPtr gFunc_ptr = NULL;
volatile void *gUserInfo = NULL;
volatile cl_int gJobCount = 0;
// State that may change while the thread pool is working
volatile cl_int jobError = CL_SUCCESS; // err code return for the job as a whole
// Condition variable to park caller while waiting
#if defined( _WIN32 )
HANDLE caller_event;
#else // !_WIN32
pthread_mutex_t caller_cond_lock;
pthread_cond_t caller_cond_var;
#endif // !_WIN32
volatile cl_int gRunning = 0; // # of threads intended to be running. Running threads will decrement this as they discover they've run out of work to do.
// The total number of threads launched.
volatile cl_int gThreadCount = 0;
// Entry point of every worker thread in the pool.
//
//   p - pointer to a shared cl_int/cl_uint counter. The worker atomically increments it and
//       uses the previous value as its thread ID.
//
// The worker repeatedly claims a job by atomically decrementing gRunCount. A claimed value
// "item" in [1, MAX_COUNT) means "run job item - 1"; a value <= 0 means there is nothing to
// do, in which case the worker parks on cond_var (the last worker to park wakes the caller);
// a value >= MAX_COUNT means the pool is shutting down and the worker exits.
// When a job returns non-zero, the first such error is recorded in jobError and gRunCount is
// forced to 0 so the remaining jobs are abandoned.
// On exit the worker decrements gThreadCount. Returns NULL on non-Windows platforms.
#ifdef _WIN32
void ThreadPool_WorkerFunc( void *p )
#else
void *ThreadPool_WorkerFunc( void *p )
#endif
{
    cl_uint threadID = ThreadPool_AtomicAdd( (volatile cl_int *) p, 1 );
    cl_int item = ThreadPool_AtomicAdd( &gRunCount, -1 );
    // log_info( "ThreadPool_WorkerFunc start: gRunning = %d\n", gRunning );

    while( MAX_COUNT > item )
    {
        cl_int err;

        // check for more work to do
        if( 0 >= item )
        {
            // log_info( "Thread %d has run out of work.\n", threadID );
            // No work to do. Attempt to block waiting for work
#if defined( _WIN32 )
            EnterCriticalSection( cond_lock );
#else // !_WIN32
            if((err = pthread_mutex_lock( &cond_lock) ))
            {
                log_error("Error %d from pthread_mutex_lock. Worker %d unable to block waiting for work. ThreadPool_WorkerFunc failed.\n", err, threadID );
                goto exit;
            }
#endif // !_WIN32

            cl_int remaining = ThreadPool_AtomicAdd( &gRunning, -1 );
            // log_info( "ThreadPool_WorkerFunc: gRunning = %d\n", remaining - 1 );
            if( 1 == remaining )
            { // last thread out signal the main thread to wake up
#if defined( _WIN32 )
                SetEvent( caller_event );
#else // !_WIN32
                // NOTE(review): the error exits below leave cond_lock (and possibly
                // caller_cond_lock) held.
                if((err = pthread_mutex_lock( &caller_cond_lock) ))
                {
                    log_error("Error %d from pthread_mutex_lock. Unable to wake caller.\n", err );
                    goto exit;
                }
                if( (err = pthread_cond_broadcast( &caller_cond_var )))
                {
                    log_error("Error %d from pthread_cond_broadcast. Unable to wake up main thread. ThreadPool_WorkerFunc failed.\n", err );
                    goto exit;
                }
                if((err = pthread_mutex_unlock( &caller_cond_lock) ))
                {
                    // Fixed: this failure comes from the unlock call, not the lock call.
                    log_error("Error %d from pthread_mutex_unlock. Unable to wake caller.\n", err );
                    goto exit;
                }
#endif // !_WIN32
            }

            // loop in case we are woken only to discover that some other thread already did all the work
            while( 0 >= item )
            {
#if defined( _WIN32 )
                _SleepConditionVariableCS( cond_var, cond_lock, INFINITE );
#else // !_WIN32
                if((err = pthread_cond_wait( &cond_var, &cond_lock) ))
                {
                    log_error("Error %d from pthread_cond_wait. Unable to block for waiting for work. ThreadPool_WorkerFunc failed.\n", err );
                    pthread_mutex_unlock( &cond_lock);
                    goto exit;
                }
#endif // !_WIN32

                // try again to get a valid item id
                item = ThreadPool_AtomicAdd( &gRunCount, -1 );
                if( MAX_COUNT <= item ) // exit if we are done
                {
#if defined( _WIN32 )
                    LeaveCriticalSection( cond_lock );
#else // !_WIN32
                    pthread_mutex_unlock( &cond_lock);
#endif // !_WIN32
                    goto exit;
                }
            }

            // This worker has work again: count it as running before releasing the lock.
            ThreadPool_AtomicAdd( &gRunning, 1 );
            // log_info( "Thread %d has found work.\n", threadID);

#if defined( _WIN32 )
            LeaveCriticalSection( cond_lock );
#else // !_WIN32
            if((err = pthread_mutex_unlock( &cond_lock) ))
            {
                log_error("Error %d from pthread_mutex_unlock. Unable to block for waiting for work. ThreadPool_WorkerFunc failed.\n", err );
                goto exit;
            }
#endif // !_WIN32
        }

        // we have a valid item, so do the work
        if( CL_SUCCESS == jobError ) // but only if we haven't already encountered an error
        {
            // log_info( "Thread %d doing job %d\n", threadID, item - 1);
#if defined(__APPLE__) && defined(__arm__)
            // On most platforms which support denorm, default is FTZ off. However,
            // on some hardware where the reference is computed, default might be flush denorms to zero e.g. arm.
            // This creates issues in result verification. Since spec allows the implementation to either flush or
            // not flush denorms to zero, an implementation may choose not be flush i.e. return denorm result whereas
            // reference result may be zero (flushed denorm). Hence we need to disable denorm flushing on host side
            // where reference is being computed to make sure we get non-flushed reference result. If implementation
            // returns flushed result, we correctly take care of that in verification code.
            FPU_mode_type oldMode;
            DisableFTZ( &oldMode );
#endif

            // Call the user's function with this item ID
            err = gFunc_ptr( item - 1, threadID, (void*) gUserInfo );
#if defined(__APPLE__) && defined(__arm__)
            // Restore FP state
            RestoreFPState( &oldMode );
#endif

            if( err )
            {
#if defined (__MINGW32__)
                EnterCriticalSection(&gAtomicLock);
                // Fixed: a stray ';' after this condition used to make the guard a no-op,
                // letting later failures overwrite the first recorded error.
                if( jobError == CL_SUCCESS )
                    jobError = err;
                gRunCount = 0;
                LeaveCriticalSection(&gAtomicLock);
#elif defined( __GNUC__ )
                // GCC extension: http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
                // set the new error if we are the first one there.
                __sync_val_compare_and_swap( &jobError, CL_SUCCESS, err );

                // drop run count to 0
                gRunCount = 0;
                __sync_synchronize();
#elif defined( _MSC_VER )
                // set the new error if we are the first one there.
                _InterlockedCompareExchange( (volatile LONG*) &jobError, err, CL_SUCCESS );

                // drop run count to 0
                gRunCount = 0;
                _mm_mfence();
#else
                if( pthread_mutex_lock(&gAtomicLock) )
                    log_error( "Atomic operation failed. pthread_mutex_lock(&gAtomicLock) returned an error\n");
                // Fixed: same stray ';' as in the MinGW branch; keep only the first error.
                if( jobError == CL_SUCCESS )
                    jobError = err;
                gRunCount = 0;
                if( pthread_mutex_unlock(&gAtomicLock) )
                    log_error( "Failed to release gAtomicLock. Further atomic operations may deadlock\n");
#endif
            }
        }

        // get the next item
        item = ThreadPool_AtomicAdd( &gRunCount, -1 );
    }

exit:
    log_info( "ThreadPool: thread %d exiting.\n", threadID );
    ThreadPool_AtomicAdd( &gThreadCount, -1 );
#if !defined(_WIN32)
    return NULL;
#endif
}
// SetThreadCount() may be used to artificially set the number of worker threads.
// A value of 0 (the default) means the number of threads is derived from the number of
// CPU cores. On a unicore machine 2 is used, so that thread safety still gets some testing.
//
// If count < 2 or the CL_TEST_SINGLE_THREADED environment variable is set, the code runs
// single threaded but reports an error to indicate that the test is invalid. This option is
// intended for debugging purposes only. It is suggested as a convention that test apps set
// the thread count to 1 in response to the -m flag.
//
// SetThreadCount() must be called before the first call to GetThreadCount() or
// ThreadPool_Do(), otherwise the behavior is undefined. If the pool has already been
// initialized successfully, the call logs an error and aborts the process.
void SetThreadCount( int count )
{
    if( threadPoolInitErr != CL_SUCCESS )
    {
        // Pool not up yet: just record the requested count.
        gThreadCount = count;
        return;
    }

    log_error( "Error: It is illegal to set the thread count after the first call to ThreadPool_Do or GetThreadCount\n" );
    abort();
}
// One-time set-up of the thread pool.
//
// Decides how many workers to run (honouring a count set earlier through gThreadCount and
// the CL_TEST_SINGLE_THREADED environment variable), initializes the synchronization
// objects, launches the workers and blocks until all of them have parked waiting for work.
//
// On success threadPoolInitErr is set to CL_SUCCESS. On the single-threaded paths and on
// most failures the function returns early with threadPoolInitErr left unchanged, which
// makes ThreadPool_Do run jobs on the calling thread.
//
// NOTE(review): if launching a thread fails part-way, gRunning still holds the originally
// requested count, so the "last worker parked" signal may never fire and the wait below may
// not return; threadPoolInitErr is also overwritten with CL_SUCCESS at the end. Confirm
// before relying on the failure path.
void ThreadPool_Init(void)
{
    cl_int i;
    int err;
    // Shared counter from which each worker draws its thread ID.
    volatile cl_uint threadID = 0;

    // Check for manual override of multithreading code. We add this for better debuggability.
    if( getenv( "CL_TEST_SINGLE_THREADED" ) )
    {
        log_error("ERROR: CL_TEST_SINGLE_THREADED is set in the environment. Running single threaded.\n*** TEST IS INVALID! ***\n");
        gThreadCount = 1;
        return;
    }

    // Figure out how many threads to run -- check first for non-zero to give the implementation the chance
    if( 0 == gThreadCount )
    {
#if defined(_MSC_VER) || defined (__MINGW64__)
        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL;
        DWORD length = 0;

        // First call only reports the required buffer size in length.
        GetLogicalProcessorInformation( NULL, &length );
        buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION) malloc( length );
        if( buffer != NULL && GetLogicalProcessorInformation( buffer, &length ) == TRUE )
        {
            PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
            while( ptr < &buffer[ length / sizeof( SYSTEM_LOGICAL_PROCESSOR_INFORMATION ) ] )
            {
                if( ptr->Relationship == RelationProcessorCore )
                {
                    // Count the number of bits in ProcessorMask (number of logical cores).
                    // ProcessorMask is pointer-sized; a ULONG would drop the upper 32 bits on Win64.
                    ULONG_PTR mask = ptr->ProcessorMask;
                    while( mask )
                    {
                        ++gThreadCount;
                        mask &= mask - 1; // Remove 1 bit at a time
                    }
                }
                ++ptr;
            }
        }
        // Release the buffer on every path, including a failed second query.
        free(buffer);
#elif defined (__MINGW32__)
        {
            #warning How about this, instead of hard coding it to 2?
            SYSTEM_INFO sysinfo;
            GetSystemInfo( &sysinfo );
            gThreadCount = sysinfo.dwNumberOfProcessors;
        }
#else // !_WIN32
        gThreadCount = (cl_int) sysconf(_SC_NPROCESSORS_CONF); // Hopefully your system returns logical cpus here, as does MacOS X
#endif // !_WIN32

        // Multithreaded tests are required to run multithreaded even on unicore systems so as to test thread safety
        if( 1 == gThreadCount )
            gThreadCount = 2;
    }

    // Allow the app to set thread count to <0 for debugging purposes. This will cause the test to run single threaded.
    if( gThreadCount < 2 )
    {
        log_error( "ERROR: Running single threaded because thread count < 2. \n*** TEST IS INVALID! ***\n");
        gThreadCount = 1;
        return;
    }

#if defined( _WIN32 )
    InitializeCriticalSection( gThreadPoolLock );
    InitializeCriticalSection( cond_lock );
    _InitializeConditionVariable( cond_var );
    // Auto-reset event, initially non-signaled.
    caller_event = CreateEvent( NULL, FALSE, FALSE, NULL );
#else
    // Dont rely on PTHREAD_MUTEX_INITIALIZER for intialization of a mutex since it might cause problem
    // with some flavors of gcc compilers. The objects are initialized for every POSIX compiler,
    // not only GCC, because they are defined without static initializers.
    pthread_cond_init(&cond_var, NULL);
    pthread_mutex_init(&cond_lock ,NULL);
    pthread_cond_init(&caller_cond_var, NULL);
    pthread_mutex_init(&caller_cond_lock, NULL);
    pthread_mutex_init(&gThreadPoolLock, NULL);
#endif

#if !(defined(__GNUC__) || defined(_MSC_VER) || defined(__MINGW32__))
    // Fixed: pthread_mutex_initialize() is not a pthread function.
    pthread_mutex_init(&gAtomicLock, NULL);
#elif defined (__MINGW32__)
    InitializeCriticalSection(&gAtomicLock);
#endif

    // Make sure the last thread done in the work pool doesn't signal us to wake before we get to the point where we are supposed to wait
    // That would cause a deadlock.
#if !defined( _WIN32 )
    if((err = pthread_mutex_lock( &caller_cond_lock) ))
    {
        log_error("Error %d from pthread_mutex_lock. Unable to block for work to finish. ThreadPool_Init failed.\n", err );
        gThreadCount = 1;
        return;
    }
#endif // !_WIN32

    gRunning = gThreadCount;

    // init threads
    for( i = 0; i < gThreadCount; i++ )
    {
#if defined( _WIN32 )
        uintptr_t handle = _beginthread(ThreadPool_WorkerFunc, 0, (void*) &threadID);
        err = ( handle == 0 );
#else // !_WIN32
        pthread_t tid = 0;
        err = pthread_create( &tid, NULL, ThreadPool_WorkerFunc, (void*) &threadID );
#endif // !_WIN32
        if( err )
        {
            log_error( "Error %d launching thread %d\n", err, i );
            threadPoolInitErr = err;
            gThreadCount = i;
            break;
        }
    }

    atexit( ThreadPool_Exit );

    // block until they are done launching.
    // Each worker decrements gRunCount once on start-up, so the count reaches
    // -gThreadCount when every worker has gone through its first claim attempt.
    do
    {
#if defined( _WIN32 )
        WaitForSingleObject( caller_event, INFINITE );
#else // !_WIN32
        if((err = pthread_cond_wait( &caller_cond_var, &caller_cond_lock) ))
        {
            log_error("Error %d from pthread_cond_wait. Unable to block for work to finish. ThreadPool_Init failed.\n", err );
            pthread_mutex_unlock( &caller_cond_lock);
            return;
        }
#endif // !_WIN32
    }
    while( gRunCount != -gThreadCount );

#if !defined( _WIN32 )
    if((err = pthread_mutex_unlock( &caller_cond_lock) ))
    {
        log_error("Error %d from pthread_mutex_unlock. Unable to block for work to finish. ThreadPool_Init failed.\n", err );
        return;
    }
#endif // !_WIN32

    threadPoolInitErr = CL_SUCCESS;
}
#if defined(_MSC_VER)
// Adapter with the one-time-initialization callback signature: it simply runs
// ThreadPool_Init() and reports success. None of the three arguments is used.
static BOOL CALLBACK _ThreadPool_Init(_PINIT_ONCE InitOnce, PVOID Parameter, PVOID *lpContex)
{
    (void) InitOnce;
    (void) Parameter;
    (void) lpContex;

    ThreadPool_Init();

    return TRUE;
}
#endif
// Shuts the pool down (registered with atexit by ThreadPool_Init).
// Sets gRunCount to CL_INT_MAX, which workers treat as the signal to exit, then wakes the
// parked workers up to 1000 times, pausing about a millisecond between attempts, until
// gThreadCount drops to zero. Logs whether the shutdown completed or timed out.
void ThreadPool_Exit(void)
{
    int attempt = 0;

    gRunCount = CL_INT_MAX;

    // Publish the new run count before waking anybody.
#if defined( __GNUC__ )
    // GCC extension: http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
    __sync_synchronize();
#elif defined( _MSC_VER )
    _mm_mfence();
#else
    #warning If this is a weakly ordered memory system, please add a memory barrier here to force this and everything else to memory before we proceed
#endif

    // spin waiting for threads to die
    while( 0 != gThreadCount && attempt < 1000 )
    {
#if defined( _WIN32 )
        _WakeAllConditionVariable( cond_var );
        Sleep(1);
#else // !_WIN32
        int err = pthread_cond_broadcast( &cond_var );
        if( err )
        {
            log_error("Error %d from pthread_cond_broadcast. Unable to wake up work threads. ThreadPool_Exit failed.\n", err );
            break;
        }
        usleep(1000);
#endif // !_WIN32
        ++attempt;
    }

    if( gThreadCount )
    {
        log_error( "Error: Thread pool timed out after 1 second with %d threads still active.\n", gThreadCount );
    }
    else
    {
        log_info( "Thread pool exited in a orderly fashion.\n" );
    }
}
// Blocking API that farms out count jobs to a thread pool.
// It may return with some work undone if func_ptr() returns a non-zero
// result.
//
//   func_ptr - job function, called as func_ptr( jobIndex, threadID, userInfo ) with
//              jobIndex in [0, count).
//   count    - number of jobs; must be below MAX_COUNT when the pool is active.
//   userInfo - opaque pointer forwarded to every job.
//
// Returns CL_SUCCESS when all jobs succeeded, the error recorded for the job otherwise,
// -1 when count is too large, or a pthread error code if locking or signalling failed.
// If the pool could not be (or was not) started, the jobs are run one after another on the
// calling thread.
//
// This function obviously has its shortcomings. Only one call to ThreadPool_Do
// can be running at a time. It is not intended for general purpose use.
// If clEnqueueNativeKernelFn, out of order queues and a CL_DEVICE_TYPE_CPU were
// all available then it would make more sense to use those features.
cl_int ThreadPool_Do( TPFuncPtr func_ptr,
                      cl_uint count,
                      void *userInfo )
{
    cl_int newErr;
    cl_int err = 0;

    // Lazily set up our threads
#if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600)
    err = !_InitOnceExecuteOnce( &threadpool_init_control, _ThreadPool_Init, NULL, NULL );
#elif defined (_WIN32)
    if (threadpool_init_control == 0) {
        #warning This is buggy and race prone. Find a better way.
        ThreadPool_Init();
        threadpool_init_control = 1;
    }
#else //posix platform
    err = pthread_once( &threadpool_init_control, ThreadPool_Init );
    if( err )
    {
        log_error("Error %d from pthread_once. Unable to init threads. ThreadPool_Do failed.\n", err );
        return err;
    }
#endif

    // Single threaded code to handle case where threadpool wasn't allocated or was disabled by environment variable
    if( threadPoolInitErr )
    {
        cl_uint currentJob = 0;
        cl_int result = CL_SUCCESS;

#if defined(__APPLE__) && defined(__arm__)
        // On most platforms which support denorm, default is FTZ off. However,
        // on some hardware where the reference is computed, default might be flush denorms to zero e.g. arm.
        // This creates issues in result verification. Since spec allows the implementation to either flush or
        // not flush denorms to zero, an implementation may choose not be flush i.e. return denorm result whereas
        // reference result may be zero (flushed denorm). Hence we need to disable denorm flushing on host side
        // where reference is being computed to make sure we get non-flushed reference result. If implementation
        // returns flushed result, we correctly take care of that in verification code.
        FPU_mode_type oldMode;
        DisableFTZ( &oldMode );
#endif
        for( currentJob = 0; currentJob < count; currentJob++ )
            if((result = func_ptr( currentJob, 0, userInfo )))
            {
#if defined(__APPLE__) && defined(__arm__)
                // Restore FP state before leaving
                RestoreFPState( &oldMode );
#endif
                return result;
            }

#if defined(__APPLE__) && defined(__arm__)
        // Restore FP state before leaving
        RestoreFPState( &oldMode );
#endif

        return CL_SUCCESS;
    }

    // Nothing to do. Return right away, as the single-threaded path does: with a run count
    // of zero the workers would wake, find no work and park again without ever signalling
    // the caller, leaving this function blocked in the wait loop below.
    if( 0 == count )
        return CL_SUCCESS;

    if( count >= MAX_COUNT )
    {
        // count is unsigned, so print it with %u.
        log_error("Error: ThreadPool_Do count %u >= max threadpool count of %d\n", count, MAX_COUNT );
        return -1;
    }

    // Enter critical region
#if defined( _WIN32 )
    EnterCriticalSection( gThreadPoolLock );
#else // !_WIN32
    if( (err = pthread_mutex_lock( &gThreadPoolLock )))
    {
        switch (err)
        {
            case EDEADLK:
                log_error("Error EDEADLK returned in ThreadPool_Do(). ThreadPool_Do is not designed to work recursively!\n" );
                break;
            case EINVAL:
                log_error("Error EINVAL returned in ThreadPool_Do(). How did we end up with an invalid gThreadPoolLock?\n" );
                break;
            default:
                break;
        }
        return err;
    }
#endif // !_WIN32

    // Start modifying the job state observable by worker threads
    // NOTE(review): the "goto exit" error paths below release gThreadPoolLock only; cond_lock
    // and caller_cond_lock may still be held when they are taken.
#if defined( _WIN32 )
    EnterCriticalSection( cond_lock );
#else // !_WIN32
    if((err = pthread_mutex_lock( &cond_lock) ))
    {
        log_error("Error %d from pthread_mutex_lock. Unable to wake up work threads. ThreadPool_Do failed.\n", err );
        goto exit;
    }
#endif // !_WIN32

    // Make sure the last thread done in the work pool doesn't signal us to wake before we get to the point where we are supposed to wait
    // That would cause a deadlock.
#if !defined( _WIN32 )
    if((err = pthread_mutex_lock( &caller_cond_lock) ))
    {
        log_error("Error %d from pthread_mutex_lock. Unable to block for work to finish. ThreadPool_Do failed.\n", err );
        goto exit;
    }
#endif // !_WIN32

    // Prime the worker threads to get going
    jobError = CL_SUCCESS;
    gRunCount = gJobCount = count;
    gFunc_ptr = func_ptr;
    gUserInfo = userInfo;

#if defined( _WIN32 )
    _WakeAllConditionVariable( cond_var );
    LeaveCriticalSection( cond_lock );
#else // !_WIN32
    if( (err = pthread_cond_broadcast( &cond_var )))
    {
        log_error("Error %d from pthread_cond_broadcast. Unable to wake up work threads. ThreadPool_Do failed.\n", err );
        goto exit;
    }
    if((err = pthread_mutex_unlock( &cond_lock) ))
    {
        log_error("Error %d from pthread_mutex_unlock. Unable to wake up work threads. ThreadPool_Do failed.\n", err );
        goto exit;
    }
#endif // !_WIN32

    // block until they are done. It would be slightly more efficient to do some of the work here though.
    do
    {
#if defined( _WIN32 )
        WaitForSingleObject( caller_event, INFINITE );
#else // !_WIN32
        if((err = pthread_cond_wait( &caller_cond_var, &caller_cond_lock) ))
        {
            log_error("Error %d from pthread_cond_wait. Unable to block for work to finish. ThreadPool_Do failed.\n", err );
            pthread_mutex_unlock( &caller_cond_lock);
            goto exit;
        }
#endif // !_WIN32
    }
    while( gRunning );

#if !defined(_WIN32)
    if((err = pthread_mutex_unlock( &caller_cond_lock) ))
    {
        log_error("Error %d from pthread_mutex_unlock. Unable to block for work to finish. ThreadPool_Do failed.\n", err );
        goto exit;
    }
#endif // !_WIN32

    err = jobError;

exit:
    // exit critical region
#if defined( _WIN32 )
    LeaveCriticalSection( gThreadPoolLock );
#else // !_WIN32
    newErr = pthread_mutex_unlock( &gThreadPoolLock );
    if( newErr)
    {
        log_error("Error %d from pthread_mutex_unlock. Unable to exit critical region. ThreadPool_Do failed.\n", newErr );
        return err;
    }
#endif // !_WIN32

    return err;
}
cl_uint GetThreadCount( void )
{
// Lazily set up our threads
#if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600)
cl_int err = !_InitOnceExecuteOnce( &threadpool_init_control, _ThreadPool_Init, NULL, NULL );
#elif defined (_WIN32)
if (threadpool_init_control == 0) {
#warning This is buggy and race prone. Find a better way.
ThreadPool_Init();
threadpool_init_control = 1;
}
#else
cl_int err = pthread_once( &threadpool_init_control, ThreadPool_Init );
if( err )
{
log_error("Error %d from pthread_once. Unable to init threads. ThreadPool_Do failed.\n", err );
return err;
}
#endif // !_WIN32
if( gThreadCount < 1 )
return 1;
return gThreadCount;
}
#else
#ifndef MY_OS_REALLY_REALLY_DOESNT_SUPPORT_THREADS
#error ThreadPool implementation has not been multithreaded for this operating system. You must multithread this section.
#endif
//
// We require multithreading in parts of the test as a means of simultaneously testing reentrancy requirements
// of OpenCL API, while also checking
//
// A sample single threaded implementation follows, for documentation / bootstrapping purposes.
// It is not okay to use this for conformance testing!!!
//
// Exception: If your operating system does not support multithreaded execution of any kind, then you may use this code.
//
// Single-threaded stand-in for the atomic add: adds b to *a and returns the value *a held
// beforehand. No locking or barrier is used because this fallback path is not multithreaded;
// if your operating system supports memory-barrier atomics, use those here instead.
cl_int ThreadPool_AtomicAdd( volatile cl_int *a, cl_int b )
{
    const cl_uint previous = *a;

    *a = previous + b;

    return previous;
}
// Single-threaded reference version of the blocking job API.
// Runs jobs 0 .. count-1 in order on the calling thread, always passing thread ID 0, and
// stops at the first job that returns a non-zero result, which is then returned.
// Returns CL_SUCCESS if every job succeeded.
// Unless MY_OS_REALLY_REALLY_DOESNT_SUPPORT_THREADS is defined, the function logs an error
// and terminates the process instead of running anything.
cl_int ThreadPool_Do( TPFuncPtr func_ptr,
                      cl_uint count,
                      void *userInfo )
{
    cl_uint job = 0;
    cl_int status = CL_SUCCESS;

#ifndef MY_OS_REALLY_REALLY_DOESNT_SUPPORT_THREADS
    // THIS FUNCTION IS NOT INTENDED FOR USE!!
    log_error( "ERROR: Test must be multithreaded!\n" );
    exit(-1);
#else
    // Emit the single-threaded warning only on the first call.
    static int warned = 0;
    if( 0 == warned )
    {
        log_info( "\nWARNING: The operating system is claimed not to support threads of any sort. Running single threaded.\n" );
        warned = 1;
    }
#endif

    // The multithreaded code should mimic this behavior:
    while( job < count )
    {
        status = func_ptr( job, 0, userInfo );
        if( status )
        {
            return status;
        }
        job++;
    }

    return CL_SUCCESS;
}
// Single-threaded fallback: there is only ever the calling thread.
cl_uint GetThreadCount( void )
{
    const cl_uint only_thread = 1;

    return only_thread;
}
// Single-threaded fallback: the thread count cannot be changed. A request for more than
// one thread is reported with a warning; anything else is silently accepted.
void SetThreadCount( int count )
{
    if( count <= 1 )
    {
        return;
    }

    log_info( "WARNING: SetThreadCount(%d) ignored\n", count );
}
#endif

View File

@@ -1,393 +0,0 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
/*
Header compat.h should be used instead of stdlib.h, stdbool.h, stdint.h, float.h, fenv.h,
math.h. It provides workarounds if these headers are not available or not complete.
Important: It should be included before math.h, directly or indirectly, because Intel mathimf.h
is not compatible with Microsoft math.h. Including math.h before mathimf.h causes compile-time
error.
*/
#ifndef _COMPAT_H_
#define _COMPAT_H_
#if defined(_WIN32) && defined (_MSC_VER)
#include <Windows.h>
#endif
// EXTERN_C expands to extern "C" when compiled as C++ and to nothing when compiled as C,
// so the same declaration can be used from both languages.
#ifdef __cplusplus
#define EXTERN_C extern "C"
#else
#define EXTERN_C
#endif
//
// stdlib.h
//
#include <stdlib.h> // On Windows, _MAX_PATH defined there.
// llabs appeared in MS C v16 (VS 10/2010); supply a replacement for older Microsoft
// compilers (_MSC_VER <= 1500). Returns the absolute value of its argument.
#if defined( _MSC_VER ) && _MSC_VER <= 1500
EXTERN_C inline long long llabs(long long value)
{
    if (value < 0)
        return -value;
    return value;
}
#endif
//
// stdbool.h
//
// stdbool.h appeared in MS C v18 (VS 12/2013).
// For older Microsoft compilers (_MSC_VER <= 1700) provide bool/true/false by hand when
// compiling as C; every other compiler gets the real header.
// Fixed: the version test used the undefined macro MSC_VER (which evaluates to 0 in #if),
// so the hand-written fallback was selected on every Microsoft compiler.
#if defined( _MSC_VER ) && _MSC_VER <= 1700
    #if !defined(__cplusplus)
        typedef char bool;
        #define true 1
        #define false 0
    #endif
#else
    #include <stdbool.h>
#endif
//
// stdint.h
//
// stdint.h appeared in MS C v16 (VS 10/2010) and Intel C v12.
// Older Microsoft (_MSC_VER <= 1500) and Intel (< 1200) compilers get hand-written
// fixed-width typedefs; everything else includes the real header with the limit macros
// enabled.
#if defined( _MSC_VER ) && ( ( ! defined( __INTEL_COMPILER ) && _MSC_VER <= 1500 ) || ( defined( __INTEL_COMPILER ) && __INTEL_COMPILER < 1200 ) )
    typedef unsigned char uint8_t;
    // Plain char has implementation-defined signedness, so spell out signed char.
    typedef signed char int8_t;
    typedef unsigned short uint16_t;
    typedef short int16_t;
    typedef unsigned int uint32_t;
    typedef int int32_t;
    typedef unsigned long long uint64_t;
    typedef long long int64_t;
#else
    #ifndef __STDC_LIMIT_MACROS
        #define __STDC_LIMIT_MACROS
    #endif
    #include <stdint.h>
#endif
//
// float.h
//
#include <float.h>
//
// fenv.h
//
// fenv.h appeared in MS C v18 (VS 12/2013).
#if defined( _MSC_VER ) && _MSC_VER <= 1700 && ! defined( __INTEL_COMPILER )
// reimplement fenv.h because windows doesn't have it
// Exception flag bits; FE_ALL_EXCEPT is the bitwise OR of the five flags below.
#define FE_INEXACT 0x0020
#define FE_UNDERFLOW 0x0010
#define FE_OVERFLOW 0x0008
#define FE_DIVBYZERO 0x0004
#define FE_INVALID 0x0001
#define FE_ALL_EXCEPT 0x003D
// Declared here only; the definitions are not in this header.
int fetestexcept(int excepts);
int feclearexcept(int excepts);
#else
#include <fenv.h>
#endif
//
// math.h
//
#if defined( __INTEL_COMPILER )
#include <mathimf.h>
#else
#include <math.h>
#endif
#if defined( _MSC_VER )
#ifdef __cplusplus
extern "C" {
#endif
#ifndef M_PI
#define M_PI 3.14159265358979323846264338327950288
#endif
#if ! defined( __INTEL_COMPILER )
#ifndef NAN
#define NAN (INFINITY - INFINITY)
#endif
#ifndef HUGE_VALF
#define HUGE_VALF (float)HUGE_VAL
#endif
#ifndef INFINITY
#define INFINITY (FLT_MAX + FLT_MAX)
#endif
#ifndef isfinite
#define isfinite(x) _finite(x)
#endif
#ifndef isnan
#define isnan( x ) ((x) != (x))
#endif
#ifndef isinf
#define isinf( _x) ((_x) == INFINITY || (_x) == -INFINITY)
#endif
double rint( double x);
float rintf( float x);
long double rintl( long double x);
float cbrtf( float );
double cbrt( double );
int ilogb( double x);
int ilogbf (float x);
int ilogbl(long double x);
double fmax(double x, double y);
double fmin(double x, double y);
float fmaxf( float x, float y );
float fminf(float x, float y);
double log2(double x);
long double log2l(long double x);
double exp2(double x);
long double exp2l(long double x);
double fdim(double x, double y);
float fdimf(float x, float y);
long double fdiml(long double x, long double y);
double remquo( double x, double y, int *quo);
float remquof( float x, float y, int *quo);
long double remquol( long double x, long double y, int *quo);
long double scalblnl(long double x, long n);
// snprintf added in _MSC_VER == 1900 (Visual Studio 2015)
#if defined( _MSC_VER ) && _MSC_VER < 1900
#define snprintf sprintf_s
#endif
float hypotf(float x, float y);
long double hypotl(long double x, long double y) ;
double lgamma(double x);
float lgammaf(float x);
double trunc(double x);
float truncf(float x);
double log1p(double x);
float log1pf(float x);
long double log1pl(long double x);
double copysign(double x, double y);
float copysignf(float x, float y);
long double copysignl(long double x, long double y);
long lround(double x);
long lroundf(float x);
//long lroundl(long double x)
double round(double x);
float roundf(float x);
long double roundl(long double x);
// Sign-bit helpers. cf_signbit / cf_signbitf are only declared here (definitions live
// elsewhere); signbit / signbitf are thin per-translation-unit wrappers around them.
int cf_signbit(double x);
int cf_signbitf(float x);
// Added in _MSC_VER == 1800 (Visual Studio 2013)
#if defined( _MSC_VER ) && _MSC_VER < 1800
// Only supplied for compilers older than the version noted above.
static int signbit(double x) { return cf_signbit(x); }
#endif
// Always supplied, regardless of compiler version.
static int signbitf(float x) { return cf_signbitf(x); }
long int lrint (double flt);
long int lrintf (float flt);
float int2float (int32_t ix);
int32_t float2int (float fx);
#endif
#if ! defined( __INTEL_COMPILER ) || __INTEL_COMPILER < 1300
// These functions appeared in Intel C v13.
float nanf( const char* str);
double nan( const char* str);
long double nanl( const char* str);
#endif
#ifdef __cplusplus
}
#endif
#endif
#if defined( __ANDROID__ )
#define log2(X) (log(X)/log(2))
#endif
//
// stdio.h
//
//
// unistd.h
//
#if defined( _MSC_VER )
EXTERN_C unsigned int sleep( unsigned int sec );
EXTERN_C int usleep( int usec );
#endif
//
// syscall.h
//
#if defined( __ANDROID__ )
// Android bionic's isn't providing SYS_sysctl wrappers.
#define SYS__sysctl __NR__sysctl
#endif
// Some tests use _malloca which defined in malloc.h.
#if !defined (__APPLE__)
#include <malloc.h>
#endif
//
// ???
//
#if defined( _MSC_VER )
#define MAXPATHLEN _MAX_PATH
EXTERN_C uint64_t ReadTime( void );
EXTERN_C double SubtractTime( uint64_t endTime, uint64_t startTime );
/** Returns the number of leading 0-bits in x,
starting at the most significant bit position.
If x is 0, the result is undefined.
*/
EXTERN_C int __builtin_clz(unsigned int pattern);
#endif
// Generic two-argument minimum / maximum, defined only if nothing else provided them.
// Function-like macros: each argument is evaluated twice, so avoid arguments with side effects.
#ifndef MIN
#define MIN(x,y) (((x)<(y))?(x):(y))
#endif
#ifndef MAX
#define MAX(x,y) (((x)>(y))?(x):(y))
#endif
/*
------------------------------------------------------------------------------------------------
WARNING: DO NOT USE THESE MACROS: MAKE_HEX_FLOAT, MAKE_HEX_DOUBLE, MAKE_HEX_LONG.
This is a typical usage of the macros:
double yhi = MAKE_HEX_DOUBLE(0x1.5555555555555p-2,0x15555555555555LL,-2);
(taken from math_brute_force/reference_math.c). There are two problems:
1. There is an error here. On Windows it will produce the incorrect result
`0x1.5555555555555p+50'. To have a correct result it should be written as
`MAKE_HEX_DOUBLE(0x1.5555555555555p-2,0x15555555555555LL,-54)'. A proper value of the
third argument is not obvious -- sometimes it should be the same as exponent of the
first argument, but sometimes not.
2. Information is duplicated. It is easy to make a mistake.
Use HEX_FLT, HEX_DBL, HEX_LDBL macros instead (see them in the bottom of the file).
------------------------------------------------------------------------------------------------
*/
// Legacy hex-float helpers (x = hex floating literal, y = integer mantissa, z = binary exponent).
// On MSVC without the Intel compiler the literal x is ignored and the value is rebuilt as
// ldexp(y, z); on every other compiler y and z are ignored and x is used verbatim.
#if defined ( _MSC_VER ) && ! defined( __INTEL_COMPILER )
#define MAKE_HEX_FLOAT(x,y,z) ((float)ldexp( (float)(y), z))
#define MAKE_HEX_DOUBLE(x,y,z) ldexp( (double)(y), z)
#define MAKE_HEX_LONG(x,y,z) ((long double) ldexp( (long double)(y), z))
#else
// Do not use these macros in new code, use HEX_FLT, HEX_DBL, HEX_LDBL instead.
#define MAKE_HEX_FLOAT(x,y,z) x
#define MAKE_HEX_DOUBLE(x,y,z) x
#define MAKE_HEX_LONG(x,y,z) x
#endif
/*
------------------------------------------------------------------------------------------------
HEX_FLT, HEX_DBL, HEX_LDBL -- Create hex floating point literal of type float, double, long
double respectively. Arguments:
sm -- sign of number,
int -- integer part of mantissa (without `0x' prefix),
fract -- fractional part of mantissa (without decimal point and `L' or `LL' suffixes),
se -- sign of exponent,
exp -- absolute value of (binary) exponent.
Example:
double yhi = HEX_DBL( +, 1, 5555555555555, -, 2 ); // == 0x1.5555555555555p-2
Note:
We have to pass signs as separate arguments because gcc pass negative integer values
(e. g. `-2') into a macro as two separate tokens, so `HEX_FLT( 1, 0, -2 )' produces result
`0x1.0p- 2' (note a space between minus and two) which is not a correct floating point
literal.
------------------------------------------------------------------------------------------------
*/
// HEX_FLT / HEX_DBL / HEX_LDBL: build a floating-point value from the pieces of a hex literal.
// MSVC branch: pastes int and fract into one hex integer (0x<int><fract>), then scales it with
// ldexp*. The exponent passed is `se exp` corrected by ilogb(0x<int>) - ilogb(0x<int><fract>),
// i.e. by the number of binary digits that fract contributed.
// Other compilers: the tokens are pasted straight into a hex floating literal.
#if defined ( _MSC_VER ) && ! defined( __INTEL_COMPILER )
// If compiler does not support hex floating point literals:
#define HEX_FLT( sm, int, fract, se, exp ) sm ldexpf( (float)( 0x ## int ## fract ## UL ), se exp + ilogbf( (float) 0x ## int ) - ilogbf( ( float )( 0x ## int ## fract ## UL ) ) )
#define HEX_DBL( sm, int, fract, se, exp ) sm ldexp( (double)( 0x ## int ## fract ## ULL ), se exp + ilogb( (double) 0x ## int ) - ilogb( ( double )( 0x ## int ## fract ## ULL ) ) )
#define HEX_LDBL( sm, int, fract, se, exp ) sm ldexpl( (long double)( 0x ## int ## fract ## ULL ), se exp + ilogbl( (long double) 0x ## int ) - ilogbl( ( long double )( 0x ## int ## fract ## ULL ) ) )
#else
// If compiler supports hex floating point literals: just concatenate all the parts into a literal.
#define HEX_FLT( sm, int, fract, se, exp ) sm 0x ## int ## . ## fract ## p ## se ## exp ## F
#define HEX_DBL( sm, int, fract, se, exp ) sm 0x ## int ## . ## fract ## p ## se ## exp
#define HEX_LDBL( sm, int, fract, se, exp ) sm 0x ## int ## . ## fract ## p ## se ## exp ## L
#endif
#if defined(__MINGW32__)
#include <Windows.h>
#define sleep(sec) Sleep((sec) * 1000)
#endif
#endif // _COMPAT_H_

View File

@@ -1,164 +0,0 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef _errorHelpers_h
#define _errorHelpers_h
#include <sstream>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif
#define LOWER_IS_BETTER 0
#define HIGHER_IS_BETTER 1
// If USE_ATF is defined, all log_error and log_info calls can be routed to test library
// functions as described below. This is helpful for integration into an automated testing
// system.
// Logging front-end selection.
// With USE_ATF every logging macro forwards to the ATF test library; otherwise they map onto
// printf-family functions. vlog / vlog_error additionally pick a "%a"-capable printer on Windows.
#if USE_ATF
// export BUILD_WITH_ATF=1
#include <ATF/ATF.h>
#define test_start() ATFTestStart()
#define log_info ATFLogInfo
#define log_error ATFLogError
#define log_perf(_number, _higherBetter, _numType, _format, ...) ATFLogPerformanceNumber(_number, _higherBetter, _numType, _format, ##__VA_ARGS__)
#define test_finish() ATFTestFinish()
#define vlog_perf(_number, _higherBetter, _numType, _format, ...) ATFLogPerformanceNumber(_number, _higherBetter, _numType, _format,##__VA_ARGS__)
#define vlog ATFLogInfo
#define vlog_error ATFLogError
#else
// No ATF: test_start/test_finish expand to nothing, and errors go to stdout just like info.
#define test_start()
#define log_info printf
#define log_error printf
// _format must be a string literal (it is concatenated into the format string), _numType is
// printed with %s and _number with %g, so _number has to be a double-typed expression.
#define log_perf(_number, _higherBetter, _numType, _format, ...) printf("Performance Number " _format " (in %s, %s): %g\n",##__VA_ARGS__, _numType, \
_higherBetter?"higher is better":"lower is better", _number )
#define test_finish()
#define vlog_perf(_number, _higherBetter, _numType, _format, ...) printf("Performance Number " _format " (in %s, %s): %g\n",##__VA_ARGS__, _numType, \
_higherBetter?"higher is better":"lower is better" , _number)
#ifdef _WIN32
#ifdef __MINGW32__
// Use __mingw_printf since it supports "%a" format specifier
#define vlog __mingw_printf
#define vlog_error __mingw_printf
#else
// Use home-baked function that treats "%a" as "%f"
// Forward declaration only; the definition appears further down in this header.
static int vlog_win32(const char *format, ...);
#define vlog vlog_win32
#define vlog_error vlog_win32
#endif
#else
#define vlog_error printf
#define vlog printf
#endif
#endif
// Compile-time assertion: declares an int array named after the current line whose size is 1
// when b is true and -1 (a compile error) when b is false.
// The two extra macro levels force __LINE__ to be expanded before it is pasted into the name.
// The expansion already ends in ';', and two assertions on the same line would clash.
#define ct_assert(b) ct_assert_i(b, __LINE__)
#define ct_assert_i(b, line) ct_assert_ii(b, line)
#define ct_assert_ii(b, line) int _compile_time_assertion_on_line_##line[b ? 1 : -1];
// Error-checking helpers for code that returns an error value from the enclosing function.
// All of them expand to a bare { ... } block (not do/while), evaluate errCode more than once,
// and report through log_error with the failing file and line.
// test_error: if errCode != CL_SUCCESS, print and return errCode from the calling function.
#define test_error(errCode,msg) test_error_ret(errCode,msg,errCode)
// Same, but returns retValue instead of the error code.
#define test_error_ret(errCode,msg,retValue) { if( errCode != CL_SUCCESS ) { print_error( errCode, msg ); return retValue ; } }
// Logs only; the expansion already ends in ';'.
#define print_error(errCode,msg) log_error( "ERROR: %s! (%s from %s:%d)\n", msg, IGetErrorString( errCode ), __FILE__, __LINE__ );
// expected error code vs. what we got
// test_failure_error: on mismatch, print and return (errCode != expectedErrCode), i.e. non-zero.
#define test_failure_error(errCode, expectedErrCode, msg) test_failure_error_ret(errCode, expectedErrCode, msg, errCode != expectedErrCode)
#define test_failure_error_ret(errCode, expectedErrCode, msg, retValue) { if( errCode != expectedErrCode ) { print_failure_error( errCode, expectedErrCode, msg ); return retValue ; } }
#define print_failure_error(errCode, expectedErrCode, msg) log_error( "ERROR: %s! (Got %s, expected %s from %s:%d)\n", msg, IGetErrorString( errCode ), IGetErrorString( expectedErrCode ), __FILE__, __LINE__ );
// Warning variants: never return. retValue is accepted but unused, and the expansion increments
// a variable named `warnings` that must be visible at the point of use.
#define test_failure_warning(errCode, expectedErrCode, msg) test_failure_warning_ret(errCode, expectedErrCode, msg, errCode != expectedErrCode)
#define test_failure_warning_ret(errCode, expectedErrCode, msg, retValue) { if( errCode != expectedErrCode ) { print_failure_warning( errCode, expectedErrCode, msg ); warnings++ ; } }
#define print_failure_warning(errCode, expectedErrCode, msg) log_error( "WARNING: %s! (Got %s, expected %s from %s:%d)\n", msg, IGetErrorString( errCode ), IGetErrorString( expectedErrCode ), __FILE__, __LINE__ );
// C++ only: evaluates expr exactly once and, if the result is not CL_SUCCESS, throws
// std::runtime_error carrying "ERROR: <msg>=<error string> at <file>:<line>".
// msg is streamed with operator<<, so anything streamable is accepted.
// NOTE(review): std::runtime_error is declared in <stdexcept>, which this header does not
// include directly -- presumably it arrives via <sstream>; confirm.
#define ASSERT_SUCCESS(expr, msg) \
do \
{ \
cl_int _temp_retval = (expr); \
if (_temp_retval != CL_SUCCESS) \
{ \
std::stringstream ss; \
ss << "ERROR: " << msg << "=" << IGetErrorString(_temp_retval) \
<< " at " << __FILE__ << ":" << __LINE__ << "\n"; \
throw std::runtime_error(ss.str()); \
} \
} while (0)
extern const char *IGetErrorString( int clErrorCode );
extern float Ulp_Error_Half( cl_ushort test, float reference );
extern float Ulp_Error( float test, double reference );
extern float Ulp_Error_Double( double test, long double reference );
extern const char *GetChannelTypeName( cl_channel_type type );
extern int IsChannelTypeSupported( cl_channel_type type );
extern const char *GetChannelOrderName( cl_channel_order order );
extern int IsChannelOrderSupported( cl_channel_order order );
extern const char *GetAddressModeName( cl_addressing_mode mode );
extern const char *GetDeviceTypeName( cl_device_type type );
// NON-REENTRANT UNLESS YOU PROVIDE A BUFFER PTR (pass null to use static storage, but it's not reentrant then!)
extern const char *GetDataVectorString( void *dataBuffer, size_t typeSize, size_t vecSize, char *buffer );
#if defined (_WIN32) && !defined(__MINGW32__)
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
// printf-style logger for C runtimes that do not understand the "%a" (hex float) conversion.
// Every conversion specification in `format` whose conversion character is 'a' is printed
// with 'f' instead; flags, width, precision and the h/l/L length modifiers are preserved.
// A literal "%%" is left alone, so text such as "100%%a" is no longer corrupted.
// Returns 0 on success and -1 if the patched copy of the format cannot be allocated.
static int vlog_win32(const char *format, ...)
{
    const char *effective = format; // format actually handed to vprintf
    char *patched = NULL;           // heap copy, created only when a rewrite is needed
    size_t length = strlen(format);
    size_t i;
    va_list args;

    for (i = 0; i < length; i++) {
        size_t j;
        if (format[i] != '%')
            continue;
        if (format[i + 1] == '%') {
            // escaped percent sign: skip both characters
            i++;
            continue;
        }
        // walk over flags, width, precision and length modifiers to the conversion character
        j = i + 1;
        while (format[j] != '\0' && strchr("-+ #0123456789.*hlL", format[j]) != NULL)
            j++;
        if (format[j] == 'a') {
            if (patched == NULL) {
                patched = (char *)malloc(length + 1);
                if (patched == NULL) {
                    printf("vlog_win32: Failed to allocate memory for strdup\n");
                    return -1;
                }
                memcpy(patched, format, length + 1);
                effective = patched;
            }
            // replace %a with %f
            patched[j] = 'f';
        }
        i = j; // resume scanning after this conversion specification
    }

    va_start(args, format);
    vprintf(effective, args);
    va_end(args);
    free(patched); // free(NULL) is a no-op when nothing was rewritten
    return 0;
}
#endif
#ifdef __cplusplus
}
#endif
#endif // _errorHelpers_h

View File

@@ -1,104 +0,0 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef _fpcontrol_h
#define _fpcontrol_h
// In order to get tests for correctly rounded operations (e.g. multiply) to work properly we need to be able to set the reference hardware
// to FTZ mode if the device hardware is running in that mode. We have explored all other options short of writing correctly rounded operations
// in integer code, and have found this is the only way to correctly verify operation.
//
// Non-Apple implementations will need to provide their own implementation for these features. If the reference hardware and device are both
// running in the same state (either FTZ or IEEE compliant modes) then these functions may be empty. If the device is running in non-default
// rounding mode (e.g. round toward zero), then these functions should also set the reference device into that rounding mode.
#if defined( __APPLE__ ) || defined( _MSC_VER ) || defined( __linux__ ) || defined (__MINGW32__)
// Snapshot of the floating-point control register, as saved by ForceFTZ / DisableFTZ and
// consumed by RestoreFPState.
typedef int FPU_mode_type;
// The _mm_getcsr/_mm_setcsr code paths below are selected for _MSC_VER and __MINGW32__ as well
// as for __i386__/__x86_64__, so the intrinsics header is pulled in under the same condition
// (MSVC does not define the GCC-style architecture macros).
#if defined( __i386__ ) || defined( __x86_64__ ) || defined( _MSC_VER ) || defined (__MINGW32__)
#include <xmmintrin.h>
#elif defined( __PPC__ )
#include <fpu_control.h>
extern __thread fpu_control_t fpu_control;
#endif
// Set the reference hardware floating point unit to FTZ mode.
// The previous control-register value is stored in *mode so it can be restored later.
static inline void ForceFTZ( FPU_mode_type *mode )
{
#if defined( __i386__ ) || defined( __x86_64__ ) || defined( _MSC_VER ) || defined (__MINGW32__)
    // 0x8040 = MXCSR flush-to-zero (bit 15) | denormals-are-zero (bit 6)
    *mode = _mm_getcsr();
    _mm_setcsr( *mode | 0x8040);
#elif defined( __PPC__ )
    *mode = fpu_control;
    fpu_control |= _FPU_MASK_NI;
#elif defined ( __arm__ )
    // bit 24 of FPSCR is set to enable flushing
    unsigned fpscr;
    __asm__ volatile ("fmrx %0, fpscr" : "=r"(fpscr));
    *mode = fpscr;
    __asm__ volatile ("fmxr fpscr, %0" :: "r"(fpscr | (1U << 24)));
    // Add 64 bit support
#elif defined (__aarch64__)
    // same bit, in FPCR
    unsigned fpscr;
    __asm__ volatile ("mrs %0, fpcr" : "=r"(fpscr));
    *mode = fpscr;
    __asm__ volatile ("msr fpcr, %0" :: "r"(fpscr | (1U << 24)));
#else
#error ForceFTZ needs an implentation
#endif
}
// Disable the denorm flush to zero.
// Mirror image of ForceFTZ: saves the previous state in *mode and clears the same bits.
static inline void DisableFTZ( FPU_mode_type *mode )
{
#if defined( __i386__ ) || defined( __x86_64__ ) || defined( _MSC_VER ) || defined (__MINGW32__)
    *mode = _mm_getcsr();
    _mm_setcsr( *mode & ~0x8040);
#elif defined( __PPC__ )
    *mode = fpu_control;
    fpu_control &= ~_FPU_MASK_NI;
#elif defined ( __arm__ )
    unsigned fpscr;
    __asm__ volatile ("fmrx %0, fpscr" : "=r"(fpscr));
    *mode = fpscr;
    __asm__ volatile ("fmxr fpscr, %0" :: "r"(fpscr & ~(1U << 24)));
    // Add 64 bit support
#elif defined (__aarch64__)
    unsigned fpscr;
    __asm__ volatile ("mrs %0, fpcr" : "=r"(fpscr));
    *mode = fpscr;
    __asm__ volatile ("msr fpcr, %0" :: "r"(fpscr & ~(1U << 24)));
#else
#error DisableFTZ needs an implentation
#endif
}
// Restore the reference hardware to floating point state indicated by *mode
// (a value previously produced by ForceFTZ or DisableFTZ).
static inline void RestoreFPState( FPU_mode_type *mode )
{
#if defined( __i386__ ) || defined( __x86_64__ ) || defined( _MSC_VER ) || defined (__MINGW32__)
    _mm_setcsr( *mode );
#elif defined( __PPC__)
    fpu_control = *mode;
#elif defined (__arm__)
    __asm__ volatile ("fmxr fpscr, %0" :: "r"(*mode));
    // Add 64 bit support
#elif defined (__aarch64__)
    __asm__ volatile ("msr fpcr, %0" :: "r"(*mode));
#else
#error RestoreFPState needs an implementation
#endif
}
#else
#error ForceFTZ and RestoreFPState need implentations
#endif
#endif

View File

@@ -1,773 +0,0 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "compat.h"
#if defined ( _MSC_VER )
#include <limits.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <windows.h>
#if ! defined( __INTEL_COMPILER )
///////////////////////////////////////////////////////////////////
//
// rint, rintf
//
///////////////////////////////////////////////////////////////////
// Returns a float with the magnitude of x and the sign of y.
float copysignf( float x, float y )
{
    union { cl_uint bits; float value; } mag, sgn;
    mag.value = x;
    sgn.value = y;
    // keep the low 31 bits of x, then graft on bit 31 (the sign) of y
    mag.bits &= 0x7fffffffU;
    mag.bits |= sgn.bits & 0x80000000U;
    return mag.value;
}
// Returns a double with the magnitude of x and the sign of y.
double copysign( double x, double y )
{
    union { cl_ulong bits; double value; } mag, sgn;
    mag.value = x;
    sgn.value = y;
    // keep the low 63 bits of x, then graft on bit 63 (the sign) of y
    mag.bits &= 0x7fffffffffffffffULL;
    mag.bits |= sgn.bits & 0x8000000000000000ULL;
    return mag.value;
}
long double copysignl( long double x, long double y )
{
union
{
long double f;
struct{ cl_ulong m; cl_ushort sexp; }u;
}ux, uy;
ux.f = x;
uy.f = y;
ux.u.sexp = (ux.u.sexp & 0x7fff) | (uy.u.sexp & 0x8000);
return ux.f;
}
// Rounds x to an integral value using the current rounding mode.
// Adding and then subtracting 2^23 (with the sign of x) pushes the fraction bits out of the
// 24-bit significand, so the hardware performs the rounding.
float rintf(float x)
{
    const float two23 = 8388608.0f; /* 0x1.0p23f */
    float bias;
    float shifted;

    // |x| >= 2^23 is already integral; NaN also falls through unchanged
    if( !( fabsf(x) < two23 ) )
        return x;

    bias = copysignf( two23, x );
    shifted = x + bias;
    shifted -= bias;
    // copy the sign back so that values rounding to zero keep the sign of x
    return copysignf( shifted, x );
}
// Rounds x to an integral value using the current rounding mode.
// Same add/subtract trick as the float version, with 2^52 for the 53-bit significand.
double rint(double x)
{
    const double two52 = 4503599627370496.0; /* 0x1.0p52 */
    double bias;
    double shifted;

    // |x| >= 2^52 is already integral; NaN also falls through unchanged
    if( !( fabs(x) < two52 ) )
        return x;

    bias = copysign( two52, x );
    shifted = x + bias;
    shifted -= bias;
    // copy the sign back so that values rounding to zero keep the sign of x
    return copysign( shifted, x );
}
// Rounds x to an integral value using the current rounding mode.
// Uses the add/subtract trick with 2^(p-1), where p is the significand width of long double;
// a constant that is too large for the format would round to a multiple of a power of two
// instead of to an integer.
long double rintl(long double x)
{
#if LDBL_MANT_DIG == 53
    const long double limit = 4503599627370496.0L;                     /* 2^52: long double == double */
#elif LDBL_MANT_DIG == 113
    const long double limit = 5192296858534827628530496329220096.0L;   /* 2^112: IEEE binary128 */
#else
    const long double limit = 9223372036854775808.0L;                  /* 2^63: x87 extended */
#endif
    // computed in long double so no precision is lost; NaN stays NaN and skips the block
    long double absx = x < 0.0L ? -x : x;
    if( absx < limit )
    {
        long double magic = copysignl( limit, x );
        long double rounded = x + magic;
        rounded -= magic;
        // copy the sign back so that values rounding to zero keep the sign of x
        x = copysignl( rounded, x );
    }
    return x;
}
///////////////////////////////////////////////////////////////////
//
// ilogb, ilogbf, ilogbl
//
///////////////////////////////////////////////////////////////////
// Sentinel results of the ilogb family for zero and NaN inputs; both default to INT_MIN
// unless the toolchain's headers already define them.
#ifndef FP_ILOGB0
#define FP_ILOGB0 INT_MIN
#endif
#ifndef FP_ILOGBNAN
#define FP_ILOGBNAN INT_MIN
#endif
int ilogb (double x)
{
union{ double f; cl_ulong u;} u;
u.f = x;
cl_ulong absx = u.u & CL_LONG_MAX;
if( absx - 0x0001000000000000ULL >= 0x7ff0000000000000ULL - 0x0001000000000000ULL)
{
switch( absx )
{
case 0:
return FP_ILOGB0;
case 0x7ff0000000000000ULL:
return INT_MAX;
default:
if( absx > 0x7ff0000000000000ULL )
return FP_ILOGBNAN;
// subnormal
u.u = absx | 0x3ff0000000000000ULL;
u.f -= 1.0;
return (u.u >> 52) - (1023 + 1022);
}
}
return (absx >> 52) - 1023;
}
// Returns the unbiased binary exponent of x as an int.
// Zero yields FP_ILOGB0, infinity INT_MAX, NaN FP_ILOGBNAN; subnormals are normalised first.
int ilogbf (float x)
{
    union{ float f; cl_uint u;} pun;
    cl_uint magnitude;
    pun.f = x;
    magnitude = pun.u & 0x7fffffff; // drop the sign bit

    // normal numbers: one unsigned comparison covers both ends of the range
    if( magnitude - 0x00800000U < 0x7f800000U - 0x00800000U )
        return (magnitude >> 23) - 127;

    if( magnitude == 0 )
        return FP_ILOGB0;
    if( magnitude == 0x7f800000U )
        return INT_MAX;
    if( magnitude > 0x7f800000 )
        return FP_ILOGBNAN;

    // subnormal: form 1.fraction, subtract 1.0f to normalise, read the exponent back
    pun.u = magnitude | 0x3f800000U;
    pun.f -= 1.0f;
    return (pun.u >> 23) - (127 + 126);
}
// Returns the unbiased binary exponent of x as an int.
// Zero yields FP_ILOGB0, infinity INT_MAX, NaN FP_ILOGBNAN.
// Implemented with frexpl so it does not depend on the in-memory layout of long double
// (which is the same as double on some compilers and 80-bit extended on others).
int ilogbl (long double x)
{
    int e = 0;
    if( x != x )                          // NaN is the only value unequal to itself
        return FP_ILOGBNAN;
    if( x == 0.0L )
        return FP_ILOGB0;
    if( x > LDBL_MAX || x < -LDBL_MAX )   // +/- infinity
        return INT_MAX;
    // frexpl yields x = m * 2^e with 0.5 <= |m| < 1, so the exponent of 1.f * 2^k is e - 1
    (void) frexpl( x, &e );
    return e - 1;
}
///////////////////////////////////////////////////////////////////
//
// fmax, fmin, fmaxf, fminf
//
///////////////////////////////////////////////////////////////////
// Stores the raw bit pattern of the float fx into *ux.
static void GET_BITS_SP32(float fx, unsigned int* ux)
{
    volatile union { float as_float; unsigned int as_bits; } pun;
    pun.as_float = fx;
    *ux = pun.as_bits;
}
// Stores into *fx the float whose raw bit pattern is ux.
static void PUT_BITS_SP32(unsigned int ux, float* fx)
{
    volatile union { float as_float; unsigned int as_bits; } pun;
    pun.as_bits = ux;
    *fx = pun.as_float;
}
// Stores the raw bit pattern of the double dx into *lx.
static void GET_BITS_DP64(double dx, unsigned __int64* lx)
{
    volatile union { double as_double; unsigned __int64 as_bits; } pun;
    pun.as_double = dx;
    *lx = pun.as_bits;
}
// Stores into *dx the double whose raw bit pattern is lx.
static void PUT_BITS_DP64(unsigned __int64 lx, double* dx)
{
    volatile union { double as_double; unsigned __int64 as_bits; } pun;
    pun.as_bits = lx;
    *dx = pun.as_double;
}
#if 0
int SIGNBIT_DP64(double x )
{
int hx;
_GET_HIGH_WORD(hx,x);
return((hx>>31));
}
#endif
/* fmax(x, y) returns the larger (more positive) of x and y.
NaNs are treated as missing values: if one argument is NaN,
the other argument is returned. If both arguments are NaN,
the first argument is returned. */
/* This works so long as the compiler knows that (x != x) means
that x is NaN; gcc does. */
// Larger of x and y, treating NaN as a missing value.
double fmax(double x, double y)
{
    // y missing: x is the answer (also covers "both NaN -> first argument")
    if( isnan(y) )
        return x;
    // if x is NaN the comparison is false and y is returned
    if( x >= y )
        return x;
    return y;
}
/* fmin(x, y) returns the smaller (more negative) of x and y.
NaNs are treated as missing values: if one argument is NaN,
the other argument is returned. If both arguments are NaN,
the first argument is returned. */
// Smaller of x and y, treating NaN as a missing value.
double fmin(double x, double y)
{
    // y missing: x is the answer (also covers "both NaN -> first argument")
    if( isnan(y) )
        return x;
    // if x is NaN the comparison is false and y is returned
    if( x <= y )
        return x;
    return y;
}
// Larger of x and y, treating NaN as a missing value.
float fmaxf( float x, float y )
{
    // y missing: x is the answer (also covers "both NaN -> first argument")
    if( isnan(y) )
        return x;
    // if x is NaN the comparison is false and y is returned
    if( x >= y )
        return x;
    return y;
}
/* fminf(x, y) returns the smaller (more negative) of x and y.
NaNs are treated as missing values: if one argument is NaN,
the other argument is returned. If both arguments are NaN,
the first argument is returned. */
// Smaller of x and y, treating NaN as a missing value.
float fminf(float x, float y)
{
    // y missing: x is the answer (also covers "both NaN -> first argument")
    if( isnan(y) )
        return x;
    // if x is NaN the comparison is false and y is returned
    if( x <= y )
        return x;
    return y;
}
// Returns x * 2^n.
// Delegates to ldexpl, which handles overflow (signed infinity), underflow, zero, infinity and
// NaN for whatever representation long double has. The previous hand-built scale factors
// assumed the 80-bit x87 layout and returned +INFINITY even for negative x.
long double scalblnl(long double x, long n)
{
    int e;
    // ldexpl takes an int exponent; saturate, since anything that far out over/underflows anyway
    if( n > (long) INT_MAX )
        e = INT_MAX;
    else if( n < (long) INT_MIN )
        e = INT_MIN;
    else
        e = (int) n;
    return ldexpl( x, e );
}
///////////////////////////////////////////////////////////////////
//
// log2
//
///////////////////////////////////////////////////////////////////
// log2(e) and log2(10). Neither constant is referenced in the visible part of this file
// (log2/log2l below spell the value out inline) -- possibly dead; confirm before removing.
const static cl_double log_e_base2 = 1.4426950408889634074;
const static cl_double log_10_base2 = 3.3219280948873623478;
//double log10(double x);
double log2(double x)
{
return 1.44269504088896340735992468100189214 * log(x);
}
long double log2l(long double x)
{
return 1.44269504088896340735992468100189214L * log(x);
}
// Rounds x toward zero to an integral value.
double trunc(double x)
{
    // |x| >= 2^52 is already integral; NaN also takes this exit
    if( !( fabs(x) < 4503599627370496.0 /* 0x1.0p52 */ ) )
        return x;

    // the integer conversion drops the fraction; copysign puts the sign back so that
    // values in (-1, 0) come out as -0.0
    cl_long whole = x;
    return copysign( (double) whole, x );
}
// Rounds x toward zero to an integral value.
float truncf(float x)
{
    // |x| >= 2^23 is already integral; NaN also takes this exit
    if( !( fabsf(x) < 8388608.0f /* 0x1.0p23f */ ) )
        return x;

    // the integer conversion drops the fraction; copysignf puts the sign back so that
    // values in (-1, 0) come out as -0.0f
    cl_int whole = x;
    return copysignf( (float) whole, x );
}
long lround(double x)
{
double absx = fabs(x);
if( absx < 0.5 )
return 0;
if( absx < 4503599627370496.0 /* 0x1.0p52 */)
{
absx += 0.5;
cl_long rounded = absx;
absx = rounded;
x = copysign( absx, x );
}
if( x >= (double) LONG_MAX )
return LONG_MAX;
return (long) x;
}
long lroundf(float x)
{
float absx = fabsf(x);
if( absx < 0.5f )
return 0;
if( absx < 8388608.0f )
{
absx += 0.5f;
cl_int rounded = absx;
absx = rounded;
x = copysignf( absx, x );
}
if( x >= (float) LONG_MAX )
return LONG_MAX;
return (long) x;
}
// Rounds x to the nearest integral value, halfway cases away from zero.
double round(double x)
{
    double magnitude = fabs(x);

    // anything below one half rounds to a zero carrying the sign of x
    if( magnitude < 0.5 )
        return copysign( 0.0, x);
    // |x| >= 2^52 is already integral; NaN also takes this exit
    if( !( magnitude < 4503599627370496.0 /* 0x1.0p52 */ ) )
        return x;

    // add one half, truncate through an integer, then restore the sign
    magnitude += 0.5;
    cl_long whole = magnitude;
    magnitude = whole;
    return copysign( magnitude, x );
}
// Rounds x to the nearest integral value, halfway cases away from zero.
float roundf(float x)
{
    float magnitude = fabsf(x);

    // anything below one half rounds to a zero carrying the sign of x
    if( magnitude < 0.5f )
        return copysignf( 0.0f, x);
    // |x| >= 2^23 is already integral; NaN also takes this exit
    if( !( magnitude < 8388608.0f ) )
        return x;

    // add one half, truncate through an integer, then restore the sign
    magnitude += 0.5f;
    cl_int whole = magnitude;
    magnitude = whole;
    return copysignf( magnitude, x );
}
long double roundl(long double x)
{
long double absx = fabsl(x);
if( absx < 0.5L )
return copysignl( 0.0L, x);
if( absx < 9223372036854775808.0L /*0x1.0p63L*/ )
{
absx += 0.5L;
cl_ulong rounded = absx;
absx = rounded;
x = copysignl( absx, x );
}
return x;
}
float cbrtf( float x )
{
float z = pow( fabs((double) x), 1.0 / 3.0 );
return copysignf( z, x );
}
double cbrt( double x )
{
return copysign( pow( fabs( x ), 1.0 / 3.0 ), x );
}
// Rounds x to an integer using the current rounding mode and returns it as a long.
// Out-of-range values saturate to LONG_MAX / LONG_MIN (the lower clamp was missing, so large
// negative inputs reached an undefined float-to-integer conversion).
long int lrint (double x)
{
    double absx = fabs(x);
    if( x >= (double) LONG_MAX )
        return LONG_MAX;
    if( x <= (double) LONG_MIN )
        return LONG_MIN;
    if( absx < 4503599627370496.0 /* 0x1.0p52 */ )
    {
        // adding and subtracting 2^52 (signed like x) makes the hardware round to an integer
        double magic = copysign( 4503599627370496.0 /* 0x1.0p52 */, x );
        double rounded = x + magic;
        rounded -= magic;
        return (long int) rounded;
    }
    return (long int) x;
}
// Rounds x to an integer using the current rounding mode and returns it as a long.
// Out-of-range values saturate to LONG_MAX / LONG_MIN (the lower clamp was missing, so large
// negative inputs reached an undefined float-to-integer conversion).
long int lrintf (float x)
{
    float absx = fabsf(x);
    if( x >= (float) LONG_MAX )
        return LONG_MAX;
    if( x <= (float) LONG_MIN )
        return LONG_MIN;
    if( absx < 8388608.0f /* 0x1.0p23f */ )
    {
        // adding and subtracting 2^23 (signed like x) makes the hardware round to an integer
        float magic = copysignf( 8388608.0f /* 0x1.0p23f */, x );
        float rounded = x + magic;
        rounded -= magic;
        return (long int) rounded;
    }
    return (long int) x;
}
///////////////////////////////////////////////////////////////////
//
// fenv functions
//
///////////////////////////////////////////////////////////////////
// Minimal <fenv.h> replacements built on the MSVC CRT status-word functions.
// Compiled only for _MSC_VER < 1900.
#if _MSC_VER < 1900
// Returns the subset of `excepts` (FE_* flags) that is currently raised, by translating the
// _SW_* bits reported by _statusfp() into the FE_* constants.
int fetestexcept(int excepts)
{
unsigned int status = _statusfp();
return excepts & (
((status & _SW_INEXACT) ? FE_INEXACT : 0) |
((status & _SW_UNDERFLOW) ? FE_UNDERFLOW : 0) |
((status & _SW_OVERFLOW) ? FE_OVERFLOW : 0) |
((status & _SW_ZERODIVIDE) ? FE_DIVBYZERO : 0) |
((status & _SW_INVALID) ? FE_INVALID : 0)
);
}
// Clears the floating-point status via _clearfp() and always returns 0.
// NOTE(review): `excepts` is ignored, so every flag is cleared even when the caller asked
// for a subset.
int feclearexcept(int excepts)
{
_clearfp();
return 0;
}
#endif
#endif // __INTEL_COMPILER
#if ! defined( __INTEL_COMPILER ) || __INTEL_COMPILER < 1300
// Returns a positive quiet NaN (bit pattern 0x7fc00000).
float make_nan(void)
{
    /* This is the IEEE 754 single-precision format:
    unsigned int mantissa: 22;
    unsigned int quiet_nan: 1;
    unsigned int exponent: 8;
    unsigned int negative: 1;
    */
    // Reinterpret through a union: reading an int32_t object through a float* violates the
    // strict-aliasing rule.
    union { int32_t bits; float value; } pun;
    pun.bits = 0x7fc00000;
    return pun.value;
}
float nanf( const char* str)
{
cl_uint u = atoi( str );
u |= 0x7fc00000U;
return *( float*)(&u);
}
double nan( const char* str)
{
cl_ulong u = atoi( str );
u |= 0x7ff8000000000000ULL;
return *( double*)(&u);
}
// double check this implementatation
long double nanl( const char* str)
{
union
{
long double f;
struct { cl_ulong m; cl_ushort sexp; }u;
}u;
u.u.sexp = 0x7fff;
u.u.m = 0x8000000000000000ULL | atoi( str );
return u.f;
}
#endif
///////////////////////////////////////////////////////////////////
//
// misc functions
//
///////////////////////////////////////////////////////////////////
/*
// This function is commented out because the Windows implementation should never call munmap.
// If it is calling it, we have a bug. Please file a bugzilla.
int munmap(void *addr, size_t len)
{
// FIXME: this is not correct. munmap is like free() http://www.opengroup.org/onlinepubs/7990989775/xsh/munmap.html
return (int)VirtualAlloc( (LPVOID)addr, len,
MEM_COMMIT|MEM_RESERVE, PAGE_NOACCESS );
}
*/
uint64_t ReadTime( void )
{
LARGE_INTEGER current;
QueryPerformanceCounter(&current);
return (uint64_t)current.QuadPart;
}
// Converts the difference between two ReadTime() values to a double by dividing by the
// performance-counter frequency and scaling by 1e9.
// The frequency is queried on first use and cached in a function-local static.
double SubtractTime( uint64_t endTime, uint64_t startTime )
{
    static double ticksPerSecond = 0.0;
    double elapsedTicks;

    if( ticksPerSecond == 0.0 )
    {
        LARGE_INTEGER freq;
        QueryPerformanceFrequency( &freq );
        ticksPerSecond = (double) freq.QuadPart;
    }

    elapsedTicks = (double)( endTime - startTime );
    return elapsedTicks / ticksPerSecond * 1e9;
}
// Returns 1 when the sign bit of x is set (including -0.0 and negative NaN), otherwise 0.
int cf_signbit(double x)
{
    union { double value; cl_ulong bits; } pun;
    pun.value = x;
    // bit 63 is the sign
    return pun.bits >> 63;
}
// Returns 1 when the sign bit of x is set (including -0.0f and negative NaN), otherwise 0.
int cf_signbitf(float x)
{
    union { float value; cl_uint bits; } pun;
    pun.value = x;
    // bit 31 is the sign
    return pun.bits >> 31;
}
// Reinterprets the 32 bits of ix as a float (no numeric conversion).
float int2float (int32_t ix)
{
    union { int32_t as_int; float as_float; } pun;
    pun.as_int = ix;
    return pun.as_float;
}
// Reinterprets the 32 bits of fx as a signed integer (no numeric conversion).
int32_t float2int (float fx)
{
    union { int32_t as_int; float as_float; } pun;
    pun.as_float = fx;
    return pun.as_int;
}
// Replacement for GCC's __builtin_clz on MSVC: number of leading zero bits in a 32-bit value.
// Two implementations are selected on _WIN64; both return 32 for an input of 0, even though
// the contract below leaves that case undefined.
#if !defined(_WIN64)
/** Returns the number of leading 0-bits in x,
starting at the most significant bit position.
If x is 0, the result is undefined.
*/
int __builtin_clz(unsigned int pattern)
{
#if 0
int res;
__asm {
mov eax, pattern
bsr eax, eax
mov res, eax
}
return 31 - res;
#endif
// _BitScanReverse stores the index of the highest set bit and returns 0 when no bit is set.
// NOTE(review): <intrin.h> is only included further down in this file, so the declaration
// presumably arrives through <windows.h> -- confirm.
unsigned long index;
unsigned char res = _BitScanReverse( &index, pattern);
if (res) {
return 8*sizeof(int) - 1 - index;
} else {
return 8*sizeof(int);
}
}
#else
// Portable version used when _WIN64 is defined: binary search for the highest set bit,
// halving the window (16, 8, 4, 2, 1 bits) at each step.
int __builtin_clz(unsigned int pattern)
{
int count;
if (pattern == 0u) {
return 32;
}
count = 31;
if (pattern >= 1u<<16) { pattern >>= 16; count -= 16; }
if (pattern >= 1u<<8) { pattern >>= 8; count -= 8; }
if (pattern >= 1u<<4) { pattern >>= 4; count -= 4; }
if (pattern >= 1u<<2) { pattern >>= 2; count -= 2; }
if (pattern >= 1u<<1) { count -= 1; }
return count;
}
#endif // !defined(_WIN64)
#include <intrin.h>
#include <emmintrin.h>
// POSIX-style usleep on top of the Win32 Sleep() call.
// Sleep() works in milliseconds, so the microsecond count is rounded up. Always returns 0.
int usleep(int usec)
{
    const int millis = (usec + 999) / 1000;
    Sleep( millis );
    return 0;
}
// POSIX-style sleep on top of the Win32 Sleep() call (seconds converted to milliseconds).
// Always returns 0.
unsigned int sleep( unsigned int sec )
{
    const unsigned int millis = sec * 1000;
    Sleep( millis );
    return 0;
}
#endif // defined( _MSC_VER )

View File

@@ -1,274 +0,0 @@
/*
A C-program for MT19937, with initialization improved 2002/1/26.
Coded by Takuji Nishimura and Makoto Matsumoto.
Before using, initialize the state by using init_genrand(seed)
or init_by_array(init_key, key_length).
Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. The names of its contributors may not be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Any feedback is very welcome.
http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
Modifications for use in OpenCL by Ian Ollmann, Apple Inc.
*/
#include <stdio.h>
#include <stdlib.h>
#include "mt19937.h"
#include "mingw_compat.h"
#ifdef __SSE2__
#include <emmintrin.h>
#endif
// Allocates `size` bytes aligned to `alignment` using the platform's aligned allocator.
// Returns NULL on failure. The result must be released with align_free.
static void * align_malloc(size_t size, size_t alignment)
{
#if defined(_WIN32) && defined(_MSC_VER)
    return _aligned_malloc(size, alignment);
#elif defined(__linux__) || defined (linux) || defined(__APPLE__)
    void * block = NULL;
    // posix_memalign reports success with 0; anything else is mapped to NULL
    return ( posix_memalign(&block, alignment, size) == 0 ) ? block : NULL;
#elif defined(__MINGW32__)
    return __mingw_aligned_malloc(size, alignment);
#else
#error "Please add support OS for aligned malloc"
#endif
}
// Releases memory obtained from align_malloc with the matching platform deallocator.
static void align_free(void * ptr)
{
    // Plain calls, not `return f(ptr);` -- C does not allow a return statement with an
    // expression in a function returning void.
#if defined(_WIN32) && defined(_MSC_VER)
    _aligned_free(ptr);
#elif defined(__linux__) || defined (linux) || defined(__APPLE__)
    free(ptr);
#elif defined(__MINGW32__)
    __mingw_aligned_free(ptr);
#else
#error "Please add support OS for aligned free"
#endif
}
/* Period parameters */
/* N is the number of 32-bit words of generator state; M is the offset used by the state
   update (see the "kk + 4 <= N-M" loop bound in genrand_int32). */
#define N 624 /* vector code requires multiple of 4 here */
#define M 397
/* NOTE: the three constants below expand to an unparenthesised cast expression. */
#define MATRIX_A (cl_uint) 0x9908b0dfUL /* constant vector a */
#define UPPER_MASK (cl_uint) 0x80000000UL /* most significant w-r bits */
#define LOWER_MASK (cl_uint) 0x7fffffffUL /* least significant r bits */
/*
 * State of one Mersenne Twister generator. Instances are created by
 * init_genrand(), which requests 16-byte alignment for the allocation.
 */
typedef struct _MTdata
{
/* The N-word state vector. genrand_int32() accesses mt + kk with aligned
   128-bit loads/stores in its SSE2 path. */
cl_uint mt[N];
#ifdef __SSE2__
/* Tempered copies of mt[], filled in bulk each time the state is regenerated
   and handed out one word per genrand_int32() call. */
cl_uint cache[N];
#endif
/* Index of the next word to return; a value of N makes genrand_int32()
   regenerate the state first. */
cl_int mti;
}_MTdata;
/*
 * Allocates a generator and initializes mt[N] with a seed.
 * Returns NULL if the state could not be allocated. The index is left at N
 * so the first draw regenerates the state.
 */
MTdata init_genrand(cl_uint s)
{
    MTdata state = (MTdata) align_malloc( sizeof( _MTdata ), 16 );
    cl_uint *words;
    cl_uint prev;
    int idx;

    if( NULL == state )
    {
        return NULL;
    }

    words = state->mt;
    words[0] = s;
    for( idx = 1; idx < N; idx++ )
    {
        /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
        /* In the previous versions, MSBs of the seed affect */
        /* only MSBs of the array mt[]. */
        /* 2002/01/09 modified by Makoto Matsumoto */
        prev = words[idx - 1];
        words[idx] = (cl_uint)
            ( 1812433253UL * ( prev ^ ( prev >> 30 ) ) + idx );
    }

    /* idx == N here */
    state->mti = idx;
    return state;
}
/* Releases a generator created by init_genrand(); a NULL argument is ignored. */
void free_mtdata( MTdata d )
{
    if( NULL == d )
    {
        return;
    }

    align_free( d );
}
/*
 * generates a random number on [0,0xffffffff]-interval
 *
 * d is dereferenced unconditionally, so it must not be NULL. When d->mti has
 * reached N, all N state words are regenerated before a value is returned.
 * With __SSE2__ defined, regeneration and tempering run four words at a time
 * and results are served from d->cache; otherwise each word is tempered at
 * the moment it is returned.
 */
cl_uint genrand_int32( MTdata d)
{
/* mag01[x] = x * MATRIX_A for x=0,1 */
static const cl_uint mag01[2]={0x0UL, MATRIX_A};
#ifdef __SSE2__
/* Lazily initialized vector constants (each scalar constant replicated
   into four lanes). */
/* NOTE(review): init and these unions are function-static and therefore
   shared by every caller; volatile does not synchronize the lazy
   initialization between threads -- confirm whether this function can be
   entered concurrently. */
static volatile int init = 0;
static union{ __m128i v; cl_uint s[4]; } upper_mask, lower_mask, one, matrix_a, c0, c1;
#endif
cl_uint *mt = d->mt;
cl_uint y;
if (d->mti == N)
{ /* generate N words at one time */
int kk;
#ifdef __SSE2__
if( 0 == init )
{
upper_mask.s[0] = upper_mask.s[1] = upper_mask.s[2] = upper_mask.s[3] = UPPER_MASK;
lower_mask.s[0] = lower_mask.s[1] = lower_mask.s[2] = lower_mask.s[3] = LOWER_MASK;
one.s[0] = one.s[1] = one.s[2] = one.s[3] = 1;
matrix_a.s[0] = matrix_a.s[1] = matrix_a.s[2] = matrix_a.s[3] = MATRIX_A;
c0.s[0] = c0.s[1] = c0.s[2] = c0.s[3] = (cl_uint) 0x9d2c5680UL;
c1.s[0] = c1.s[1] = c1.s[2] = c1.s[3] = (cl_uint) 0xefc60000UL;
init = 1;
}
#endif
/* Part 1: words [0, N-M). The partner word mt[kk+M] lies inside the array,
   so no index wrap-around is needed. */
kk = 0;
#ifdef __SSE2__
// vector loop
/* Four words per iteration. mt + kk uses the aligned load/store forms; the
   +1 and +M operands use unaligned loads. */
for( ; kk + 4 <= N-M; kk += 4 )
{
__m128i vy = _mm_or_si128( _mm_and_si128( _mm_load_si128( (__m128i*)(mt + kk) ), upper_mask.v ),
_mm_and_si128( _mm_loadu_si128( (__m128i*)(mt + kk + 1) ), lower_mask.v )); // ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK))
__m128i mask = _mm_cmpeq_epi32( _mm_and_si128( vy, one.v), one.v ); // y & 1 ? -1 : 0
__m128i vmag01 = _mm_and_si128( mask, matrix_a.v ); // y & 1 ? MATRIX_A, 0 = mag01[y & (cl_uint) 0x1UL]
__m128i vr = _mm_xor_si128( _mm_loadu_si128( (__m128i*)(mt + kk + M)), (__m128i) _mm_srli_epi32( vy, 1 ) ); // mt[kk+M] ^ (y >> 1)
vr = _mm_xor_si128( vr, vmag01 ); // mt[kk+M] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL]
_mm_store_si128( (__m128i*) (mt + kk ), vr );
}
#endif
/* Scalar loop: the whole of part 1 without SSE2, or the words left over
   after the vector loop. */
for ( ;kk<N-M;kk++) {
y = (cl_uint) ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK));
mt[kk] = mt[kk+M] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL];
}
/* Part 2: words [N-M, N-1). The partner index wraps to mt[kk+(M-N)], a word
   already updated earlier in this pass. */
#ifdef __SSE2__
// advance to next aligned location
/* Scalar steps until kk is a multiple of 4 again, so the aligned forms can
   be used on mt + kk below. */
for (;kk<N-1 && (kk & 3);kk++) {
y = (cl_uint) ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK));
mt[kk] = mt[kk+(M-N)] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL];
}
// vector loop
for( ; kk + 4 <= N-1; kk += 4 )
{
__m128i vy = _mm_or_si128( _mm_and_si128( _mm_load_si128( (__m128i*)(mt + kk) ), upper_mask.v ),
_mm_and_si128( _mm_loadu_si128( (__m128i*)(mt + kk + 1) ), lower_mask.v )); // ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK))
__m128i mask = _mm_cmpeq_epi32( _mm_and_si128( vy, one.v), one.v ); // y & 1 ? -1 : 0
__m128i vmag01 = _mm_and_si128( mask, matrix_a.v ); // y & 1 ? MATRIX_A, 0 = mag01[y & (cl_uint) 0x1UL]
__m128i vr = _mm_xor_si128( _mm_loadu_si128( (__m128i*)(mt + kk + M - N)), _mm_srli_epi32( vy, 1 ) ); // mt[kk+M-N] ^ (y >> 1)
vr = _mm_xor_si128( vr, vmag01 ); // mt[kk+M-N] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL]
_mm_store_si128( (__m128i*) (mt + kk ), vr );
}
#endif
/* Scalar loop: the whole of part 2 without SSE2, or the words left over
   after the vector loop. */
for (;kk<N-1;kk++) {
y = (cl_uint) ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK));
mt[kk] = mt[kk+(M-N)] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL];
}
/* Last word: its successor wraps around to mt[0]. */
y = (cl_uint)((mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK));
mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL];
#ifdef __SSE2__
// Do the tempering ahead of time in vector code
/* Same four tempering steps as the scalar path at the end of this function,
   applied to every word and stored into d->cache. */
for( kk = 0; kk + 4 <= N; kk += 4 )
{
__m128i vy = _mm_load_si128( (__m128i*)(mt + kk ) ); // y = mt[k];
vy = _mm_xor_si128( vy, _mm_srli_epi32( vy, 11 ) ); // y ^= (y >> 11);
vy = _mm_xor_si128( vy, _mm_and_si128( _mm_slli_epi32( vy, 7 ), c0.v) ); // y ^= (y << 7) & (cl_uint) 0x9d2c5680UL;
vy = _mm_xor_si128( vy, _mm_and_si128( _mm_slli_epi32( vy, 15 ), c1.v) ); // y ^= (y << 15) & (cl_uint) 0xefc60000UL;
vy = _mm_xor_si128( vy, _mm_srli_epi32( vy, 18 ) ); // y ^= (y >> 18);
_mm_store_si128( (__m128i*)(d->cache+kk), vy );
}
#endif
/* Restart consumption at the first word of the fresh state. */
d->mti = 0;
}
#ifdef __SSE2__
/* Already tempered during regeneration. */
y = d->cache[d->mti++];
#else
y = mt[d->mti++];
/* Tempering */
y ^= (y >> 11);
y ^= (y << 7) & (cl_uint) 0x9d2c5680UL;
y ^= (y << 15) & (cl_uint) 0xefc60000UL;
y ^= (y >> 18);
#endif
return y;
}
/*
 * generates a random number on [0,0xffffffffffffffff]-interval by
 * concatenating two consecutive 32-bit draws from d.
 *
 * The first draw becomes the high word and the second the low word. The two
 * draws are made in separate statements because the operands of `|` may be
 * evaluated in either order, which would otherwise let the compiler decide
 * which draw lands in which half.
 */
cl_ulong genrand_int64( MTdata d)
{
    const cl_ulong hi = genrand_int32( d );
    const cl_ulong lo = genrand_int32( d );

    return ( hi << 32 ) | lo;
}
/* generates a random number on [0,1]-real-interval */
double genrand_real1(MTdata d)
{
    const cl_uint raw = genrand_int32( d );

    /* divided by 2^32-1 */
    return raw * ( 1.0 / 4294967295.0 );
}
/* generates a random number on [0,1)-real-interval */
double genrand_real2(MTdata d)
{
    const cl_uint raw = genrand_int32( d );

    /* divided by 2^32 */
    return raw * ( 1.0 / 4294967296.0 );
}
/* generates a random number on (0,1)-real-interval */
double genrand_real3(MTdata d)
{
    const double raw = (double) genrand_int32( d );

    /* shifted by half a step, then divided by 2^32 */
    return ( raw + 0.5 ) * ( 1.0 / 4294967296.0 );
}
/* generates a random number on [0,1) with 53-bit resolution*/
double genrand_res53(MTdata d)
{
    /* first draw supplies the upper 27 bits, second draw the lower 26 bits */
    const unsigned long hi27 = genrand_int32( d ) >> 5;
    const unsigned long lo26 = genrand_int32( d ) >> 6;

    /* hi27 * 2^26 + lo26, scaled by 2^-53 */
    return ( hi27 * 67108864.0 + lo26 ) * ( 1.0 / 9007199254740992.0 );
}

View File

@@ -1,175 +0,0 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "rounding_mode.h"
#if !(defined(_WIN32) && defined(_MSC_VER))
// Installs the host rounding mode that corresponds to r and returns the mode
// that was in effect before the call. The translation table depends on the
// output type: for kfloat/kdouble the default mode maps to round-to-nearest,
// for every other type it maps to round-toward-zero.
// Aborts if the previous host mode is not one of the four <fenv.h> modes.
RoundingMode set_round( RoundingMode r, Type outType )
{
    static const int floatModes[ kRoundingModeCount ] = { FE_TONEAREST, FE_TONEAREST, FE_UPWARD, FE_DOWNWARD, FE_TOWARDZERO };
    static const int integerModes[ kRoundingModeCount ] = { FE_TOWARDZERO, FE_TONEAREST, FE_UPWARD, FE_DOWNWARD, FE_TOWARDZERO };
    const int *table = ( outType == kfloat || outType == kdouble ) ? floatModes : integerModes;
    const int previous = fegetround();

    fesetround( table[r] );

    if( previous == FE_TONEAREST )
    {
        return kRoundToNearestEven;
    }
    if( previous == FE_UPWARD )
    {
        return kRoundUp;
    }
    if( previous == FE_DOWNWARD )
    {
        return kRoundDown;
    }
    if( previous == FE_TOWARDZERO )
    {
        return kRoundTowardZero;
    }

    abort(); // ??!
    return kDefaultRoundingMode; //never happens
}
// Reports the current host rounding mode; returns kDefaultRoundingMode when
// fegetround() yields a value outside the four modes handled here.
RoundingMode get_round( void )
{
    const int current = fegetround();

    return ( current == FE_TONEAREST )  ? kRoundToNearestEven :
           ( current == FE_UPWARD )     ? kRoundUp :
           ( current == FE_DOWNWARD )   ? kRoundDown :
           ( current == FE_TOWARDZERO ) ? kRoundTowardZero :
                                          kDefaultRoundingMode;
}
#else
// MSVC variant: installs the rounding mode that corresponds to r through
// _controlfp_s and returns the mode that was previously active. The table
// used depends on whether outType is kfloat/kdouble or an integer type.
// If the current control word cannot be read, an error is logged, nothing is
// changed and kDefaultRoundingMode is returned.
RoundingMode set_round( RoundingMode r, Type outType )
{
    static const int floatModes[ kRoundingModeCount ] = { _RC_NEAR, _RC_NEAR, _RC_UP, _RC_DOWN, _RC_CHOP };
    static const int integerModes[ kRoundingModeCount ] = { _RC_CHOP, _RC_NEAR, _RC_UP, _RC_DOWN, _RC_CHOP };
    const int *table = integerModes;
    unsigned int controlWord;
    RoundingMode previous;

    if( outType == kfloat || outType == kdouble )
    {
        table = floatModes;
    }

    // a zero mask only reads the control word
    if( _controlfp_s( &controlWord, 0, 0 ) )
    {
        vlog_error("\t\tERROR: -- cannot get rounding mode in %s:%d\n", __FILE__, __LINE__);
        return kDefaultRoundingMode; //what else never happens
    }

    switch( controlWord & _MCW_RC )
    {
        case _RC_NEAR:
            previous = kRoundToNearestEven;
            break;
        case _RC_UP:
            previous = kRoundUp;
            break;
        case _RC_DOWN:
            previous = kRoundDown;
            break;
        case _RC_CHOP:
            previous = kRoundTowardZero;
            break;
        default:
            previous = kDefaultRoundingMode;
            break;
    }

    _controlfp_s( &controlWord, table[r], _MCW_RC ); //setting new rounding mode
    return previous; //returning old rounding mode
}
// MSVC variant: reports the current host rounding mode.
// If the control word cannot be read, an error is logged and
// kDefaultRoundingMode is returned, matching the handling in set_round().
RoundingMode get_round( void )
{
    unsigned int oldRound = 0;
    int err = _controlfp_s(&oldRound, 0, 0); //get rounding mode into oldRound
    if (err) {
        // do not interpret oldRound when the query itself failed
        vlog_error("\t\tERROR: -- cannot get rounding mode in %s:%d\n", __FILE__, __LINE__);
        return kDefaultRoundingMode;
    }
    oldRound &= _MCW_RC;
    return
        (oldRound == _RC_NEAR)? kRoundToNearestEven :
        (oldRound == _RC_UP)? kRoundUp :
        (oldRound == _RC_DOWN)? kRoundDown :
        (oldRound == _RC_CHOP)? kRoundTowardZero:
        kDefaultRoundingMode;
}
#endif
//
// FlushToZero() sets the host processor into ftz mode. It is intended to have a remote effect on the behavior of the code in
// basic_test_conversions.c. Some host processors may not support this mode, in which case you'll need to do some clamping in
// software by testing against FLT_MIN or DBL_MIN in that file.
//
// Note: IEEE-754 says conversions are basic operations. As such they do *NOT* have the behavior in section 7.5.3 of
// the OpenCL spec. They *ALWAYS* flush to zero for subnormal inputs or outputs when FTZ mode is on like other basic
// operators do (e.g. add, subtract, multiply, divide, etc.)
//
// Configuring hardware to FTZ mode varies by platform.
// CAUTION: Some C implementations may also fail to behave properly in this mode.
//
// On PowerPC, it is done by setting the FPSCR into non-IEEE mode.
// On Intel, you can do this by turning on the FZ and DAZ bits in the MXCSR -- provided that SSE/SSE2
// is used for floating point computation! If your OS uses x87, you'll need to figure out how
// to turn that off for the conversions code in basic_test_conversions.c so that they flush to
// zero properly. Otherwise, you'll need to add appropriate software clamping to basic_test_conversions.c
// in which case, these functions are at liberty to do nothing.
//
#if defined( __i386__ ) || defined( __x86_64__ ) || defined (_WIN32)
#include <xmmintrin.h>
#elif defined( __PPC__ )
#include <fpu_control.h>
#endif
// Puts the host floating-point unit into flush-to-zero mode where the
// platform branch below implements it. The return value is an opaque token
// to hand to UnFlushToZero(): on x86 it carries the previous MXCSR value, on
// the other branches it is NULL.
void *FlushToZero( void )
{
#if defined( __APPLE__ ) || defined(__linux__) || defined (_WIN32)
    #if defined( __i386__ ) || defined( __x86_64__ ) || defined(_MSC_VER)
        // remember the MXCSR as it was, then switch on the FZ and DAZ bits
        union{ int i; void *p; } saved = { _mm_getcsr() };
        const int fzAndDazBits = 0x8040;

        _mm_setcsr( saved.i | fzAndDazBits );
        return saved.p;
    #elif defined( __arm__ ) || defined(__aarch64__)
        // processor is already in FTZ mode -- do nothing
        return NULL;
    #elif defined( __PPC__ )
        // set the non-IEEE bit in the FPSCR
        fpu_control_t controlWord = 0;

        _FPU_GETCW(controlWord);
        controlWord |= _FPU_MASK_NI;
        _FPU_SETCW(controlWord);
        return NULL;
    #else
        #error Unknown arch
    #endif
#else
    #error Please configure FlushToZero and UnFlushToZero to behave properly on this operating system.
#endif
}
// Undo the effects of FlushToZero above, restoring the host to default behavior, using the information passed in p.
// Reverses FlushToZero(). p is the token FlushToZero() returned: on x86 the
// MXCSR value stored in it is written back; on PowerPC the non-IEEE bit is
// cleared and p is not consulted; on ARM nothing is done.
void UnFlushToZero( void *p)
{
#if defined( __APPLE__ ) || defined(__linux__) || defined (_WIN32)
    #if defined( __i386__ ) || defined( __x86_64__ ) || defined(_MSC_VER)
        // unpack the integer that FlushToZero() packed into the pointer
        union{ void *p; int i; } token = { p };

        _mm_setcsr( token.i );
    #elif defined( __arm__ ) || defined(__aarch64__)
        // processor is already in FTZ mode -- do nothing
    #elif defined( __PPC__)
        fpu_control_t controlWord = 0;

        _FPU_GETCW(controlWord);
        controlWord &= ~_FPU_MASK_NI;
        _FPU_SETCW(controlWord);
    #else
        #error Unknown arch
    #endif
#else
    #error Please configure FlushToZero and UnFlushToZero to behave properly on this operating system.
#endif
}

View File

@@ -1,71 +0,0 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef __ROUNDING_MODE_H__
#define __ROUNDING_MODE_H__
#include "compat.h"
#include <stdlib.h>
#if (defined(_WIN32) && defined (_MSC_VER))
#include "errorHelpers.h"
#include "testHarness.h"
#endif
/*
 * Rounding modes understood by set_round()/get_round(). The numeric values
 * index the translation tables in set_round(), so the order is significant.
 */
typedef enum
{
    kDefaultRoundingMode = 0,
    kRoundToNearestEven  = 1,
    kRoundUp             = 2,
    kRoundDown           = 3,
    kRoundTowardZero     = 4,

    /* number of modes above; keep as the final enumerator */
    kRoundingModeCount
} RoundingMode;
/*
 * Scalar element types. set_round() treats kfloat and kdouble as
 * floating-point outputs and every other value as an integer output.
 * Values are consecutive starting at 0.
 */
typedef enum
{
    kuchar = 0,
    kchar,
    kushort,
    kshort,
    kuint,
    kint,
    kfloat,
    kdouble,
    kulong,
    klong,

    //This goes last
    kTypeCount
} Type;
#ifdef __cplusplus
extern "C" {
#endif
// Installs the host rounding mode selected by r. The mapping used depends on
// whether outType is kfloat/kdouble or an integer type. Returns the mode that
// was in effect before the call.
extern RoundingMode set_round( RoundingMode r, Type outType );
// Returns the current host rounding mode, or kDefaultRoundingMode when the
// host reports a mode that is not recognised.
extern RoundingMode get_round( void );
// Puts the host into flush-to-zero mode where the platform implements it and
// returns an opaque token to be passed to UnFlushToZero().
extern void *FlushToZero( void );
// Undoes FlushToZero() using the token p that FlushToZero() returned.
extern void UnFlushToZero( void *p);
#ifdef __cplusplus
}
#endif
#endif /* __ROUNDING_MODE_H__ */

View File

@@ -1,106 +0,0 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "threadTesting.h"
#include "errorHelpers.h"
#include <stdio.h>
#include <stdlib.h>
#if !defined(_WIN32)
#include <stdbool.h>
#endif
#include <math.h>
#include <string.h>
#if !defined(_WIN32)
#include <pthread.h>
#endif
#if 0 // Disabled for now
// Argument bundle passed by pointer to the thread started in
// test_threaded_function() and unpacked in test_thread_wrapper().
// NOTE(review): both of those functions also access an mDeviceGroup member
// that is not declared here; this only goes unnoticed because the enclosing
// #if 0 keeps the code from being compiled.
typedef struct
{
// test function to run
basefn mFunction;
// device forwarded to the test function
cl_device_id mDevice;
// caller's context; stored by test_threaded_function() but not read by
// test_thread_wrapper(), which creates its own
cl_context mContext;
// element count forwarded to the test function
int mNumElements;
} TestFnArgs;
////////////////////////////////////////////////////////////////////////////////
// Thread-based testing. Spawns a new thread to run the given test function,
// then waits for it to complete. The entire idea is that, if the thread crashes,
// we can catch it and report it as a failure instead of crashing the entire suite
////////////////////////////////////////////////////////////////////////////////
// Thread entry point: unpacks the TestFnArgs behind data, creates a context,
// runs the test function with it, releases the context and returns the
// test's int result encoded as a pointer ((void *)(-1) if no context could
// be created).
void *test_thread_wrapper( void *data )
{
TestFnArgs *args;
int retVal;
cl_context context;
args = (TestFnArgs *)data;
/* Create a new context to use (contexts can't cross threads) */
// NOTE(review): this two-argument call and args->mDeviceGroup do not match
// the TestFnArgs declaration in this file; presumably written against an
// older API -- confirm the intended clCreateContext arguments before
// re-enabling.
context = clCreateContext(NULL, args->mDeviceGroup);
if( context == NULL )
{
log_error("clCreateContext failed for new thread\n");
return (void *)(-1);
}
/* Call function */
// NOTE(review): four arguments starting with a device group; verify against
// the current basefn signature.
retVal = args->mFunction( args->mDeviceGroup, args->mDevice, context, args->mNumElements );
clReleaseContext( context );
// NOTE(review): int is converted straight to a pointer here, while
// test_threaded_function() decodes the value through intptr_t.
return (void *)retVal;
}
// Runs fnToTest on a freshly created thread and waits for it to finish.
// Returns the value produced by test_thread_wrapper() converted back to int,
// or -1 if the thread could not be created or joined.
// NOTE(review): the queue parameter is not used in this body.
int test_threaded_function( basefn fnToTest, cl_device_id device, cl_context context, cl_command_queue queue, int numElements )
{
int error;
pthread_t threadHdl;
void *retVal;
TestFnArgs args;
args.mFunction = fnToTest;
// NOTE(review): deviceGroup is neither a parameter nor a local of this
// function, and TestFnArgs as declared in this file has no mDeviceGroup
// member.
args.mDeviceGroup = deviceGroup;
args.mDevice = device;
args.mContext = context;
args.mNumElements = numElements;
// args lives on this stack frame; the pthread_join below keeps the frame
// alive until the thread has finished with it
error = pthread_create( &threadHdl, NULL, test_thread_wrapper, (void *)&args );
if( error != 0 )
{
log_error( "ERROR: Unable to create thread for testing!\n" );
return -1;
}
/* Thread has been started, now just wait for it to complete (or crash) */
error = pthread_join( threadHdl, &retVal );
if( error != 0 )
{
log_error( "ERROR: Unable to join testing thread!\n" );
return -1;
}
return (int)((intptr_t)retVal);
}
#endif

View File

@@ -22,13 +22,13 @@ set(${MODULE_NAME}_SOURCES
test_kernel_arg_info.c
test_queue_properties.cpp
../../test_common/harness/errorHelpers.c
../../test_common/harness/threadTesting.c
../../../../test_common/harness/threadTesting.c
../../test_common/harness/testHarness.c
../../test_common/harness/kernelHelpers.c
../../../../test_common/harness/typeWrappers.cpp
../../../../test_common/harness/conversions.c
../../test_common/harness/mt19937.c
../../test_common/harness/msvc9.c
../../../../test_common/harness/mt19937.c
../../../../test_common/harness/msvc9.c
../../test_common/harness/imageHelpers.cpp
)

View File

@@ -51,15 +51,15 @@ set(${MODULE_NAME}_SOURCES
test_kernel_call_kernel_function.cpp
test_local_kernel_scope.cpp
../../test_common/harness/errorHelpers.c
../../test_common/harness/threadTesting.c
../../../../test_common/harness/threadTesting.c
../../test_common/harness/testHarness.c
../../test_common/harness/kernelHelpers.c
../../../../test_common/harness/typeWrappers.cpp
../../test_common/harness/imageHelpers.cpp
../../test_common/harness/mt19937.c
../../../../test_common/harness/mt19937.c
../../../../test_common/harness/conversions.c
../../test_common/harness/rounding_mode.c
../../test_common/harness/msvc9.c
../../../../test_common/harness/rounding_mode.c
../../../../test_common/harness/msvc9.c
)
include(../../../CMakeCommon.txt)