mirror of
https://github.com/KhronosGroup/OpenCL-CTS.git
synced 2026-03-19 06:09:01 +00:00
Remove almost duplicate compatibility common code
Use the non-compatibility version. In each case the diff was minimal, didn't have modifications that would invalidate compatibility testing and it was clear that the "latest/best" version was not the one in the compatibility copy. Signed-off-by: Kevin Petit <kevin.petit@arm.com>
This commit is contained in:
@@ -446,6 +446,7 @@ void ThreadPool_Init(void)
|
||||
// Check for manual override of multithreading code. We add this for better debuggability.
|
||||
if( getenv( "CL_TEST_SINGLE_THREADED" ) )
|
||||
{
|
||||
log_error("ERROR: CL_TEST_SINGLE_THREADED is set in the environment. Running single threaded.\n*** TEST IS INVALID! ***\n");
|
||||
gThreadCount = 1;
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1,899 +0,0 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#include "ThreadPool.h"
|
||||
#include "errorHelpers.h"
|
||||
#include "fpcontrol.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#if defined( __APPLE__ ) || defined( __linux__ ) || defined( _WIN32 ) // or any other POSIX system
|
||||
|
||||
#if defined( _WIN32 )
|
||||
#include <windows.h>
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
#include "mingw_compat.h"
|
||||
#include <process.h>
|
||||
#else // !_WIN32
|
||||
#include <pthread.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/errno.h>
|
||||
#endif // !_WIN32
|
||||
|
||||
// declarations
|
||||
#ifdef _WIN32
|
||||
void ThreadPool_WorkerFunc( void *p );
|
||||
#else
|
||||
void *ThreadPool_WorkerFunc( void *p );
|
||||
#endif
|
||||
void ThreadPool_Init(void);
|
||||
void ThreadPool_Exit(void);
|
||||
|
||||
#if defined (__MINGW32__)
|
||||
// Mutex for implementing super heavy atomic operations if you don't have GCC or MSVC
|
||||
CRITICAL_SECTION gAtomicLock;
|
||||
#elif defined( __GNUC__ ) || defined( _MSC_VER)
|
||||
#else
|
||||
pthread_mutex_t gAtomicLock;
|
||||
#endif
|
||||
|
||||
// Atomic add operator with mem barrier. Mem barrier needed to protect state modified by the worker functions.
|
||||
cl_int ThreadPool_AtomicAdd( volatile cl_int *a, cl_int b )
|
||||
{
|
||||
#if defined (__MINGW32__)
|
||||
// No atomics on Mingw32
|
||||
EnterCriticalSection(&gAtomicLock);
|
||||
cl_int old = *a;
|
||||
*a = old + b;
|
||||
LeaveCriticalSection(&gAtomicLock);
|
||||
return old;
|
||||
#elif defined( __GNUC__ )
|
||||
// GCC extension: http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
|
||||
return __sync_fetch_and_add( a, b );
|
||||
// do we need __sync_synchronize() here, too? GCC docs are unclear whether __sync_fetch_and_add does a synchronize
|
||||
#elif defined( _MSC_VER )
|
||||
return (cl_int) _InterlockedExchangeAdd( (volatile LONG*) a, (LONG) b );
|
||||
#else
|
||||
#warning Please add a atomic add implementation here, with memory barrier. Fallback code is slow.
|
||||
if( pthread_mutex_lock(&gAtomicLock) )
|
||||
log_error( "Atomic operation failed. pthread_mutex_lock(&gAtomicLock) returned an error\n");
|
||||
cl_int old = *a;
|
||||
*a = old + b;
|
||||
if( pthread_mutex_unlock(&gAtomicLock) )
|
||||
log_error( "Failed to release gAtomicLock. Further atomic operations may deadlock!\n");
|
||||
return old;
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined( _WIN32 )
|
||||
// Uncomment the following line if Windows XP support is not required.
|
||||
// #define HAS_INIT_ONCE_EXECUTE_ONCE 1
|
||||
|
||||
#if defined(HAS_INIT_ONCE_EXECUTE_ONCE)
|
||||
#define _INIT_ONCE INIT_ONCE
|
||||
#define _PINIT_ONCE PINIT_ONCE
|
||||
#define _InitOnceExecuteOnce InitOnceExecuteOnce
|
||||
#else // !HAS_INIT_ONCE_EXECUTE_ONCE
|
||||
|
||||
typedef volatile LONG _INIT_ONCE;
|
||||
typedef _INIT_ONCE *_PINIT_ONCE;
|
||||
typedef BOOL (CALLBACK *_PINIT_ONCE_FN)(_PINIT_ONCE, PVOID, PVOID *);
|
||||
|
||||
#define _INIT_ONCE_UNINITIALIZED 0
|
||||
#define _INIT_ONCE_IN_PROGRESS 1
|
||||
#define _INIT_ONCE_DONE 2
|
||||
|
||||
static BOOL _InitOnceExecuteOnce(
|
||||
_PINIT_ONCE InitOnce,
|
||||
_PINIT_ONCE_FN InitFn,
|
||||
PVOID Parameter,
|
||||
LPVOID *Context
|
||||
)
|
||||
{
|
||||
while ( *InitOnce != _INIT_ONCE_DONE )
|
||||
{
|
||||
if (*InitOnce != _INIT_ONCE_IN_PROGRESS && _InterlockedCompareExchange( InitOnce, _INIT_ONCE_IN_PROGRESS, _INIT_ONCE_UNINITIALIZED ) == _INIT_ONCE_UNINITIALIZED )
|
||||
{
|
||||
InitFn( InitOnce, Parameter, Context );
|
||||
*InitOnce = _INIT_ONCE_DONE;
|
||||
return TRUE;
|
||||
}
|
||||
Sleep( 1 );
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
#endif // !HAS_INIT_ONCE_EXECUTE_ONCE
|
||||
|
||||
// Uncomment the following line if Windows XP support is not required.
|
||||
// #define HAS_CONDITION_VARIABLE 1
|
||||
|
||||
#if defined(HAS_CONDITION_VARIABLE)
|
||||
#define _CONDITION_VARIABLE CONDITION_VARIABLE
|
||||
#define _InitializeConditionVariable InitializeConditionVariable
|
||||
#define _SleepConditionVariableCS SleepConditionVariableCS
|
||||
#define _WakeAllConditionVariable WakeAllConditionVariable
|
||||
#else // !HAS_CONDITION_VARIABLE
|
||||
typedef struct
|
||||
{
|
||||
HANDLE mEvent; // Used to park the thread.
|
||||
CRITICAL_SECTION mLock[1]; // Used to protect mWaiters, mGeneration and mReleaseCount.
|
||||
volatile cl_int mWaiters; // Number of threads waiting on this cond var.
|
||||
volatile cl_int mGeneration; // Wait generation count.
|
||||
volatile cl_int mReleaseCount; // Number of releases to execute before reseting the event.
|
||||
} _CONDITION_VARIABLE;
|
||||
|
||||
typedef _CONDITION_VARIABLE *_PCONDITION_VARIABLE;
|
||||
|
||||
static void _InitializeConditionVariable( _PCONDITION_VARIABLE cond_var )
|
||||
{
|
||||
cond_var->mEvent = CreateEvent( NULL, TRUE, FALSE, NULL );
|
||||
InitializeCriticalSection( cond_var->mLock );
|
||||
cond_var->mWaiters = 0;
|
||||
cond_var->mGeneration = 0;
|
||||
#if !defined ( NDEBUG )
|
||||
cond_var->mReleaseCount = 0;
|
||||
#endif // !NDEBUG
|
||||
}
|
||||
|
||||
static void _SleepConditionVariableCS( _PCONDITION_VARIABLE cond_var, PCRITICAL_SECTION cond_lock, DWORD ignored)
|
||||
{
|
||||
EnterCriticalSection( cond_var->mLock );
|
||||
cl_int generation = cond_var->mGeneration;
|
||||
++cond_var->mWaiters;
|
||||
LeaveCriticalSection( cond_var->mLock );
|
||||
LeaveCriticalSection( cond_lock );
|
||||
|
||||
while ( TRUE )
|
||||
{
|
||||
WaitForSingleObject( cond_var->mEvent, INFINITE );
|
||||
EnterCriticalSection( cond_var->mLock );
|
||||
BOOL done = cond_var->mReleaseCount > 0 && cond_var->mGeneration != generation;
|
||||
LeaveCriticalSection( cond_var->mLock );
|
||||
if ( done )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
EnterCriticalSection( cond_lock );
|
||||
EnterCriticalSection( cond_var->mLock );
|
||||
if ( --cond_var->mReleaseCount == 0 )
|
||||
{
|
||||
ResetEvent( cond_var->mEvent );
|
||||
}
|
||||
--cond_var->mWaiters;
|
||||
LeaveCriticalSection( cond_var->mLock );
|
||||
}
|
||||
|
||||
static void _WakeAllConditionVariable( _PCONDITION_VARIABLE cond_var )
|
||||
{
|
||||
EnterCriticalSection( cond_var->mLock );
|
||||
if (cond_var->mWaiters > 0 )
|
||||
{
|
||||
++cond_var->mGeneration;
|
||||
cond_var->mReleaseCount = cond_var->mWaiters;
|
||||
SetEvent( cond_var->mEvent );
|
||||
}
|
||||
LeaveCriticalSection( cond_var->mLock );
|
||||
}
|
||||
#endif // !HAS_CONDITION_VARIABLE
|
||||
#endif // _WIN32
|
||||
|
||||
#define MAX_COUNT (1<<29)
|
||||
|
||||
// Global state to coordinate whether the threads have been launched successfully or not
|
||||
#if defined( _MSC_VER ) && (_WIN32_WINNT >= 0x600)
|
||||
static _INIT_ONCE threadpool_init_control;
|
||||
#elif defined (_WIN32) // MingW of XP
|
||||
static int threadpool_init_control;
|
||||
#else // Posix platforms
|
||||
pthread_once_t threadpool_init_control = PTHREAD_ONCE_INIT;
|
||||
#endif
|
||||
cl_int threadPoolInitErr = -1; // set to CL_SUCCESS on successful thread launch
|
||||
|
||||
// critical region lock around ThreadPool_Do. We can only run one ThreadPool_Do at a time,
|
||||
// because we are too lazy to set up a queue here, and don't expect to need one.
|
||||
#if defined( _WIN32 )
|
||||
CRITICAL_SECTION gThreadPoolLock[1];
|
||||
#else // !_WIN32
|
||||
pthread_mutex_t gThreadPoolLock;
|
||||
#endif // !_WIN32
|
||||
|
||||
// Condition variable to park ThreadPool threads when not working
|
||||
#if defined( _WIN32 )
|
||||
CRITICAL_SECTION cond_lock[1];
|
||||
_CONDITION_VARIABLE cond_var[1];
|
||||
#else // !_WIN32
|
||||
pthread_mutex_t cond_lock;
|
||||
pthread_cond_t cond_var;
|
||||
#endif // !_WIN32
|
||||
volatile cl_int gRunCount = 0; // Condition variable state. How many iterations on the function left to run.
|
||||
// set to CL_INT_MAX to cause worker threads to exit. Note: this value might go negative.
|
||||
|
||||
// State that only changes when the threadpool is not working.
|
||||
volatile TPFuncPtr gFunc_ptr = NULL;
|
||||
volatile void *gUserInfo = NULL;
|
||||
volatile cl_int gJobCount = 0;
|
||||
|
||||
// State that may change while the thread pool is working
|
||||
volatile cl_int jobError = CL_SUCCESS; // err code return for the job as a whole
|
||||
|
||||
// Condition variable to park caller while waiting
|
||||
#if defined( _WIN32 )
|
||||
HANDLE caller_event;
|
||||
#else // !_WIN32
|
||||
pthread_mutex_t caller_cond_lock;
|
||||
pthread_cond_t caller_cond_var;
|
||||
#endif // !_WIN32
|
||||
volatile cl_int gRunning = 0; // # of threads intended to be running. Running threads will decrement this as they discover they've run out of work to do.
|
||||
|
||||
// The total number of threads launched.
|
||||
volatile cl_int gThreadCount = 0;
|
||||
#ifdef _WIN32
|
||||
void ThreadPool_WorkerFunc( void *p )
|
||||
#else
|
||||
void *ThreadPool_WorkerFunc( void *p )
|
||||
#endif
|
||||
{
|
||||
cl_uint threadID = ThreadPool_AtomicAdd( (volatile cl_int *) p, 1 );
|
||||
cl_int item = ThreadPool_AtomicAdd( &gRunCount, -1 );
|
||||
// log_info( "ThreadPool_WorkerFunc start: gRunning = %d\n", gRunning );
|
||||
|
||||
while( MAX_COUNT > item )
|
||||
{
|
||||
cl_int err;
|
||||
|
||||
// check for more work to do
|
||||
if( 0 >= item )
|
||||
{
|
||||
// log_info( "Thread %d has run out of work.\n", threadID );
|
||||
|
||||
// No work to do. Attempt to block waiting for work
|
||||
#if defined( _WIN32 )
|
||||
EnterCriticalSection( cond_lock );
|
||||
#else // !_WIN32
|
||||
if((err = pthread_mutex_lock( &cond_lock) ))
|
||||
{
|
||||
log_error("Error %d from pthread_mutex_lock. Worker %d unable to block waiting for work. ThreadPool_WorkerFunc failed.\n", err, threadID );
|
||||
goto exit;
|
||||
}
|
||||
#endif // !_WIN32
|
||||
|
||||
cl_int remaining = ThreadPool_AtomicAdd( &gRunning, -1 );
|
||||
// log_info( "ThreadPool_WorkerFunc: gRunning = %d\n", remaining - 1 );
|
||||
if( 1 == remaining )
|
||||
{ // last thread out signal the main thread to wake up
|
||||
#if defined( _WIN32 )
|
||||
SetEvent( caller_event );
|
||||
#else // !_WIN32
|
||||
if((err = pthread_mutex_lock( &caller_cond_lock) ))
|
||||
{
|
||||
log_error("Error %d from pthread_mutex_lock. Unable to wake caller.\n", err );
|
||||
goto exit;
|
||||
}
|
||||
if( (err = pthread_cond_broadcast( &caller_cond_var )))
|
||||
{
|
||||
log_error("Error %d from pthread_cond_broadcast. Unable to wake up main thread. ThreadPool_WorkerFunc failed.\n", err );
|
||||
goto exit;
|
||||
}
|
||||
if((err = pthread_mutex_unlock( &caller_cond_lock) ))
|
||||
{
|
||||
log_error("Error %d from pthread_mutex_lock. Unable to wake caller.\n", err );
|
||||
goto exit;
|
||||
}
|
||||
#endif // !_WIN32
|
||||
}
|
||||
|
||||
// loop in case we are woken only to discover that some other thread already did all the work
|
||||
while( 0 >= item )
|
||||
{
|
||||
#if defined( _WIN32 )
|
||||
_SleepConditionVariableCS( cond_var, cond_lock, INFINITE );
|
||||
#else // !_WIN32
|
||||
if((err = pthread_cond_wait( &cond_var, &cond_lock) ))
|
||||
{
|
||||
log_error("Error %d from pthread_cond_wait. Unable to block for waiting for work. ThreadPool_WorkerFunc failed.\n", err );
|
||||
pthread_mutex_unlock( &cond_lock);
|
||||
goto exit;
|
||||
}
|
||||
#endif // !_WIN32
|
||||
|
||||
// try again to get a valid item id
|
||||
item = ThreadPool_AtomicAdd( &gRunCount, -1 );
|
||||
if( MAX_COUNT <= item ) // exit if we are done
|
||||
{
|
||||
#if defined( _WIN32 )
|
||||
LeaveCriticalSection( cond_lock );
|
||||
#else // !_WIN32
|
||||
pthread_mutex_unlock( &cond_lock);
|
||||
#endif // !_WIN32
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
ThreadPool_AtomicAdd( &gRunning, 1 );
|
||||
// log_info( "Thread %d has found work.\n", threadID);
|
||||
|
||||
#if defined( _WIN32 )
|
||||
LeaveCriticalSection( cond_lock );
|
||||
#else // !_WIN32
|
||||
if((err = pthread_mutex_unlock( &cond_lock) ))
|
||||
{
|
||||
log_error("Error %d from pthread_mutex_unlock. Unable to block for waiting for work. ThreadPool_WorkerFunc failed.\n", err );
|
||||
goto exit;
|
||||
}
|
||||
#endif // !_WIN32
|
||||
|
||||
}
|
||||
|
||||
// we have a valid item, so do the work
|
||||
if( CL_SUCCESS == jobError ) // but only if we haven't already encountered an error
|
||||
{
|
||||
// log_info( "Thread %d doing job %d\n", threadID, item - 1);
|
||||
|
||||
#if defined(__APPLE__) && defined(__arm__)
|
||||
// On most platforms which support denorm, default is FTZ off. However,
|
||||
// on some hardware where the reference is computed, default might be flush denorms to zero e.g. arm.
|
||||
// This creates issues in result verification. Since spec allows the implementation to either flush or
|
||||
// not flush denorms to zero, an implementation may choose not be flush i.e. return denorm result whereas
|
||||
// reference result may be zero (flushed denorm). Hence we need to disable denorm flushing on host side
|
||||
// where reference is being computed to make sure we get non-flushed reference result. If implementation
|
||||
// returns flushed result, we correctly take care of that in verification code.
|
||||
FPU_mode_type oldMode;
|
||||
DisableFTZ( &oldMode );
|
||||
#endif
|
||||
|
||||
// Call the user's function with this item ID
|
||||
err = gFunc_ptr( item - 1, threadID, (void*) gUserInfo );
|
||||
#if defined(__APPLE__) && defined(__arm__)
|
||||
// Restore FP state
|
||||
RestoreFPState( &oldMode );
|
||||
#endif
|
||||
|
||||
if( err )
|
||||
{
|
||||
#if (__MINGW32__)
|
||||
EnterCriticalSection(&gAtomicLock);
|
||||
if( jobError == CL_SUCCESS );
|
||||
jobError = err;
|
||||
gRunCount = 0;
|
||||
LeaveCriticalSection(&gAtomicLock);
|
||||
#elif defined( __GNUC__ )
|
||||
// GCC extension: http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
|
||||
// set the new error if we are the first one there.
|
||||
__sync_val_compare_and_swap( &jobError, CL_SUCCESS, err );
|
||||
|
||||
// drop run count to 0
|
||||
gRunCount = 0;
|
||||
__sync_synchronize();
|
||||
#elif defined( _MSC_VER )
|
||||
// set the new error if we are the first one there.
|
||||
_InterlockedCompareExchange( (volatile LONG*) &jobError, err, CL_SUCCESS );
|
||||
|
||||
// drop run count to 0
|
||||
gRunCount = 0;
|
||||
_mm_mfence();
|
||||
#else
|
||||
if( pthread_mutex_lock(&gAtomicLock) )
|
||||
log_error( "Atomic operation failed. pthread_mutex_lock(&gAtomicLock) returned an error\n");
|
||||
if( jobError == CL_SUCCESS );
|
||||
jobError = err;
|
||||
gRunCount = 0;
|
||||
if( pthread_mutex_unlock(&gAtomicLock) )
|
||||
log_error( "Failed to release gAtomicLock. Further atomic operations may deadlock\n");
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
// get the next item
|
||||
item = ThreadPool_AtomicAdd( &gRunCount, -1 );
|
||||
}
|
||||
|
||||
exit:
|
||||
log_info( "ThreadPool: thread %d exiting.\n", threadID );
|
||||
ThreadPool_AtomicAdd( &gThreadCount, -1 );
|
||||
#if !defined(_WIN32)
|
||||
return NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
// SetThreadCount() may be used to artifically set the number of worker threads
|
||||
// If the value is 0 (the default) the number of threads will be determined based on
|
||||
// the number of CPU cores. If it is a unicore machine, then 2 will be used, so
|
||||
// that we still get some testing for thread safety.
|
||||
//
|
||||
// If count < 2 or the CL_TEST_SINGLE_THREADED environment variable is set then the
|
||||
// code will run single threaded, but will report an error to indicate that the test
|
||||
// is invalid. This option is intended for debugging purposes only. It is suggested
|
||||
// as a convention that test apps set the thread count to 1 in response to the -m flag.
|
||||
//
|
||||
// SetThreadCount() must be called before the first call to GetThreadCount() or ThreadPool_Do(),
|
||||
// otherwise the behavior is indefined.
|
||||
void SetThreadCount( int count )
|
||||
{
|
||||
if( threadPoolInitErr == CL_SUCCESS )
|
||||
{
|
||||
log_error( "Error: It is illegal to set the thread count after the first call to ThreadPool_Do or GetThreadCount\n" );
|
||||
abort();
|
||||
}
|
||||
|
||||
gThreadCount = count;
|
||||
}
|
||||
|
||||
void ThreadPool_Init(void)
|
||||
{
|
||||
cl_int i;
|
||||
int err;
|
||||
volatile cl_uint threadID = 0;
|
||||
|
||||
// Check for manual override of multithreading code. We add this for better debuggability.
|
||||
if( getenv( "CL_TEST_SINGLE_THREADED" ) )
|
||||
{
|
||||
log_error("ERROR: CL_TEST_SINGLE_THREADED is set in the environment. Running single threaded.\n*** TEST IS INVALID! ***\n");
|
||||
gThreadCount = 1;
|
||||
return;
|
||||
}
|
||||
|
||||
// Figure out how many threads to run -- check first for non-zero to give the implementation the chance
|
||||
if( 0 == gThreadCount )
|
||||
{
|
||||
#if defined(_MSC_VER) || defined (__MINGW64__)
|
||||
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL;
|
||||
DWORD length = 0;
|
||||
|
||||
GetLogicalProcessorInformation( NULL, &length );
|
||||
buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION) malloc( length );
|
||||
if( buffer != NULL && GetLogicalProcessorInformation( buffer, &length ) == TRUE )
|
||||
{
|
||||
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
|
||||
while( ptr < &buffer[ length / sizeof( SYSTEM_LOGICAL_PROCESSOR_INFORMATION ) ] )
|
||||
{
|
||||
if( ptr->Relationship == RelationProcessorCore )
|
||||
{
|
||||
// Count the number of bits in ProcessorMask (number of logical cores)
|
||||
ULONG mask = ptr->ProcessorMask;
|
||||
while( mask )
|
||||
{
|
||||
++gThreadCount;
|
||||
mask &= mask - 1; // Remove 1 bit at a time
|
||||
}
|
||||
}
|
||||
++ptr;
|
||||
}
|
||||
free(buffer);
|
||||
}
|
||||
#elif defined (__MINGW32__)
|
||||
{
|
||||
#warning How about this, instead of hard coding it to 2?
|
||||
SYSTEM_INFO sysinfo;
|
||||
GetSystemInfo( &sysinfo );
|
||||
gThreadCount = sysinfo.dwNumberOfProcessors;
|
||||
}
|
||||
#else // !_WIN32
|
||||
gThreadCount = (cl_int) sysconf(_SC_NPROCESSORS_CONF); // Hopefully your system returns logical cpus here, as does MacOS X
|
||||
#endif // !_WIN32
|
||||
|
||||
// Multithreaded tests are required to run multithreaded even on unicore systems so as to test thread safety
|
||||
if( 1 == gThreadCount )
|
||||
gThreadCount = 2;
|
||||
}
|
||||
|
||||
//Allow the app to set thread count to <0 for debugging purposes. This will cause the test to run single threaded.
|
||||
if( gThreadCount < 2 )
|
||||
{
|
||||
log_error( "ERROR: Running single threaded because thread count < 2. \n*** TEST IS INVALID! ***\n");
|
||||
gThreadCount = 1;
|
||||
return;
|
||||
}
|
||||
|
||||
#if defined( _WIN32 )
|
||||
InitializeCriticalSection( gThreadPoolLock );
|
||||
InitializeCriticalSection( cond_lock );
|
||||
_InitializeConditionVariable( cond_var );
|
||||
caller_event = CreateEvent( NULL, FALSE, FALSE, NULL );
|
||||
#elif defined (__GNUC__)
|
||||
// Dont rely on PTHREAD_MUTEX_INITIALIZER for intialization of a mutex since it might cause problem
|
||||
// with some flavors of gcc compilers.
|
||||
pthread_cond_init(&cond_var, NULL);
|
||||
pthread_mutex_init(&cond_lock ,NULL);
|
||||
pthread_cond_init(&caller_cond_var, NULL);
|
||||
pthread_mutex_init(&caller_cond_lock, NULL);
|
||||
pthread_mutex_init(&gThreadPoolLock, NULL);
|
||||
#endif
|
||||
|
||||
#if !(defined(__GNUC__) || defined(_MSC_VER) || defined(__MINGW32__))
|
||||
pthread_mutex_initialize(gAtomicLock);
|
||||
#elif defined (__MINGW32__)
|
||||
InitializeCriticalSection(&gAtomicLock);
|
||||
#endif
|
||||
// Make sure the last thread done in the work pool doesn't signal us to wake before we get to the point where we are supposed to wait
|
||||
// That would cause a deadlock.
|
||||
#if !defined( _WIN32 )
|
||||
if((err = pthread_mutex_lock( &caller_cond_lock) ))
|
||||
{
|
||||
log_error("Error %d from pthread_mutex_lock. Unable to block for work to finish. ThreadPool_Init failed.\n", err );
|
||||
gThreadCount = 1;
|
||||
return;
|
||||
}
|
||||
#endif // !_WIN32
|
||||
|
||||
gRunning = gThreadCount;
|
||||
// init threads
|
||||
for( i = 0; i < gThreadCount; i++ )
|
||||
{
|
||||
#if defined( _WIN32 )
|
||||
uintptr_t handle = _beginthread(ThreadPool_WorkerFunc, 0, (void*) &threadID);
|
||||
err = ( handle == 0 );
|
||||
#else // !_WIN32
|
||||
pthread_t tid = 0;
|
||||
err = pthread_create( &tid, NULL, ThreadPool_WorkerFunc, (void*) &threadID );
|
||||
#endif // !_WIN32
|
||||
if( err )
|
||||
{
|
||||
log_error( "Error %d launching thread %d\n", err, i );
|
||||
threadPoolInitErr = err;
|
||||
gThreadCount = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
atexit( ThreadPool_Exit );
|
||||
|
||||
// block until they are done launching.
|
||||
do
|
||||
{
|
||||
#if defined( _WIN32 )
|
||||
WaitForSingleObject( caller_event, INFINITE );
|
||||
#else // !_WIN32
|
||||
if((err = pthread_cond_wait( &caller_cond_var, &caller_cond_lock) ))
|
||||
{
|
||||
log_error("Error %d from pthread_cond_wait. Unable to block for work to finish. ThreadPool_Init failed.\n", err );
|
||||
pthread_mutex_unlock( &caller_cond_lock);
|
||||
return;
|
||||
}
|
||||
#endif // !_WIN32
|
||||
}
|
||||
while( gRunCount != -gThreadCount );
|
||||
#if !defined( _WIN32 )
|
||||
if((err = pthread_mutex_unlock( &caller_cond_lock) ))
|
||||
{
|
||||
log_error("Error %d from pthread_mutex_unlock. Unable to block for work to finish. ThreadPool_Init failed.\n", err );
|
||||
return;
|
||||
}
|
||||
#endif // !_WIN32
|
||||
|
||||
threadPoolInitErr = CL_SUCCESS;
|
||||
}
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
static BOOL CALLBACK _ThreadPool_Init(_PINIT_ONCE InitOnce, PVOID Parameter, PVOID *lpContex)
|
||||
{
|
||||
ThreadPool_Init();
|
||||
return TRUE;
|
||||
}
|
||||
#endif
|
||||
|
||||
void ThreadPool_Exit(void)
|
||||
{
|
||||
int err, count;
|
||||
gRunCount = CL_INT_MAX;
|
||||
|
||||
#if defined( __GNUC__ )
|
||||
// GCC extension: http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
|
||||
__sync_synchronize();
|
||||
#elif defined( _MSC_VER )
|
||||
_mm_mfence();
|
||||
#else
|
||||
#warning If this is a weakly ordered memory system, please add a memory barrier here to force this and everything else to memory before we proceed
|
||||
#endif
|
||||
|
||||
// spin waiting for threads to die
|
||||
for (count = 0; 0 != gThreadCount && count < 1000; count++)
|
||||
{
|
||||
#if defined( _WIN32 )
|
||||
_WakeAllConditionVariable( cond_var );
|
||||
Sleep(1);
|
||||
#else // !_WIN32
|
||||
if( (err = pthread_cond_broadcast( &cond_var )))
|
||||
{
|
||||
log_error("Error %d from pthread_cond_broadcast. Unable to wake up work threads. ThreadPool_Exit failed.\n", err );
|
||||
break;
|
||||
}
|
||||
usleep(1000);
|
||||
#endif // !_WIN32
|
||||
}
|
||||
|
||||
if( gThreadCount )
|
||||
log_error( "Error: Thread pool timed out after 1 second with %d threads still active.\n", gThreadCount );
|
||||
else
|
||||
log_info( "Thread pool exited in a orderly fashion.\n" );
|
||||
}
|
||||
|
||||
|
||||
// Blocking API that farms out count jobs to a thread pool.
|
||||
// It may return with some work undone if func_ptr() returns a non-zero
|
||||
// result.
|
||||
//
|
||||
// This function obviously has its shortcommings. Only one call to ThreadPool_Do
|
||||
// can be running at a time. It is not intended for general purpose use.
|
||||
// If clEnqueueNativeKernelFn, out of order queues and a CL_DEVICE_TYPE_CPU were
|
||||
// all available then it would make more sense to use those features.
|
||||
cl_int ThreadPool_Do( TPFuncPtr func_ptr,
|
||||
cl_uint count,
|
||||
void *userInfo )
|
||||
{
|
||||
cl_int newErr;
|
||||
cl_int err = 0;
|
||||
// Lazily set up our threads
|
||||
#if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600)
|
||||
err = !_InitOnceExecuteOnce( &threadpool_init_control, _ThreadPool_Init, NULL, NULL );
|
||||
#elif defined (_WIN32)
|
||||
if (threadpool_init_control == 0) {
|
||||
#warning This is buggy and race prone. Find a better way.
|
||||
ThreadPool_Init();
|
||||
threadpool_init_control = 1;
|
||||
}
|
||||
#else //posix platform
|
||||
err = pthread_once( &threadpool_init_control, ThreadPool_Init );
|
||||
if( err )
|
||||
{
|
||||
log_error("Error %d from pthread_once. Unable to init threads. ThreadPool_Do failed.\n", err );
|
||||
return err;
|
||||
}
|
||||
#endif
|
||||
// Single threaded code to handle case where threadpool wasn't allocated or was disabled by environment variable
|
||||
if( threadPoolInitErr )
|
||||
{
|
||||
cl_uint currentJob = 0;
|
||||
cl_int result = CL_SUCCESS;
|
||||
|
||||
#if defined(__APPLE__) && defined(__arm__)
|
||||
// On most platforms which support denorm, default is FTZ off. However,
|
||||
// on some hardware where the reference is computed, default might be flush denorms to zero e.g. arm.
|
||||
// This creates issues in result verification. Since spec allows the implementation to either flush or
|
||||
// not flush denorms to zero, an implementation may choose not be flush i.e. return denorm result whereas
|
||||
// reference result may be zero (flushed denorm). Hence we need to disable denorm flushing on host side
|
||||
// where reference is being computed to make sure we get non-flushed reference result. If implementation
|
||||
// returns flushed result, we correctly take care of that in verification code.
|
||||
FPU_mode_type oldMode;
|
||||
DisableFTZ( &oldMode );
|
||||
#endif
|
||||
for( currentJob = 0; currentJob < count; currentJob++ )
|
||||
if((result = func_ptr( currentJob, 0, userInfo )))
|
||||
{
|
||||
#if defined(__APPLE__) && defined(__arm__)
|
||||
// Restore FP state before leaving
|
||||
RestoreFPState( &oldMode );
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
#if defined(__APPLE__) && defined(__arm__)
|
||||
// Restore FP state before leaving
|
||||
RestoreFPState( &oldMode );
|
||||
#endif
|
||||
|
||||
return CL_SUCCESS;
|
||||
}
|
||||
|
||||
if( count >= MAX_COUNT )
|
||||
{
|
||||
log_error("Error: ThreadPool_Do count %d >= max threadpool count of %d\n", count, MAX_COUNT );
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Enter critical region
|
||||
#if defined( _WIN32 )
|
||||
EnterCriticalSection( gThreadPoolLock );
|
||||
#else // !_WIN32
|
||||
if( (err = pthread_mutex_lock( &gThreadPoolLock )))
|
||||
{
|
||||
switch (err)
|
||||
{
|
||||
case EDEADLK:
|
||||
log_error("Error EDEADLK returned in ThreadPool_Do(). ThreadPool_Do is not designed to work recursively!\n" );
|
||||
break;
|
||||
case EINVAL:
|
||||
log_error("Error EINVAL returned in ThreadPool_Do(). How did we end up with an invalid gThreadPoolLock?\n" );
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return err;
|
||||
}
|
||||
#endif // !_WIN32
|
||||
|
||||
// Start modifying the job state observable by worker threads
|
||||
#if defined( _WIN32 )
|
||||
EnterCriticalSection( cond_lock );
|
||||
#else // !_WIN32
|
||||
if((err = pthread_mutex_lock( &cond_lock) ))
|
||||
{
|
||||
log_error("Error %d from pthread_mutex_lock. Unable to wake up work threads. ThreadPool_Do failed.\n", err );
|
||||
goto exit;
|
||||
}
|
||||
#endif // !_WIN32
|
||||
|
||||
// Make sure the last thread done in the work pool doesn't signal us to wake before we get to the point where we are supposed to wait
|
||||
// That would cause a deadlock.
|
||||
#if !defined( _WIN32 )
|
||||
if((err = pthread_mutex_lock( &caller_cond_lock) ))
|
||||
{
|
||||
log_error("Error %d from pthread_mutex_lock. Unable to block for work to finish. ThreadPool_Do failed.\n", err );
|
||||
goto exit;
|
||||
}
|
||||
#endif // !_WIN32
|
||||
|
||||
// Prime the worker threads to get going
|
||||
jobError = CL_SUCCESS;
|
||||
gRunCount = gJobCount = count;
|
||||
gFunc_ptr = func_ptr;
|
||||
gUserInfo = userInfo;
|
||||
|
||||
#if defined( _WIN32 )
|
||||
_WakeAllConditionVariable( cond_var );
|
||||
LeaveCriticalSection( cond_lock );
|
||||
#else // !_WIN32
|
||||
if( (err = pthread_cond_broadcast( &cond_var )))
|
||||
{
|
||||
log_error("Error %d from pthread_cond_broadcast. Unable to wake up work threads. ThreadPool_Do failed.\n", err );
|
||||
goto exit;
|
||||
}
|
||||
if((err = pthread_mutex_unlock( &cond_lock) ))
|
||||
{
|
||||
log_error("Error %d from pthread_mutex_unlock. Unable to wake up work threads. ThreadPool_Do failed.\n", err );
|
||||
goto exit;
|
||||
}
|
||||
#endif // !_WIN32
|
||||
|
||||
// block until they are done. It would be slightly more efficient to do some of the work here though.
|
||||
do
|
||||
{
|
||||
#if defined( _WIN32 )
|
||||
WaitForSingleObject( caller_event, INFINITE );
|
||||
#else // !_WIN32
|
||||
if((err = pthread_cond_wait( &caller_cond_var, &caller_cond_lock) ))
|
||||
{
|
||||
log_error("Error %d from pthread_cond_wait. Unable to block for work to finish. ThreadPool_Do failed.\n", err );
|
||||
pthread_mutex_unlock( &caller_cond_lock);
|
||||
goto exit;
|
||||
}
|
||||
#endif // !_WIN32
|
||||
}
|
||||
while( gRunning );
|
||||
#if !defined(_WIN32)
|
||||
if((err = pthread_mutex_unlock( &caller_cond_lock) ))
|
||||
{
|
||||
log_error("Error %d from pthread_mutex_unlock. Unable to block for work to finish. ThreadPool_Do failed.\n", err );
|
||||
goto exit;
|
||||
}
|
||||
#endif // !_WIN32
|
||||
|
||||
err = jobError;
|
||||
|
||||
exit:
|
||||
// exit critical region
|
||||
#if defined( _WIN32 )
|
||||
LeaveCriticalSection( gThreadPoolLock );
|
||||
#else // !_WIN32
|
||||
newErr = pthread_mutex_unlock( &gThreadPoolLock );
|
||||
if( newErr)
|
||||
{
|
||||
log_error("Error %d from pthread_mutex_unlock. Unable to exit critical region. ThreadPool_Do failed.\n", newErr );
|
||||
return err;
|
||||
}
|
||||
#endif // !_WIN32
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
cl_uint GetThreadCount( void )
|
||||
{
|
||||
// Lazily set up our threads
|
||||
#if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600)
|
||||
cl_int err = !_InitOnceExecuteOnce( &threadpool_init_control, _ThreadPool_Init, NULL, NULL );
|
||||
#elif defined (_WIN32)
|
||||
if (threadpool_init_control == 0) {
|
||||
#warning This is buggy and race prone. Find a better way.
|
||||
ThreadPool_Init();
|
||||
threadpool_init_control = 1;
|
||||
}
|
||||
#else
|
||||
cl_int err = pthread_once( &threadpool_init_control, ThreadPool_Init );
|
||||
if( err )
|
||||
{
|
||||
log_error("Error %d from pthread_once. Unable to init threads. ThreadPool_Do failed.\n", err );
|
||||
return err;
|
||||
}
|
||||
#endif // !_WIN32
|
||||
|
||||
if( gThreadCount < 1 )
|
||||
return 1;
|
||||
|
||||
return gThreadCount;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#ifndef MY_OS_REALLY_REALLY_DOESNT_SUPPORT_THREADS
|
||||
#error ThreadPool implementation has not been multithreaded for this operating system. You must multithread this section.
|
||||
#endif
|
||||
//
|
||||
// We require multithreading in parts of the test as a means of simultaneously testing reentrancy requirements
|
||||
// of OpenCL API, while also checking
|
||||
//
|
||||
// A sample single threaded implementation follows, for documentation / bootstrapping purposes.
|
||||
// It is not okay to use this for conformance testing!!!
|
||||
//
|
||||
// Exception: If your operating system does not support multithreaded execution of any kind, then you may use this code.
|
||||
//
|
||||
|
||||
cl_int ThreadPool_AtomicAdd( volatile cl_int *a, cl_int b )
|
||||
{
|
||||
cl_uint r = *a;
|
||||
|
||||
// since this fallback code path is not multithreaded, we just do a regular add here
|
||||
// If your operating system supports memory-barrier-atomics, use those here
|
||||
*a = r + b;
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
// Blocking API that farms out count jobs to a thread pool.
|
||||
// It may return with some work undone if func_ptr() returns a non-zero
|
||||
// result.
|
||||
cl_int ThreadPool_Do( TPFuncPtr func_ptr,
|
||||
cl_uint count,
|
||||
void *userInfo )
|
||||
{
|
||||
cl_uint currentJob = 0;
|
||||
cl_int result = CL_SUCCESS;
|
||||
|
||||
#ifndef MY_OS_REALLY_REALLY_DOESNT_SUPPORT_THREADS
|
||||
// THIS FUNCTION IS NOT INTENDED FOR USE!!
|
||||
log_error( "ERROR: Test must be multithreaded!\n" );
|
||||
exit(-1);
|
||||
#else
|
||||
static int spewCount = 0;
|
||||
|
||||
if( 0 == spewCount )
|
||||
{
|
||||
log_info( "\nWARNING: The operating system is claimed not to support threads of any sort. Running single threaded.\n" );
|
||||
spewCount = 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
// The multithreaded code should mimic this behavior:
|
||||
for( currentJob = 0; currentJob < count; currentJob++ )
|
||||
if((result = func_ptr( currentJob, 0, userInfo )))
|
||||
return result;
|
||||
|
||||
return CL_SUCCESS;
|
||||
}
|
||||
|
||||
cl_uint GetThreadCount( void )
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
void SetThreadCount( int count )
|
||||
{
|
||||
if( count > 1 )
|
||||
log_info( "WARNING: SetThreadCount(%d) ignored\n", count );
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,393 +0,0 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
/*
|
||||
Header compat.h should be used instead of stdlib.h, stdbool.h, stdint.h, float.h, fenv.h,
|
||||
math.h. It provides workarounds if these headers are not available or not complete.
|
||||
|
||||
Important: It should be included before math.h, directly or indirectly, because Intel mathimf.h
|
||||
is not compatible with Microsoft math.h. Including math.h before mathimf.h causes compile-time
|
||||
error.
|
||||
*/
|
||||
#ifndef _COMPAT_H_
|
||||
#define _COMPAT_H_
|
||||
|
||||
#if defined(_WIN32) && defined (_MSC_VER)
|
||||
#include <Windows.h>
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
#define EXTERN_C extern "C"
|
||||
#else
|
||||
#define EXTERN_C
|
||||
#endif
|
||||
|
||||
|
||||
//
|
||||
// stdlib.h
|
||||
//
|
||||
|
||||
#include <stdlib.h> // On Windows, _MAX_PATH defined there.
|
||||
|
||||
// llabs appeared in MS C v16 (VS 10/2010).
|
||||
#if defined( _MSC_VER ) && _MSC_VER <= 1500
|
||||
EXTERN_C inline long long llabs(long long __x) { return __x >= 0 ? __x : -__x; }
|
||||
#endif
|
||||
|
||||
|
||||
//
|
||||
// stdbool.h
|
||||
//
|
||||
|
||||
// stdbool.h appeared in MS C v18 (VS 12/2013).
|
||||
#if defined( _MSC_VER ) && MSC_VER <= 1700
|
||||
#if !defined(__cplusplus)
|
||||
typedef char bool;
|
||||
#define true 1
|
||||
#define false 0
|
||||
#endif
|
||||
#else
|
||||
#include <stdbool.h>
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
//
|
||||
// stdint.h
|
||||
//
|
||||
|
||||
// stdint.h appeared in MS C v16 (VS 10/2010) and Intel C v12.
|
||||
#if defined( _MSC_VER ) && ( ! defined( __INTEL_COMPILER ) && _MSC_VER <= 1500 || defined( __INTEL_COMPILER ) && __INTEL_COMPILER < 1200 )
|
||||
typedef unsigned char uint8_t;
|
||||
typedef char int8_t;
|
||||
typedef unsigned short uint16_t;
|
||||
typedef short int16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef int int32_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
typedef long long int64_t;
|
||||
#else
|
||||
#ifndef __STDC_LIMIT_MACROS
|
||||
#define __STDC_LIMIT_MACROS
|
||||
#endif
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
//
|
||||
// float.h
|
||||
//
|
||||
|
||||
#include <float.h>
|
||||
|
||||
|
||||
|
||||
//
|
||||
// fenv.h
|
||||
//
|
||||
|
||||
// fenv.h appeared in MS C v18 (VS 12/2013).
|
||||
#if defined( _MSC_VER ) && _MSC_VER <= 1700 && ! defined( __INTEL_COMPILER )
|
||||
// reimplement fenv.h because windows doesn't have it
|
||||
#define FE_INEXACT 0x0020
|
||||
#define FE_UNDERFLOW 0x0010
|
||||
#define FE_OVERFLOW 0x0008
|
||||
#define FE_DIVBYZERO 0x0004
|
||||
#define FE_INVALID 0x0001
|
||||
#define FE_ALL_EXCEPT 0x003D
|
||||
int fetestexcept(int excepts);
|
||||
int feclearexcept(int excepts);
|
||||
#else
|
||||
#include <fenv.h>
|
||||
#endif
|
||||
|
||||
|
||||
//
|
||||
// math.h
|
||||
//
|
||||
|
||||
#if defined( __INTEL_COMPILER )
|
||||
#include <mathimf.h>
|
||||
#else
|
||||
#include <math.h>
|
||||
#endif
|
||||
|
||||
#if defined( _MSC_VER )
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifndef M_PI
|
||||
#define M_PI 3.14159265358979323846264338327950288
|
||||
#endif
|
||||
|
||||
#if ! defined( __INTEL_COMPILER )
|
||||
|
||||
#ifndef NAN
|
||||
#define NAN (INFINITY - INFINITY)
|
||||
#endif
|
||||
#ifndef HUGE_VALF
|
||||
#define HUGE_VALF (float)HUGE_VAL
|
||||
#endif
|
||||
#ifndef INFINITY
|
||||
#define INFINITY (FLT_MAX + FLT_MAX)
|
||||
#endif
|
||||
#ifndef isfinite
|
||||
#define isfinite(x) _finite(x)
|
||||
#endif
|
||||
#ifndef isnan
|
||||
#define isnan( x ) ((x) != (x))
|
||||
#endif
|
||||
#ifndef isinf
|
||||
#define isinf( _x) ((_x) == INFINITY || (_x) == -INFINITY)
|
||||
#endif
|
||||
|
||||
double rint( double x);
|
||||
float rintf( float x);
|
||||
long double rintl( long double x);
|
||||
|
||||
float cbrtf( float );
|
||||
double cbrt( double );
|
||||
|
||||
int ilogb( double x);
|
||||
int ilogbf (float x);
|
||||
int ilogbl(long double x);
|
||||
|
||||
double fmax(double x, double y);
|
||||
double fmin(double x, double y);
|
||||
float fmaxf( float x, float y );
|
||||
float fminf(float x, float y);
|
||||
|
||||
double log2(double x);
|
||||
long double log2l(long double x);
|
||||
|
||||
double exp2(double x);
|
||||
long double exp2l(long double x);
|
||||
|
||||
double fdim(double x, double y);
|
||||
float fdimf(float x, float y);
|
||||
long double fdiml(long double x, long double y);
|
||||
|
||||
double remquo( double x, double y, int *quo);
|
||||
float remquof( float x, float y, int *quo);
|
||||
long double remquol( long double x, long double y, int *quo);
|
||||
|
||||
long double scalblnl(long double x, long n);
|
||||
|
||||
// snprintf added in _MSC_VER == 1900 (Visual Studio 2015)
|
||||
#if defined( _MSC_VER ) && _MSC_VER < 1900
|
||||
#define snprintf sprintf_s
|
||||
#endif
|
||||
float hypotf(float x, float y);
|
||||
long double hypotl(long double x, long double y) ;
|
||||
double lgamma(double x);
|
||||
float lgammaf(float x);
|
||||
|
||||
double trunc(double x);
|
||||
float truncf(float x);
|
||||
|
||||
double log1p(double x);
|
||||
float log1pf(float x);
|
||||
long double log1pl(long double x);
|
||||
|
||||
double copysign(double x, double y);
|
||||
float copysignf(float x, float y);
|
||||
long double copysignl(long double x, long double y);
|
||||
|
||||
long lround(double x);
|
||||
long lroundf(float x);
|
||||
//long lroundl(long double x)
|
||||
|
||||
double round(double x);
|
||||
float roundf(float x);
|
||||
long double roundl(long double x);
|
||||
|
||||
int cf_signbit(double x);
|
||||
int cf_signbitf(float x);
|
||||
|
||||
// Added in _MSC_VER == 1800 (Visual Studio 2013)
|
||||
#if defined( _MSC_VER ) && _MSC_VER < 1800
|
||||
static int signbit(double x) { return cf_signbit(x); }
|
||||
#endif
|
||||
static int signbitf(float x) { return cf_signbitf(x); }
|
||||
|
||||
long int lrint (double flt);
|
||||
long int lrintf (float flt);
|
||||
|
||||
float int2float (int32_t ix);
|
||||
int32_t float2int (float fx);
|
||||
|
||||
#endif
|
||||
|
||||
#if ! defined( __INTEL_COMPILER ) || __INTEL_COMPILER < 1300
|
||||
// These functions appeared in Intel C v13.
|
||||
float nanf( const char* str);
|
||||
double nan( const char* str);
|
||||
long double nanl( const char* str);
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#if defined( __ANDROID__ )
|
||||
#define log2(X) (log(X)/log(2))
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
//
|
||||
// stdio.h
|
||||
//
|
||||
|
||||
|
||||
|
||||
//
|
||||
// unistd.h
|
||||
//
|
||||
|
||||
#if defined( _MSC_VER )
|
||||
EXTERN_C unsigned int sleep( unsigned int sec );
|
||||
EXTERN_C int usleep( int usec );
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
//
|
||||
// syscall.h
|
||||
//
|
||||
|
||||
#if defined( __ANDROID__ )
|
||||
// Android bionic's isn't providing SYS_sysctl wrappers.
|
||||
#define SYS__sysctl __NR__sysctl
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
// Some tests use _malloca which defined in malloc.h.
|
||||
#if !defined (__APPLE__)
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
|
||||
|
||||
//
|
||||
// ???
|
||||
//
|
||||
|
||||
#if defined( _MSC_VER )
|
||||
|
||||
#define MAXPATHLEN _MAX_PATH
|
||||
|
||||
EXTERN_C uint64_t ReadTime( void );
|
||||
EXTERN_C double SubtractTime( uint64_t endTime, uint64_t startTime );
|
||||
|
||||
/** Returns the number of leading 0-bits in x,
|
||||
starting at the most significant bit position.
|
||||
If x is 0, the result is undefined.
|
||||
*/
|
||||
EXTERN_C int __builtin_clz(unsigned int pattern);
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(x,y) (((x)<(y))?(x):(y))
|
||||
#endif
|
||||
#ifndef MAX
|
||||
#define MAX(x,y) (((x)>(y))?(x):(y))
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
------------------------------------------------------------------------------------------------
|
||||
WARNING: DO NOT USE THESE MACROS: MAKE_HEX_FLOAT, MAKE_HEX_DOUBLE, MAKE_HEX_LONG.
|
||||
|
||||
This is a typical usage of the macros:
|
||||
|
||||
double yhi = MAKE_HEX_DOUBLE(0x1.5555555555555p-2,0x15555555555555LL,-2);
|
||||
|
||||
(taken from math_brute_force/reference_math.c). There are two problems:
|
||||
|
||||
1. There is an error here. On Windows in will produce incorrect result
|
||||
`0x1.5555555555555p+50'. To have a correct result it should be written as
|
||||
`MAKE_HEX_DOUBLE(0x1.5555555555555p-2,0x15555555555555LL,-54)'. A proper value of the
|
||||
third argument is not obvious -- sometimes it should be the same as exponent of the
|
||||
first argument, but sometimes not.
|
||||
|
||||
2. Information is duplicated. It is easy to make a mistake.
|
||||
|
||||
Use HEX_FLT, HEX_DBL, HEX_LDBL macros instead (see them in the bottom of the file).
|
||||
------------------------------------------------------------------------------------------------
|
||||
*/
|
||||
#if defined ( _MSC_VER ) && ! defined( __INTEL_COMPILER )
|
||||
|
||||
#define MAKE_HEX_FLOAT(x,y,z) ((float)ldexp( (float)(y), z))
|
||||
#define MAKE_HEX_DOUBLE(x,y,z) ldexp( (double)(y), z)
|
||||
#define MAKE_HEX_LONG(x,y,z) ((long double) ldexp( (long double)(y), z))
|
||||
|
||||
#else
|
||||
|
||||
// Do not use these macros in new code, use HEX_FLT, HEX_DBL, HEX_LDBL instead.
|
||||
#define MAKE_HEX_FLOAT(x,y,z) x
|
||||
#define MAKE_HEX_DOUBLE(x,y,z) x
|
||||
#define MAKE_HEX_LONG(x,y,z) x
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
------------------------------------------------------------------------------------------------
|
||||
HEX_FLT, HEXT_DBL, HEX_LDBL -- Create hex floating point literal of type float, double, long
|
||||
double respectively. Arguments:
|
||||
|
||||
sm -- sign of number,
|
||||
int -- integer part of mantissa (without `0x' prefix),
|
||||
fract -- fractional part of mantissa (without decimal point and `L' or `LL' suffixes),
|
||||
se -- sign of exponent,
|
||||
exp -- absolute value of (binary) exponent.
|
||||
|
||||
Example:
|
||||
|
||||
double yhi = HEX_DBL( +, 1, 5555555555555, -, 2 ); // == 0x1.5555555555555p-2
|
||||
|
||||
Note:
|
||||
|
||||
We have to pass signs as separate arguments because gcc pass negative integer values
|
||||
(e. g. `-2') into a macro as two separate tokens, so `HEX_FLT( 1, 0, -2 )' produces result
|
||||
`0x1.0p- 2' (note a space between minus and two) which is not a correct floating point
|
||||
literal.
|
||||
------------------------------------------------------------------------------------------------
|
||||
*/
|
||||
#if defined ( _MSC_VER ) && ! defined( __INTEL_COMPILER )
|
||||
// If compiler does not support hex floating point literals:
|
||||
#define HEX_FLT( sm, int, fract, se, exp ) sm ldexpf( (float)( 0x ## int ## fract ## UL ), se exp + ilogbf( (float) 0x ## int ) - ilogbf( ( float )( 0x ## int ## fract ## UL ) ) )
|
||||
#define HEX_DBL( sm, int, fract, se, exp ) sm ldexp( (double)( 0x ## int ## fract ## ULL ), se exp + ilogb( (double) 0x ## int ) - ilogb( ( double )( 0x ## int ## fract ## ULL ) ) )
|
||||
#define HEX_LDBL( sm, int, fract, se, exp ) sm ldexpl( (long double)( 0x ## int ## fract ## ULL ), se exp + ilogbl( (long double) 0x ## int ) - ilogbl( ( long double )( 0x ## int ## fract ## ULL ) ) )
|
||||
#else
|
||||
// If compiler supports hex floating point literals: just concatenate all the parts into a literal.
|
||||
#define HEX_FLT( sm, int, fract, se, exp ) sm 0x ## int ## . ## fract ## p ## se ## exp ## F
|
||||
#define HEX_DBL( sm, int, fract, se, exp ) sm 0x ## int ## . ## fract ## p ## se ## exp
|
||||
#define HEX_LDBL( sm, int, fract, se, exp ) sm 0x ## int ## . ## fract ## p ## se ## exp ## L
|
||||
#endif
|
||||
|
||||
#if defined(__MINGW32__)
|
||||
#include <Windows.h>
|
||||
#define sleep(sec) Sleep((sec) * 1000)
|
||||
#endif
|
||||
|
||||
#endif // _COMPAT_H_
|
||||
@@ -1,164 +0,0 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#ifndef _errorHelpers_h
|
||||
#define _errorHelpers_h
|
||||
|
||||
#include <sstream>
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <OpenCL/opencl.h>
|
||||
#else
|
||||
#include <CL/opencl.h>
|
||||
#endif
|
||||
#include <stdlib.h>
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define LOWER_IS_BETTER 0
|
||||
#define HIGHER_IS_BETTER 1
|
||||
|
||||
// If USE_ATF is defined, all log_error and log_info calls can be routed to test library
|
||||
// functions as described below. This is helpful for integration into an automated testing
|
||||
// system.
|
||||
#if USE_ATF
|
||||
// export BUILD_WITH_ATF=1
|
||||
#include <ATF/ATF.h>
|
||||
#define test_start() ATFTestStart()
|
||||
#define log_info ATFLogInfo
|
||||
#define log_error ATFLogError
|
||||
#define log_perf(_number, _higherBetter, _numType, _format, ...) ATFLogPerformanceNumber(_number, _higherBetter, _numType, _format, ##__VA_ARGS__)
|
||||
#define test_finish() ATFTestFinish()
|
||||
#define vlog_perf(_number, _higherBetter, _numType, _format, ...) ATFLogPerformanceNumber(_number, _higherBetter, _numType, _format,##__VA_ARGS__)
|
||||
#define vlog ATFLogInfo
|
||||
#define vlog_error ATFLogError
|
||||
#else
|
||||
#define test_start()
|
||||
#define log_info printf
|
||||
#define log_error printf
|
||||
#define log_perf(_number, _higherBetter, _numType, _format, ...) printf("Performance Number " _format " (in %s, %s): %g\n",##__VA_ARGS__, _numType, \
|
||||
_higherBetter?"higher is better":"lower is better", _number )
|
||||
#define test_finish()
|
||||
#define vlog_perf(_number, _higherBetter, _numType, _format, ...) printf("Performance Number " _format " (in %s, %s): %g\n",##__VA_ARGS__, _numType, \
|
||||
_higherBetter?"higher is better":"lower is better" , _number)
|
||||
#ifdef _WIN32
|
||||
#ifdef __MINGW32__
|
||||
// Use __mingw_printf since it supports "%a" format specifier
|
||||
#define vlog __mingw_printf
|
||||
#define vlog_error __mingw_printf
|
||||
#else
|
||||
// Use home-baked function that treats "%a" as "%f"
|
||||
static int vlog_win32(const char *format, ...);
|
||||
#define vlog vlog_win32
|
||||
#define vlog_error vlog_win32
|
||||
#endif
|
||||
#else
|
||||
#define vlog_error printf
|
||||
#define vlog printf
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define ct_assert(b) ct_assert_i(b, __LINE__)
|
||||
#define ct_assert_i(b, line) ct_assert_ii(b, line)
|
||||
#define ct_assert_ii(b, line) int _compile_time_assertion_on_line_##line[b ? 1 : -1];
|
||||
|
||||
#define test_error(errCode,msg) test_error_ret(errCode,msg,errCode)
|
||||
#define test_error_ret(errCode,msg,retValue) { if( errCode != CL_SUCCESS ) { print_error( errCode, msg ); return retValue ; } }
|
||||
#define print_error(errCode,msg) log_error( "ERROR: %s! (%s from %s:%d)\n", msg, IGetErrorString( errCode ), __FILE__, __LINE__ );
|
||||
|
||||
// expected error code vs. what we got
|
||||
#define test_failure_error(errCode, expectedErrCode, msg) test_failure_error_ret(errCode, expectedErrCode, msg, errCode != expectedErrCode)
|
||||
#define test_failure_error_ret(errCode, expectedErrCode, msg, retValue) { if( errCode != expectedErrCode ) { print_failure_error( errCode, expectedErrCode, msg ); return retValue ; } }
|
||||
#define print_failure_error(errCode, expectedErrCode, msg) log_error( "ERROR: %s! (Got %s, expected %s from %s:%d)\n", msg, IGetErrorString( errCode ), IGetErrorString( expectedErrCode ), __FILE__, __LINE__ );
|
||||
#define test_failure_warning(errCode, expectedErrCode, msg) test_failure_warning_ret(errCode, expectedErrCode, msg, errCode != expectedErrCode)
|
||||
#define test_failure_warning_ret(errCode, expectedErrCode, msg, retValue) { if( errCode != expectedErrCode ) { print_failure_warning( errCode, expectedErrCode, msg ); warnings++ ; } }
|
||||
#define print_failure_warning(errCode, expectedErrCode, msg) log_error( "WARNING: %s! (Got %s, expected %s from %s:%d)\n", msg, IGetErrorString( errCode ), IGetErrorString( expectedErrCode ), __FILE__, __LINE__ );
|
||||
|
||||
#define ASSERT_SUCCESS(expr, msg) \
|
||||
do \
|
||||
{ \
|
||||
cl_int _temp_retval = (expr); \
|
||||
if (_temp_retval != CL_SUCCESS) \
|
||||
{ \
|
||||
std::stringstream ss; \
|
||||
ss << "ERROR: " << msg << "=" << IGetErrorString(_temp_retval) \
|
||||
<< " at " << __FILE__ << ":" << __LINE__ << "\n"; \
|
||||
throw std::runtime_error(ss.str()); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
extern const char *IGetErrorString( int clErrorCode );
|
||||
|
||||
extern float Ulp_Error_Half( cl_ushort test, float reference );
|
||||
extern float Ulp_Error( float test, double reference );
|
||||
extern float Ulp_Error_Double( double test, long double reference );
|
||||
|
||||
extern const char *GetChannelTypeName( cl_channel_type type );
|
||||
extern int IsChannelTypeSupported( cl_channel_type type );
|
||||
extern const char *GetChannelOrderName( cl_channel_order order );
|
||||
extern int IsChannelOrderSupported( cl_channel_order order );
|
||||
extern const char *GetAddressModeName( cl_addressing_mode mode );
|
||||
|
||||
extern const char *GetDeviceTypeName( cl_device_type type );
|
||||
|
||||
// NON-REENTRANT UNLESS YOU PROVIDE A BUFFER PTR (pass null to use static storage, but it's not reentrant then!)
|
||||
extern const char *GetDataVectorString( void *dataBuffer, size_t typeSize, size_t vecSize, char *buffer );
|
||||
|
||||
#if defined (_WIN32) && !defined(__MINGW32__)
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
static int vlog_win32(const char *format, ...)
|
||||
{
|
||||
const char *new_format = format;
|
||||
|
||||
if (strstr(format, "%a")) {
|
||||
char *temp;
|
||||
if ((temp = strdup(format)) == NULL) {
|
||||
printf("vlog_win32: Failed to allocate memory for strdup\n");
|
||||
return -1;
|
||||
}
|
||||
new_format = temp;
|
||||
while (*temp) {
|
||||
// replace %a with %f
|
||||
if ((*temp == '%') && (*(temp+1) == 'a')) {
|
||||
*(temp+1) = 'f';
|
||||
}
|
||||
temp++;
|
||||
}
|
||||
}
|
||||
|
||||
va_list args;
|
||||
va_start(args, format);
|
||||
vprintf(new_format, args);
|
||||
va_end(args);
|
||||
|
||||
if (new_format != format) {
|
||||
free((void*)new_format);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // _errorHelpers_h
|
||||
|
||||
|
||||
@@ -1,104 +0,0 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#ifndef _fpcontrol_h
|
||||
#define _fpcontrol_h
|
||||
|
||||
// In order to get tests for correctly rounded operations (e.g. multiply) to work properly we need to be able to set the reference hardware
|
||||
// to FTZ mode if the device hardware is running in that mode. We have explored all other options short of writing correctly rounded operations
|
||||
// in integer code, and have found this is the only way to correctly verify operation.
|
||||
//
|
||||
// Non-Apple implementations will need to provide their own implentation for these features. If the reference hardware and device are both
|
||||
// running in the same state (either FTZ or IEEE compliant modes) then these functions may be empty. If the device is running in non-default
|
||||
// rounding mode (e.g. round toward zero), then these functions should also set the reference device into that rounding mode.
|
||||
#if defined( __APPLE__ ) || defined( _MSC_VER ) || defined( __linux__ ) || defined (__MINGW32__)
|
||||
typedef int FPU_mode_type;
|
||||
#if defined( __i386__ ) || defined( __x86_64__ )
|
||||
#include <xmmintrin.h>
|
||||
#elif defined( __PPC__ )
|
||||
#include <fpu_control.h>
|
||||
extern __thread fpu_control_t fpu_control;
|
||||
#endif
|
||||
// Set the reference hardware floating point unit to FTZ mode
|
||||
static inline void ForceFTZ( FPU_mode_type *mode )
|
||||
{
|
||||
#if defined( __i386__ ) || defined( __x86_64__ ) || defined( _MSC_VER ) || defined (__MINGW32__)
|
||||
*mode = _mm_getcsr();
|
||||
_mm_setcsr( *mode | 0x8040);
|
||||
#elif defined( __PPC__ )
|
||||
*mode = fpu_control;
|
||||
fpu_control |= _FPU_MASK_NI;
|
||||
#elif defined ( __arm__ )
|
||||
unsigned fpscr;
|
||||
__asm__ volatile ("fmrx %0, fpscr" : "=r"(fpscr));
|
||||
*mode = fpscr;
|
||||
__asm__ volatile ("fmxr fpscr, %0" :: "r"(fpscr | (1U << 24)));
|
||||
// Add 64 bit support
|
||||
#elif defined (__aarch64__)
|
||||
unsigned fpscr;
|
||||
__asm__ volatile ("mrs %0, fpcr" : "=r"(fpscr));
|
||||
*mode = fpscr;
|
||||
__asm__ volatile ("msr fpcr, %0" :: "r"(fpscr | (1U << 24)));
|
||||
#else
|
||||
#error ForceFTZ needs an implentation
|
||||
#endif
|
||||
}
|
||||
|
||||
// Disable the denorm flush to zero
|
||||
static inline void DisableFTZ( FPU_mode_type *mode )
|
||||
{
|
||||
#if defined( __i386__ ) || defined( __x86_64__ ) || defined( _MSC_VER ) || defined (__MINGW32__)
|
||||
*mode = _mm_getcsr();
|
||||
_mm_setcsr( *mode & ~0x8040);
|
||||
#elif defined( __PPC__ )
|
||||
*mode = fpu_control;
|
||||
fpu_control &= ~_FPU_MASK_NI;
|
||||
#elif defined ( __arm__ )
|
||||
unsigned fpscr;
|
||||
__asm__ volatile ("fmrx %0, fpscr" : "=r"(fpscr));
|
||||
*mode = fpscr;
|
||||
__asm__ volatile ("fmxr fpscr, %0" :: "r"(fpscr & ~(1U << 24)));
|
||||
// Add 64 bit support
|
||||
#elif defined (__aarch64__)
|
||||
unsigned fpscr;
|
||||
__asm__ volatile ("mrs %0, fpcr" : "=r"(fpscr));
|
||||
*mode = fpscr;
|
||||
__asm__ volatile ("msr fpcr, %0" :: "r"(fpscr & ~(1U << 24)));
|
||||
#else
|
||||
#error DisableFTZ needs an implentation
|
||||
#endif
|
||||
}
|
||||
|
||||
// Restore the reference hardware to floating point state indicated by *mode
|
||||
static inline void RestoreFPState( FPU_mode_type *mode )
|
||||
{
|
||||
#if defined( __i386__ ) || defined( __x86_64__ ) || defined( _MSC_VER ) || defined (__MINGW32__)
|
||||
_mm_setcsr( *mode );
|
||||
#elif defined( __PPC__)
|
||||
fpu_control = *mode;
|
||||
#elif defined (__arm__)
|
||||
__asm__ volatile ("fmxr fpscr, %0" :: "r"(*mode));
|
||||
// Add 64 bit support
|
||||
#elif defined (__aarch64__)
|
||||
__asm__ volatile ("msr fpcr, %0" :: "r"(*mode));
|
||||
#else
|
||||
#error RestoreFPState needs an implementation
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
#error ForceFTZ and RestoreFPState need implentations
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -1,773 +0,0 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#include "compat.h"
|
||||
|
||||
#if defined ( _MSC_VER )
|
||||
|
||||
#include <limits.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <CL/cl.h>
|
||||
|
||||
#include <windows.h>
|
||||
|
||||
#if ! defined( __INTEL_COMPILER )
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// rint, rintf
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////
|
||||
|
||||
float copysignf( float x, float y )
|
||||
{
|
||||
union{ cl_uint u; float f; }ux, uy;
|
||||
|
||||
ux.f = x;
|
||||
uy.f = y;
|
||||
|
||||
ux.u = (ux.u & 0x7fffffffU) | (uy.u & 0x80000000U);
|
||||
|
||||
return ux.f;
|
||||
}
|
||||
|
||||
double copysign( double x, double y )
|
||||
{
|
||||
union{ cl_ulong u; double f; }ux, uy;
|
||||
|
||||
ux.f = x;
|
||||
uy.f = y;
|
||||
|
||||
ux.u = (ux.u & 0x7fffffffffffffffULL) | (uy.u & 0x8000000000000000ULL);
|
||||
|
||||
return ux.f;
|
||||
}
|
||||
|
||||
long double copysignl( long double x, long double y )
|
||||
{
|
||||
union
|
||||
{
|
||||
long double f;
|
||||
struct{ cl_ulong m; cl_ushort sexp; }u;
|
||||
}ux, uy;
|
||||
|
||||
ux.f = x;
|
||||
uy.f = y;
|
||||
|
||||
ux.u.sexp = (ux.u.sexp & 0x7fff) | (uy.u.sexp & 0x8000);
|
||||
|
||||
return ux.f;
|
||||
}
|
||||
|
||||
float rintf(float x)
|
||||
{
|
||||
float absx = fabsf(x);
|
||||
|
||||
if( absx < 8388608.0f /* 0x1.0p23f */ )
|
||||
{
|
||||
float magic = copysignf( 8388608.0f /* 0x1.0p23f */, x );
|
||||
float rounded = x + magic;
|
||||
rounded -= magic;
|
||||
x = copysignf( rounded, x );
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
double rint(double x)
|
||||
{
|
||||
double absx = fabs(x);
|
||||
|
||||
if( absx < 4503599627370496.0 /* 0x1.0p52f */ )
|
||||
{
|
||||
double magic = copysign( 4503599627370496.0 /* 0x1.0p52 */, x );
|
||||
double rounded = x + magic;
|
||||
rounded -= magic;
|
||||
x = copysign( rounded, x );
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
long double rintl(long double x)
|
||||
{
|
||||
double absx = fabs(x);
|
||||
|
||||
if( absx < 9223372036854775808.0L /* 0x1.0p64f */ )
|
||||
{
|
||||
long double magic = copysignl( 9223372036854775808.0L /* 0x1.0p63L */, x );
|
||||
long double rounded = x + magic;
|
||||
rounded -= magic;
|
||||
x = copysignl( rounded, x );
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// ilogb, ilogbf, ilogbl
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////
|
||||
#ifndef FP_ILOGB0
|
||||
#define FP_ILOGB0 INT_MIN
|
||||
#endif
|
||||
|
||||
#ifndef FP_ILOGBNAN
|
||||
#define FP_ILOGBNAN INT_MIN
|
||||
#endif
|
||||
|
||||
int ilogb (double x)
|
||||
{
|
||||
union{ double f; cl_ulong u;} u;
|
||||
u.f = x;
|
||||
|
||||
cl_ulong absx = u.u & CL_LONG_MAX;
|
||||
if( absx - 0x0001000000000000ULL >= 0x7ff0000000000000ULL - 0x0001000000000000ULL)
|
||||
{
|
||||
switch( absx )
|
||||
{
|
||||
case 0:
|
||||
return FP_ILOGB0;
|
||||
case 0x7ff0000000000000ULL:
|
||||
return INT_MAX;
|
||||
default:
|
||||
if( absx > 0x7ff0000000000000ULL )
|
||||
return FP_ILOGBNAN;
|
||||
|
||||
// subnormal
|
||||
u.u = absx | 0x3ff0000000000000ULL;
|
||||
u.f -= 1.0;
|
||||
return (u.u >> 52) - (1023 + 1022);
|
||||
}
|
||||
}
|
||||
|
||||
return (absx >> 52) - 1023;
|
||||
}
|
||||
|
||||
|
||||
int ilogbf (float x)
|
||||
{
|
||||
union{ float f; cl_uint u;} u;
|
||||
u.f = x;
|
||||
|
||||
cl_uint absx = u.u & 0x7fffffff;
|
||||
if( absx - 0x00800000U >= 0x7f800000U - 0x00800000U)
|
||||
{
|
||||
switch( absx )
|
||||
{
|
||||
case 0:
|
||||
return FP_ILOGB0;
|
||||
case 0x7f800000U:
|
||||
return INT_MAX;
|
||||
default:
|
||||
if( absx > 0x7f800000 )
|
||||
return FP_ILOGBNAN;
|
||||
|
||||
// subnormal
|
||||
u.u = absx | 0x3f800000U;
|
||||
u.f -= 1.0f;
|
||||
return (u.u >> 23) - (127 + 126);
|
||||
}
|
||||
}
|
||||
|
||||
return (absx >> 23) - 127;
|
||||
}
|
||||
|
||||
int ilogbl (long double x)
|
||||
{
|
||||
union
|
||||
{
|
||||
long double f;
|
||||
struct{ cl_ulong m; cl_ushort sexp; }u;
|
||||
} u;
|
||||
u.f = x;
|
||||
|
||||
int exp = u.u.sexp & 0x7fff;
|
||||
if( 0 == exp )
|
||||
{
|
||||
if( 0 == u.u.m )
|
||||
return FP_ILOGB0;
|
||||
|
||||
//subnormal
|
||||
u.u.sexp = 0x3fff;
|
||||
u.f -= 1.0f;
|
||||
exp = u.u.sexp & 0x7fff;
|
||||
|
||||
return exp - (0x3fff + 0x3ffe);
|
||||
}
|
||||
else if( 0x7fff == exp )
|
||||
{
|
||||
if( u.u.m & CL_LONG_MAX )
|
||||
return FP_ILOGBNAN;
|
||||
|
||||
return INT_MAX;
|
||||
}
|
||||
|
||||
return exp - 0x3fff;
|
||||
}
|
||||
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// fmax, fmin, fmaxf, fminf
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////
|
||||
|
||||
static void GET_BITS_SP32(float fx, unsigned int* ux)
|
||||
{
|
||||
volatile union {float f; unsigned int u;} _bitsy;
|
||||
_bitsy.f = (fx);
|
||||
*ux = _bitsy.u;
|
||||
}
|
||||
/* static void GET_BITS_SP32(float fx, unsigned int* ux) */
|
||||
/* { */
|
||||
/* volatile union {float f; unsigned int i;} _bitsy; */
|
||||
/* _bitsy.f = (fx); */
|
||||
/* *ux = _bitsy.i; */
|
||||
/* } */
|
||||
static void PUT_BITS_SP32(unsigned int ux, float* fx)
|
||||
{
|
||||
volatile union {float f; unsigned int u;} _bitsy;
|
||||
_bitsy.u = (ux);
|
||||
*fx = _bitsy.f;
|
||||
}
|
||||
/* static void PUT_BITS_SP32(unsigned int ux, float* fx) */
|
||||
/* { */
|
||||
/* volatile union {float f; unsigned int i;} _bitsy; */
|
||||
/* _bitsy.i = (ux); */
|
||||
/* *fx = _bitsy.f; */
|
||||
/* } */
|
||||
static void GET_BITS_DP64(double dx, unsigned __int64* lx)
|
||||
{
|
||||
volatile union {double d; unsigned __int64 l;} _bitsy;
|
||||
_bitsy.d = (dx);
|
||||
*lx = _bitsy.l;
|
||||
}
|
||||
static void PUT_BITS_DP64(unsigned __int64 lx, double* dx)
|
||||
{
|
||||
volatile union {double d; unsigned __int64 l;} _bitsy;
|
||||
_bitsy.l = (lx);
|
||||
*dx = _bitsy.d;
|
||||
}
|
||||
|
||||
#if 0
|
||||
int SIGNBIT_DP64(double x )
|
||||
{
|
||||
int hx;
|
||||
_GET_HIGH_WORD(hx,x);
|
||||
return((hx>>31));
|
||||
}
|
||||
#endif
|
||||
|
||||
/* fmax(x, y) returns the larger (more positive) of x and y.
|
||||
NaNs are treated as missing values: if one argument is NaN,
|
||||
the other argument is returned. If both arguments are NaN,
|
||||
the first argument is returned. */
|
||||
|
||||
/* This works so long as the compiler knows that (x != x) means
|
||||
that x is NaN; gcc does. */
|
||||
double fmax(double x, double y)
|
||||
{
|
||||
if( isnan(y) )
|
||||
return x;
|
||||
|
||||
return x >= y ? x : y;
|
||||
}
|
||||
|
||||
|
||||
/* fmin(x, y) returns the smaller (more negative) of x and y.
|
||||
NaNs are treated as missing values: if one argument is NaN,
|
||||
the other argument is returned. If both arguments are NaN,
|
||||
the first argument is returned. */
|
||||
|
||||
double fmin(double x, double y)
|
||||
{
|
||||
if( isnan(y) )
|
||||
return x;
|
||||
|
||||
return x <= y ? x : y;
|
||||
}
|
||||
|
||||
|
||||
float fmaxf( float x, float y )
|
||||
{
|
||||
if( isnan(y) )
|
||||
return x;
|
||||
|
||||
return x >= y ? x : y;
|
||||
}
|
||||
|
||||
/* fminf(x, y) returns the smaller (more negative) of x and y.
|
||||
NaNs are treated as missing values: if one argument is NaN,
|
||||
the other argument is returned. If both arguments are NaN,
|
||||
the first argument is returned. */
|
||||
|
||||
float fminf(float x, float y)
|
||||
{
|
||||
if( isnan(y) )
|
||||
return x;
|
||||
|
||||
return x <= y ? x : y;
|
||||
}
|
||||
|
||||
long double scalblnl(long double x, long n)
|
||||
{
|
||||
union
|
||||
{
|
||||
long double d;
|
||||
struct{ cl_ulong m; cl_ushort sexp;}u;
|
||||
}u;
|
||||
u.u.m = CL_LONG_MIN;
|
||||
|
||||
if( x == 0.0L || n < -2200)
|
||||
return copysignl( 0.0L, x );
|
||||
|
||||
if( n > 2200 )
|
||||
return INFINITY;
|
||||
|
||||
if( n < 0 )
|
||||
{
|
||||
u.u.sexp = 0x3fff - 1022;
|
||||
while( n <= -1022 )
|
||||
{
|
||||
x *= u.d;
|
||||
n += 1022;
|
||||
}
|
||||
u.u.sexp = 0x3fff + n;
|
||||
x *= u.d;
|
||||
return x;
|
||||
}
|
||||
|
||||
if( n > 0 )
|
||||
{
|
||||
u.u.sexp = 0x3fff + 1023;
|
||||
while( n >= 1023 )
|
||||
{
|
||||
x *= u.d;
|
||||
n -= 1023;
|
||||
}
|
||||
u.u.sexp = 0x3fff + n;
|
||||
x *= u.d;
|
||||
return x;
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// log2
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////
|
||||
const static cl_double log_e_base2 = 1.4426950408889634074;
|
||||
const static cl_double log_10_base2 = 3.3219280948873623478;
|
||||
|
||||
//double log10(double x);
|
||||
|
||||
double log2(double x)
|
||||
{
|
||||
return 1.44269504088896340735992468100189214 * log(x);
|
||||
}
|
||||
|
||||
long double log2l(long double x)
|
||||
{
|
||||
return 1.44269504088896340735992468100189214L * log(x);
|
||||
}
|
||||
|
||||
double trunc(double x)
|
||||
{
|
||||
double absx = fabs(x);
|
||||
|
||||
if( absx < 4503599627370496.0 /* 0x1.0p52f */ )
|
||||
{
|
||||
cl_long rounded = x;
|
||||
x = copysign( (double) rounded, x );
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
float truncf(float x)
|
||||
{
|
||||
float absx = fabsf(x);
|
||||
|
||||
if( absx < 8388608.0f /* 0x1.0p23f */ )
|
||||
{
|
||||
cl_int rounded = x;
|
||||
x = copysignf( (float) rounded, x );
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
long lround(double x)
|
||||
{
|
||||
double absx = fabs(x);
|
||||
|
||||
if( absx < 0.5 )
|
||||
return 0;
|
||||
|
||||
if( absx < 4503599627370496.0 /* 0x1.0p52 */)
|
||||
{
|
||||
absx += 0.5;
|
||||
cl_long rounded = absx;
|
||||
absx = rounded;
|
||||
x = copysign( absx, x );
|
||||
}
|
||||
|
||||
if( x >= (double) LONG_MAX )
|
||||
return LONG_MAX;
|
||||
|
||||
return (long) x;
|
||||
}
|
||||
|
||||
long lroundf(float x)
|
||||
{
|
||||
float absx = fabsf(x);
|
||||
|
||||
if( absx < 0.5f )
|
||||
return 0;
|
||||
|
||||
if( absx < 8388608.0f )
|
||||
{
|
||||
absx += 0.5f;
|
||||
cl_int rounded = absx;
|
||||
absx = rounded;
|
||||
x = copysignf( absx, x );
|
||||
}
|
||||
|
||||
if( x >= (float) LONG_MAX )
|
||||
return LONG_MAX;
|
||||
|
||||
return (long) x;
|
||||
}
|
||||
|
||||
double round(double x)
|
||||
{
|
||||
double absx = fabs(x);
|
||||
|
||||
if( absx < 0.5 )
|
||||
return copysign( 0.0, x);
|
||||
|
||||
if( absx < 4503599627370496.0 /* 0x1.0p52 */)
|
||||
{
|
||||
absx += 0.5;
|
||||
cl_long rounded = absx;
|
||||
absx = rounded;
|
||||
x = copysign( absx, x );
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
float roundf(float x)
|
||||
{
|
||||
float absx = fabsf(x);
|
||||
|
||||
if( absx < 0.5f )
|
||||
return copysignf( 0.0f, x);
|
||||
|
||||
if( absx < 8388608.0f )
|
||||
{
|
||||
absx += 0.5f;
|
||||
cl_int rounded = absx;
|
||||
absx = rounded;
|
||||
x = copysignf( absx, x );
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
long double roundl(long double x)
|
||||
{
|
||||
long double absx = fabsl(x);
|
||||
|
||||
if( absx < 0.5L )
|
||||
return copysignl( 0.0L, x);
|
||||
|
||||
if( absx < 9223372036854775808.0L /*0x1.0p63L*/ )
|
||||
{
|
||||
absx += 0.5L;
|
||||
cl_ulong rounded = absx;
|
||||
absx = rounded;
|
||||
x = copysignl( absx, x );
|
||||
}
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
float cbrtf( float x )
|
||||
{
|
||||
float z = pow( fabs((double) x), 1.0 / 3.0 );
|
||||
return copysignf( z, x );
|
||||
}
|
||||
|
||||
double cbrt( double x )
|
||||
{
|
||||
return copysign( pow( fabs( x ), 1.0 / 3.0 ), x );
|
||||
}
|
||||
|
||||
long int lrint (double x)
|
||||
{
|
||||
double absx = fabs(x);
|
||||
|
||||
if( x >= (double) LONG_MAX )
|
||||
return LONG_MAX;
|
||||
|
||||
if( absx < 4503599627370496.0 /* 0x1.0p52 */ )
|
||||
{
|
||||
double magic = copysign( 4503599627370496.0 /* 0x1.0p52 */, x );
|
||||
double rounded = x + magic;
|
||||
rounded -= magic;
|
||||
return (long int) rounded;
|
||||
}
|
||||
|
||||
return (long int) x;
|
||||
}
|
||||
|
||||
long int lrintf (float x)
|
||||
{
|
||||
float absx = fabsf(x);
|
||||
|
||||
if( x >= (float) LONG_MAX )
|
||||
return LONG_MAX;
|
||||
|
||||
if( absx < 8388608.0f /* 0x1.0p23f */ )
|
||||
{
|
||||
float magic = copysignf( 8388608.0f /* 0x1.0p23f */, x );
|
||||
float rounded = x + magic;
|
||||
rounded -= magic;
|
||||
return (long int) rounded;
|
||||
}
|
||||
|
||||
return (long int) x;
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// fenv functions
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////
|
||||
#if _MSC_VER < 1900
|
||||
int fetestexcept(int excepts)
|
||||
{
|
||||
unsigned int status = _statusfp();
|
||||
return excepts & (
|
||||
((status & _SW_INEXACT) ? FE_INEXACT : 0) |
|
||||
((status & _SW_UNDERFLOW) ? FE_UNDERFLOW : 0) |
|
||||
((status & _SW_OVERFLOW) ? FE_OVERFLOW : 0) |
|
||||
((status & _SW_ZERODIVIDE) ? FE_DIVBYZERO : 0) |
|
||||
((status & _SW_INVALID) ? FE_INVALID : 0)
|
||||
);
|
||||
}
|
||||
|
||||
int feclearexcept(int excepts)
|
||||
{
|
||||
_clearfp();
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // __INTEL_COMPILER
|
||||
|
||||
#if ! defined( __INTEL_COMPILER ) || __INTEL_COMPILER < 1300
|
||||
|
||||
float make_nan()
|
||||
{
|
||||
/* This is the IEEE 754 single-precision format:
|
||||
unsigned int mantissa: 22;
|
||||
unsigned int quiet_nan: 1;
|
||||
unsigned int exponent: 8;
|
||||
unsigned int negative: 1;
|
||||
*/
|
||||
//const static unsigned
|
||||
static const int32_t _nan = 0x7fc00000;
|
||||
return *(const float*)(&_nan);
|
||||
}
|
||||
|
||||
float nanf( const char* str)
|
||||
{
|
||||
cl_uint u = atoi( str );
|
||||
u |= 0x7fc00000U;
|
||||
return *( float*)(&u);
|
||||
}
|
||||
|
||||
|
||||
double nan( const char* str)
|
||||
{
|
||||
cl_ulong u = atoi( str );
|
||||
u |= 0x7ff8000000000000ULL;
|
||||
return *( double*)(&u);
|
||||
}
|
||||
|
||||
// double check this implementatation
|
||||
long double nanl( const char* str)
|
||||
{
|
||||
union
|
||||
{
|
||||
long double f;
|
||||
struct { cl_ulong m; cl_ushort sexp; }u;
|
||||
}u;
|
||||
u.u.sexp = 0x7fff;
|
||||
u.u.m = 0x8000000000000000ULL | atoi( str );
|
||||
|
||||
return u.f;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// misc functions
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////
|
||||
|
||||
/*
|
||||
// This function is commented out because the Windows implementation should never call munmap.
|
||||
// If it is calling it, we have a bug. Please file a bugzilla.
|
||||
int munmap(void *addr, size_t len)
|
||||
{
|
||||
// FIXME: this is not correct. munmap is like free() http://www.opengroup.org/onlinepubs/7990989775/xsh/munmap.html
|
||||
|
||||
return (int)VirtualAlloc( (LPVOID)addr, len,
|
||||
MEM_COMMIT|MEM_RESERVE, PAGE_NOACCESS );
|
||||
}
|
||||
*/
|
||||
|
||||
uint64_t ReadTime( void )
|
||||
{
|
||||
LARGE_INTEGER current;
|
||||
QueryPerformanceCounter(¤t);
|
||||
return (uint64_t)current.QuadPart;
|
||||
}
|
||||
|
||||
double SubtractTime( uint64_t endTime, uint64_t startTime )
|
||||
{
|
||||
static double PerformanceFrequency = 0.0;
|
||||
|
||||
if (PerformanceFrequency == 0.0) {
|
||||
LARGE_INTEGER frequency;
|
||||
QueryPerformanceFrequency(&frequency);
|
||||
PerformanceFrequency = (double) frequency.QuadPart;
|
||||
}
|
||||
|
||||
return (double)(endTime - startTime) / PerformanceFrequency * 1e9;
|
||||
}
|
||||
|
||||
int cf_signbit(double x)
|
||||
{
|
||||
union
|
||||
{
|
||||
double f;
|
||||
cl_ulong u;
|
||||
}u;
|
||||
u.f = x;
|
||||
return u.u >> 63;
|
||||
}
|
||||
|
||||
int cf_signbitf(float x)
|
||||
{
|
||||
union
|
||||
{
|
||||
float f;
|
||||
cl_uint u;
|
||||
}u;
|
||||
u.f = x;
|
||||
return u.u >> 31;
|
||||
}
|
||||
|
||||
float int2float (int32_t ix)
|
||||
{
|
||||
union {
|
||||
float f;
|
||||
int32_t i;
|
||||
} u;
|
||||
u.i = ix;
|
||||
return u.f;
|
||||
}
|
||||
|
||||
int32_t float2int (float fx)
|
||||
{
|
||||
union {
|
||||
float f;
|
||||
int32_t i;
|
||||
} u;
|
||||
u.f = fx;
|
||||
return u.i;
|
||||
}
|
||||
|
||||
#if !defined(_WIN64)
|
||||
/** Returns the number of leading 0-bits in x,
|
||||
starting at the most significant bit position.
|
||||
If x is 0, the result is undefined.
|
||||
*/
|
||||
int __builtin_clz(unsigned int pattern)
|
||||
{
|
||||
#if 0
|
||||
int res;
|
||||
__asm {
|
||||
mov eax, pattern
|
||||
bsr eax, eax
|
||||
mov res, eax
|
||||
}
|
||||
return 31 - res;
|
||||
#endif
|
||||
unsigned long index;
|
||||
unsigned char res = _BitScanReverse( &index, pattern);
|
||||
if (res) {
|
||||
return 8*sizeof(int) - 1 - index;
|
||||
} else {
|
||||
return 8*sizeof(int);
|
||||
}
|
||||
}
|
||||
#else
|
||||
int __builtin_clz(unsigned int pattern)
|
||||
{
|
||||
int count;
|
||||
if (pattern == 0u) {
|
||||
return 32;
|
||||
}
|
||||
count = 31;
|
||||
if (pattern >= 1u<<16) { pattern >>= 16; count -= 16; }
|
||||
if (pattern >= 1u<<8) { pattern >>= 8; count -= 8; }
|
||||
if (pattern >= 1u<<4) { pattern >>= 4; count -= 4; }
|
||||
if (pattern >= 1u<<2) { pattern >>= 2; count -= 2; }
|
||||
if (pattern >= 1u<<1) { count -= 1; }
|
||||
return count;
|
||||
}
|
||||
|
||||
#endif // !defined(_WIN64)
|
||||
|
||||
#include <intrin.h>
|
||||
#include <emmintrin.h>
|
||||
|
||||
int usleep(int usec)
|
||||
{
|
||||
Sleep((usec + 999) / 1000);
|
||||
return 0;
|
||||
}
|
||||
|
||||
unsigned int sleep( unsigned int sec )
|
||||
{
|
||||
Sleep( sec * 1000 );
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // defined( _MSC_VER )
|
||||
@@ -1,274 +0,0 @@
|
||||
/*
|
||||
A C-program for MT19937, with initialization improved 2002/1/26.
|
||||
Coded by Takuji Nishimura and Makoto Matsumoto.
|
||||
|
||||
Before using, initialize the state by using init_genrand(seed)
|
||||
or init_by_array(init_key, key_length).
|
||||
|
||||
Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
3. The names of its contributors may not be used to endorse or promote
|
||||
products derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
Any feedback is very welcome.
|
||||
http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
|
||||
email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
|
||||
|
||||
Modifications for use in OpenCL by Ian Ollmann, Apple Inc.
|
||||
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "mt19937.h"
|
||||
#include "mingw_compat.h"
|
||||
|
||||
#ifdef __SSE2__
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
static void * align_malloc(size_t size, size_t alignment)
|
||||
{
|
||||
#if defined(_WIN32) && defined(_MSC_VER)
|
||||
return _aligned_malloc(size, alignment);
|
||||
#elif defined(__linux__) || defined (linux) || defined(__APPLE__)
|
||||
void * ptr = NULL;
|
||||
if (0 == posix_memalign(&ptr, alignment, size))
|
||||
return ptr;
|
||||
return NULL;
|
||||
#elif defined(__MINGW32__)
|
||||
return __mingw_aligned_malloc(size, alignment);
|
||||
#else
|
||||
#error "Please add support OS for aligned malloc"
|
||||
#endif
|
||||
}
|
||||
|
||||
static void align_free(void * ptr)
|
||||
{
|
||||
#if defined(_WIN32) && defined(_MSC_VER)
|
||||
_aligned_free(ptr);
|
||||
#elif defined(__linux__) || defined (linux) || defined(__APPLE__)
|
||||
return free(ptr);
|
||||
#elif defined(__MINGW32__)
|
||||
return __mingw_aligned_free(ptr);
|
||||
#else
|
||||
#error "Please add support OS for aligned free"
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/* Period parameters */
|
||||
#define N 624 /* vector code requires multiple of 4 here */
|
||||
#define M 397
|
||||
#define MATRIX_A (cl_uint) 0x9908b0dfUL /* constant vector a */
|
||||
#define UPPER_MASK (cl_uint) 0x80000000UL /* most significant w-r bits */
|
||||
#define LOWER_MASK (cl_uint) 0x7fffffffUL /* least significant r bits */
|
||||
|
||||
typedef struct _MTdata
|
||||
{
|
||||
cl_uint mt[N];
|
||||
#ifdef __SSE2__
|
||||
cl_uint cache[N];
|
||||
#endif
|
||||
cl_int mti;
|
||||
}_MTdata;
|
||||
|
||||
/* initializes mt[N] with a seed */
|
||||
MTdata init_genrand(cl_uint s)
|
||||
{
|
||||
MTdata r = (MTdata) align_malloc( sizeof( _MTdata ), 16 );
|
||||
if( NULL != r )
|
||||
{
|
||||
cl_uint *mt = r->mt;
|
||||
int mti = 0;
|
||||
mt[0]= s; // & 0xffffffffUL;
|
||||
for (mti=1; mti<N; mti++) {
|
||||
mt[mti] = (cl_uint)
|
||||
(1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti);
|
||||
/* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
|
||||
/* In the previous versions, MSBs of the seed affect */
|
||||
/* only MSBs of the array mt[]. */
|
||||
/* 2002/01/09 modified by Makoto Matsumoto */
|
||||
// mt[mti] &= 0xffffffffUL;
|
||||
/* for >32 bit machines */
|
||||
}
|
||||
r->mti = mti;
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
void free_mtdata( MTdata d )
|
||||
{
|
||||
if(d)
|
||||
align_free(d);
|
||||
}
|
||||
|
||||
/* generates a random number on [0,0xffffffff]-interval */
|
||||
cl_uint genrand_int32( MTdata d)
|
||||
{
|
||||
/* mag01[x] = x * MATRIX_A for x=0,1 */
|
||||
static const cl_uint mag01[2]={0x0UL, MATRIX_A};
|
||||
#ifdef __SSE2__
|
||||
static volatile int init = 0;
|
||||
static union{ __m128i v; cl_uint s[4]; } upper_mask, lower_mask, one, matrix_a, c0, c1;
|
||||
#endif
|
||||
|
||||
|
||||
cl_uint *mt = d->mt;
|
||||
cl_uint y;
|
||||
|
||||
if (d->mti == N)
|
||||
{ /* generate N words at one time */
|
||||
int kk;
|
||||
|
||||
#ifdef __SSE2__
|
||||
if( 0 == init )
|
||||
{
|
||||
upper_mask.s[0] = upper_mask.s[1] = upper_mask.s[2] = upper_mask.s[3] = UPPER_MASK;
|
||||
lower_mask.s[0] = lower_mask.s[1] = lower_mask.s[2] = lower_mask.s[3] = LOWER_MASK;
|
||||
one.s[0] = one.s[1] = one.s[2] = one.s[3] = 1;
|
||||
matrix_a.s[0] = matrix_a.s[1] = matrix_a.s[2] = matrix_a.s[3] = MATRIX_A;
|
||||
c0.s[0] = c0.s[1] = c0.s[2] = c0.s[3] = (cl_uint) 0x9d2c5680UL;
|
||||
c1.s[0] = c1.s[1] = c1.s[2] = c1.s[3] = (cl_uint) 0xefc60000UL;
|
||||
init = 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
kk = 0;
|
||||
#ifdef __SSE2__
|
||||
// vector loop
|
||||
for( ; kk + 4 <= N-M; kk += 4 )
|
||||
{
|
||||
__m128i vy = _mm_or_si128( _mm_and_si128( _mm_load_si128( (__m128i*)(mt + kk) ), upper_mask.v ),
|
||||
_mm_and_si128( _mm_loadu_si128( (__m128i*)(mt + kk + 1) ), lower_mask.v )); // ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK))
|
||||
|
||||
__m128i mask = _mm_cmpeq_epi32( _mm_and_si128( vy, one.v), one.v ); // y & 1 ? -1 : 0
|
||||
__m128i vmag01 = _mm_and_si128( mask, matrix_a.v ); // y & 1 ? MATRIX_A, 0 = mag01[y & (cl_uint) 0x1UL]
|
||||
__m128i vr = _mm_xor_si128( _mm_loadu_si128( (__m128i*)(mt + kk + M)), (__m128i) _mm_srli_epi32( vy, 1 ) ); // mt[kk+M] ^ (y >> 1)
|
||||
vr = _mm_xor_si128( vr, vmag01 ); // mt[kk+M] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL]
|
||||
_mm_store_si128( (__m128i*) (mt + kk ), vr );
|
||||
}
|
||||
#endif
|
||||
for ( ;kk<N-M;kk++) {
|
||||
y = (cl_uint) ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK));
|
||||
mt[kk] = mt[kk+M] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL];
|
||||
}
|
||||
|
||||
#ifdef __SSE2__
|
||||
// advance to next aligned location
|
||||
for (;kk<N-1 && (kk & 3);kk++) {
|
||||
y = (cl_uint) ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK));
|
||||
mt[kk] = mt[kk+(M-N)] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL];
|
||||
}
|
||||
|
||||
// vector loop
|
||||
for( ; kk + 4 <= N-1; kk += 4 )
|
||||
{
|
||||
__m128i vy = _mm_or_si128( _mm_and_si128( _mm_load_si128( (__m128i*)(mt + kk) ), upper_mask.v ),
|
||||
_mm_and_si128( _mm_loadu_si128( (__m128i*)(mt + kk + 1) ), lower_mask.v )); // ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK))
|
||||
|
||||
__m128i mask = _mm_cmpeq_epi32( _mm_and_si128( vy, one.v), one.v ); // y & 1 ? -1 : 0
|
||||
__m128i vmag01 = _mm_and_si128( mask, matrix_a.v ); // y & 1 ? MATRIX_A, 0 = mag01[y & (cl_uint) 0x1UL]
|
||||
__m128i vr = _mm_xor_si128( _mm_loadu_si128( (__m128i*)(mt + kk + M - N)), _mm_srli_epi32( vy, 1 ) ); // mt[kk+M-N] ^ (y >> 1)
|
||||
vr = _mm_xor_si128( vr, vmag01 ); // mt[kk+M] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL]
|
||||
_mm_store_si128( (__m128i*) (mt + kk ), vr );
|
||||
}
|
||||
#endif
|
||||
|
||||
for (;kk<N-1;kk++) {
|
||||
y = (cl_uint) ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK));
|
||||
mt[kk] = mt[kk+(M-N)] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL];
|
||||
}
|
||||
y = (cl_uint)((mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK));
|
||||
mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL];
|
||||
|
||||
#ifdef __SSE2__
|
||||
// Do the tempering ahead of time in vector code
|
||||
for( kk = 0; kk + 4 <= N; kk += 4 )
|
||||
{
|
||||
__m128i vy = _mm_load_si128( (__m128i*)(mt + kk ) ); // y = mt[k];
|
||||
vy = _mm_xor_si128( vy, _mm_srli_epi32( vy, 11 ) ); // y ^= (y >> 11);
|
||||
vy = _mm_xor_si128( vy, _mm_and_si128( _mm_slli_epi32( vy, 7 ), c0.v) ); // y ^= (y << 7) & (cl_uint) 0x9d2c5680UL;
|
||||
vy = _mm_xor_si128( vy, _mm_and_si128( _mm_slli_epi32( vy, 15 ), c1.v) ); // y ^= (y << 15) & (cl_uint) 0xefc60000UL;
|
||||
vy = _mm_xor_si128( vy, _mm_srli_epi32( vy, 18 ) ); // y ^= (y >> 18);
|
||||
_mm_store_si128( (__m128i*)(d->cache+kk), vy );
|
||||
}
|
||||
#endif
|
||||
|
||||
d->mti = 0;
|
||||
}
|
||||
#ifdef __SSE2__
|
||||
y = d->cache[d->mti++];
|
||||
#else
|
||||
y = mt[d->mti++];
|
||||
|
||||
/* Tempering */
|
||||
y ^= (y >> 11);
|
||||
y ^= (y << 7) & (cl_uint) 0x9d2c5680UL;
|
||||
y ^= (y << 15) & (cl_uint) 0xefc60000UL;
|
||||
y ^= (y >> 18);
|
||||
#endif
|
||||
|
||||
|
||||
return y;
|
||||
}
|
||||
|
||||
cl_ulong genrand_int64( MTdata d)
|
||||
{
|
||||
return ((cl_ulong) genrand_int32(d) << 32) | (cl_uint) genrand_int32(d);
|
||||
}
|
||||
|
||||
/* generates a random number on [0,1]-real-interval */
|
||||
double genrand_real1(MTdata d)
|
||||
{
|
||||
return genrand_int32(d)*(1.0/4294967295.0);
|
||||
/* divided by 2^32-1 */
|
||||
}
|
||||
|
||||
/* generates a random number on [0,1)-real-interval */
|
||||
double genrand_real2(MTdata d)
|
||||
{
|
||||
return genrand_int32(d)*(1.0/4294967296.0);
|
||||
/* divided by 2^32 */
|
||||
}
|
||||
|
||||
/* generates a random number on (0,1)-real-interval */
|
||||
double genrand_real3(MTdata d)
|
||||
{
|
||||
return (((double)genrand_int32(d)) + 0.5)*(1.0/4294967296.0);
|
||||
/* divided by 2^32 */
|
||||
}
|
||||
|
||||
/* generates a random number on [0,1) with 53-bit resolution*/
|
||||
double genrand_res53(MTdata d)
|
||||
{
|
||||
unsigned long a=genrand_int32(d)>>5, b=genrand_int32(d)>>6;
|
||||
return(a*67108864.0+b)*(1.0/9007199254740992.0);
|
||||
}
|
||||
@@ -1,175 +0,0 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#include "rounding_mode.h"
|
||||
|
||||
#if !(defined(_WIN32) && defined(_MSC_VER))
|
||||
RoundingMode set_round( RoundingMode r, Type outType )
|
||||
{
|
||||
static const int flt_rounds[ kRoundingModeCount ] = { FE_TONEAREST, FE_TONEAREST, FE_UPWARD, FE_DOWNWARD, FE_TOWARDZERO };
|
||||
static const int int_rounds[ kRoundingModeCount ] = { FE_TOWARDZERO, FE_TONEAREST, FE_UPWARD, FE_DOWNWARD, FE_TOWARDZERO };
|
||||
const int *p = int_rounds;
|
||||
if( outType == kfloat || outType == kdouble )
|
||||
p = flt_rounds;
|
||||
int oldRound = fegetround();
|
||||
fesetround( p[r] );
|
||||
|
||||
switch( oldRound )
|
||||
{
|
||||
case FE_TONEAREST:
|
||||
return kRoundToNearestEven;
|
||||
case FE_UPWARD:
|
||||
return kRoundUp;
|
||||
case FE_DOWNWARD:
|
||||
return kRoundDown;
|
||||
case FE_TOWARDZERO:
|
||||
return kRoundTowardZero;
|
||||
default:
|
||||
abort(); // ??!
|
||||
}
|
||||
return kDefaultRoundingMode; //never happens
|
||||
}
|
||||
|
||||
RoundingMode get_round( void )
|
||||
{
|
||||
int oldRound = fegetround();
|
||||
|
||||
switch( oldRound )
|
||||
{
|
||||
case FE_TONEAREST:
|
||||
return kRoundToNearestEven;
|
||||
case FE_UPWARD:
|
||||
return kRoundUp;
|
||||
case FE_DOWNWARD:
|
||||
return kRoundDown;
|
||||
case FE_TOWARDZERO:
|
||||
return kRoundTowardZero;
|
||||
}
|
||||
|
||||
return kDefaultRoundingMode;
|
||||
}
|
||||
|
||||
#else
|
||||
RoundingMode set_round( RoundingMode r, Type outType )
|
||||
{
|
||||
static const int flt_rounds[ kRoundingModeCount ] = { _RC_NEAR, _RC_NEAR, _RC_UP, _RC_DOWN, _RC_CHOP };
|
||||
static const int int_rounds[ kRoundingModeCount ] = { _RC_CHOP, _RC_NEAR, _RC_UP, _RC_DOWN, _RC_CHOP };
|
||||
const int *p = ( outType == kfloat || outType == kdouble )? flt_rounds : int_rounds;
|
||||
unsigned int oldRound;
|
||||
|
||||
int err = _controlfp_s(&oldRound, 0, 0); //get rounding mode into oldRound
|
||||
if (err) {
|
||||
vlog_error("\t\tERROR: -- cannot get rounding mode in %s:%d\n", __FILE__, __LINE__);
|
||||
return kDefaultRoundingMode; //what else never happens
|
||||
}
|
||||
|
||||
oldRound &= _MCW_RC;
|
||||
|
||||
RoundingMode old =
|
||||
(oldRound == _RC_NEAR)? kRoundToNearestEven :
|
||||
(oldRound == _RC_UP)? kRoundUp :
|
||||
(oldRound == _RC_DOWN)? kRoundDown :
|
||||
(oldRound == _RC_CHOP)? kRoundTowardZero:
|
||||
kDefaultRoundingMode;
|
||||
|
||||
_controlfp_s(&oldRound, p[r], _MCW_RC); //setting new rounding mode
|
||||
return old; //returning old rounding mode
|
||||
}
|
||||
|
||||
RoundingMode get_round( void )
|
||||
{
|
||||
unsigned int oldRound;
|
||||
|
||||
int err = _controlfp_s(&oldRound, 0, 0); //get rounding mode into oldRound
|
||||
oldRound &= _MCW_RC;
|
||||
return
|
||||
(oldRound == _RC_NEAR)? kRoundToNearestEven :
|
||||
(oldRound == _RC_UP)? kRoundUp :
|
||||
(oldRound == _RC_DOWN)? kRoundDown :
|
||||
(oldRound == _RC_CHOP)? kRoundTowardZero:
|
||||
kDefaultRoundingMode;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
// FlushToZero() sets the host processor into ftz mode. It is intended to have a remote effect on the behavior of the code in
|
||||
// basic_test_conversions.c. Some host processors may not support this mode, which case you'll need to do some clamping in
|
||||
// software by testing against FLT_MIN or DBL_MIN in that file.
|
||||
//
|
||||
// Note: IEEE-754 says conversions are basic operations. As such they do *NOT* have the behavior in section 7.5.3 of
|
||||
// the OpenCL spec. They *ALWAYS* flush to zero for subnormal inputs or outputs when FTZ mode is on like other basic
|
||||
// operators do (e.g. add, subtract, multiply, divide, etc.)
|
||||
//
|
||||
// Configuring hardware to FTZ mode varies by platform.
|
||||
// CAUTION: Some C implementations may also fail to behave properly in this mode.
|
||||
//
|
||||
// On PowerPC, it is done by setting the FPSCR into non-IEEE mode.
|
||||
// On Intel, you can do this by turning on the FZ and DAZ bits in the MXCSR -- provided that SSE/SSE2
|
||||
// is used for floating point computation! If your OS uses x87, you'll need to figure out how
|
||||
// to turn that off for the conversions code in basic_test_conversions.c so that they flush to
|
||||
// zero properly. Otherwise, you'll need to add appropriate software clamping to basic_test_conversions.c
|
||||
// in which case, these function are at liberty to do nothing.
|
||||
//
|
||||
#if defined( __i386__ ) || defined( __x86_64__ ) || defined (_WIN32)
|
||||
#include <xmmintrin.h>
|
||||
#elif defined( __PPC__ )
|
||||
#include <fpu_control.h>
|
||||
#endif
|
||||
void *FlushToZero( void )
|
||||
{
|
||||
#if defined( __APPLE__ ) || defined(__linux__) || defined (_WIN32)
|
||||
#if defined( __i386__ ) || defined( __x86_64__ ) || defined(_MSC_VER)
|
||||
union{ int i; void *p; }u = { _mm_getcsr() };
|
||||
_mm_setcsr( u.i | 0x8040 );
|
||||
return u.p;
|
||||
#elif defined( __arm__ ) || defined(__aarch64__)
|
||||
// processor is already in FTZ mode -- do nothing
|
||||
return NULL;
|
||||
#elif defined( __PPC__ )
|
||||
fpu_control_t flags = 0;
|
||||
_FPU_GETCW(flags);
|
||||
flags |= _FPU_MASK_NI;
|
||||
_FPU_SETCW(flags);
|
||||
return NULL;
|
||||
#else
|
||||
#error Unknown arch
|
||||
#endif
|
||||
#else
|
||||
#error Please configure FlushToZero and UnFlushToZero to behave properly on this operating system.
|
||||
#endif
|
||||
}
|
||||
|
||||
// Undo the effects of FlushToZero above, restoring the host to default behavior, using the information passed in p.
|
||||
void UnFlushToZero( void *p)
|
||||
{
|
||||
#if defined( __APPLE__ ) || defined(__linux__) || defined (_WIN32)
|
||||
#if defined( __i386__ ) || defined( __x86_64__ ) || defined(_MSC_VER)
|
||||
union{ void *p; int i; }u = { p };
|
||||
_mm_setcsr( u.i );
|
||||
#elif defined( __arm__ ) || defined(__aarch64__)
|
||||
// processor is already in FTZ mode -- do nothing
|
||||
#elif defined( __PPC__)
|
||||
fpu_control_t flags = 0;
|
||||
_FPU_GETCW(flags);
|
||||
flags &= ~_FPU_MASK_NI;
|
||||
_FPU_SETCW(flags);
|
||||
#else
|
||||
#error Unknown arch
|
||||
#endif
|
||||
#else
|
||||
#error Please configure FlushToZero and UnFlushToZero to behave properly on this operating system.
|
||||
#endif
|
||||
}
|
||||
@@ -1,71 +0,0 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#ifndef __ROUNDING_MODE_H__
|
||||
#define __ROUNDING_MODE_H__
|
||||
|
||||
#include "compat.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#if (defined(_WIN32) && defined (_MSC_VER))
|
||||
#include "errorHelpers.h"
|
||||
#include "testHarness.h"
|
||||
#endif
|
||||
|
||||
typedef enum
|
||||
{
|
||||
kDefaultRoundingMode = 0,
|
||||
kRoundToNearestEven,
|
||||
kRoundUp,
|
||||
kRoundDown,
|
||||
kRoundTowardZero,
|
||||
|
||||
kRoundingModeCount
|
||||
}RoundingMode;
|
||||
|
||||
typedef enum
|
||||
{
|
||||
kuchar = 0,
|
||||
kchar = 1,
|
||||
kushort = 2,
|
||||
kshort = 3,
|
||||
kuint = 4,
|
||||
kint = 5,
|
||||
kfloat = 6,
|
||||
kdouble = 7,
|
||||
kulong = 8,
|
||||
klong = 9,
|
||||
|
||||
//This goes last
|
||||
kTypeCount
|
||||
}Type;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
extern RoundingMode set_round( RoundingMode r, Type outType );
|
||||
extern RoundingMode get_round( void );
|
||||
extern void *FlushToZero( void );
|
||||
extern void UnFlushToZero( void *p);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#endif /* __ROUNDING_MODE_H__ */
|
||||
@@ -1,106 +0,0 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#include "threadTesting.h"
|
||||
#include "errorHelpers.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#if !defined(_WIN32)
|
||||
#include <stdbool.h>
|
||||
#endif
|
||||
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
|
||||
#if !defined(_WIN32)
|
||||
#include <pthread.h>
|
||||
#endif
|
||||
|
||||
#if 0 // Disabed for now
|
||||
|
||||
typedef struct
|
||||
{
|
||||
basefn mFunction;
|
||||
cl_device_id mDevice;
|
||||
cl_context mContext;
|
||||
int mNumElements;
|
||||
} TestFnArgs;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Thread-based testing. Spawns a new thread to run the given test function,
|
||||
// then waits for it to complete. The entire idea is that, if the thread crashes,
|
||||
// we can catch it and report it as a failure instead of crashing the entire suite
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void *test_thread_wrapper( void *data )
|
||||
{
|
||||
TestFnArgs *args;
|
||||
int retVal;
|
||||
cl_context context;
|
||||
|
||||
args = (TestFnArgs *)data;
|
||||
|
||||
/* Create a new context to use (contexts can't cross threads) */
|
||||
context = clCreateContext(NULL, args->mDeviceGroup);
|
||||
if( context == NULL )
|
||||
{
|
||||
log_error("clCreateContext failed for new thread\n");
|
||||
return (void *)(-1);
|
||||
}
|
||||
|
||||
/* Call function */
|
||||
retVal = args->mFunction( args->mDeviceGroup, args->mDevice, context, args->mNumElements );
|
||||
|
||||
clReleaseContext( context );
|
||||
|
||||
return (void *)retVal;
|
||||
}
|
||||
|
||||
int test_threaded_function( basefn fnToTest, cl_device_id device, cl_context context, cl_command_queue queue, int numElements )
|
||||
{
|
||||
int error;
|
||||
pthread_t threadHdl;
|
||||
void *retVal;
|
||||
TestFnArgs args;
|
||||
|
||||
|
||||
args.mFunction = fnToTest;
|
||||
args.mDeviceGroup = deviceGroup;
|
||||
args.mDevice = device;
|
||||
args.mContext = context;
|
||||
args.mNumElements = numElements;
|
||||
|
||||
|
||||
error = pthread_create( &threadHdl, NULL, test_thread_wrapper, (void *)&args );
|
||||
if( error != 0 )
|
||||
{
|
||||
log_error( "ERROR: Unable to create thread for testing!\n" );
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Thread has been started, now just wait for it to complete (or crash) */
|
||||
error = pthread_join( threadHdl, &retVal );
|
||||
if( error != 0 )
|
||||
{
|
||||
log_error( "ERROR: Unable to join testing thread!\n" );
|
||||
return -1;
|
||||
}
|
||||
|
||||
return (int)((intptr_t)retVal);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -22,13 +22,13 @@ set(${MODULE_NAME}_SOURCES
|
||||
test_kernel_arg_info.c
|
||||
test_queue_properties.cpp
|
||||
../../test_common/harness/errorHelpers.c
|
||||
../../test_common/harness/threadTesting.c
|
||||
../../../../test_common/harness/threadTesting.c
|
||||
../../test_common/harness/testHarness.c
|
||||
../../test_common/harness/kernelHelpers.c
|
||||
../../../../test_common/harness/typeWrappers.cpp
|
||||
../../../../test_common/harness/conversions.c
|
||||
../../test_common/harness/mt19937.c
|
||||
../../test_common/harness/msvc9.c
|
||||
../../../../test_common/harness/mt19937.c
|
||||
../../../../test_common/harness/msvc9.c
|
||||
../../test_common/harness/imageHelpers.cpp
|
||||
)
|
||||
|
||||
|
||||
@@ -51,15 +51,15 @@ set(${MODULE_NAME}_SOURCES
|
||||
test_kernel_call_kernel_function.cpp
|
||||
test_local_kernel_scope.cpp
|
||||
../../test_common/harness/errorHelpers.c
|
||||
../../test_common/harness/threadTesting.c
|
||||
../../../../test_common/harness/threadTesting.c
|
||||
../../test_common/harness/testHarness.c
|
||||
../../test_common/harness/kernelHelpers.c
|
||||
../../../../test_common/harness/typeWrappers.cpp
|
||||
../../test_common/harness/imageHelpers.cpp
|
||||
../../test_common/harness/mt19937.c
|
||||
../../../../test_common/harness/mt19937.c
|
||||
../../../../test_common/harness/conversions.c
|
||||
../../test_common/harness/rounding_mode.c
|
||||
../../test_common/harness/msvc9.c
|
||||
../../../../test_common/harness/rounding_mode.c
|
||||
../../../../test_common/harness/msvc9.c
|
||||
)
|
||||
|
||||
include(../../../CMakeCommon.txt)
|
||||
|
||||
Reference in New Issue
Block a user