diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml index 2debc6de..bfeb322b 100644 --- a/.github/workflows/presubmit.yml +++ b/.github/workflows/presubmit.yml @@ -10,7 +10,7 @@ jobs: matrix: build-type: [Release] gl: [0] - os: [ubuntu-22.04, macos-latest, windows-latest] + os: [ubuntu-22.04, macos-latest, windows-latest, windows-11-arm] include: - os: ubuntu-22.04 gl: 1 diff --git a/test_common/harness/ThreadPool.cpp b/test_common/harness/ThreadPool.cpp index fb1291d6..fab778c8 100644 --- a/test_common/harness/ThreadPool.cpp +++ b/test_common/harness/ThreadPool.cpp @@ -436,7 +436,14 @@ void *ThreadPool_WorkerFunc(void *p) // drop run count to 0 gRunCount = 0; +#if defined(_M_IX86) || defined(_M_X64) _mm_mfence(); +#elif defined(_M_ARM64) + __dmb(_ARM64_BARRIER_ISH); /* full barrier, like _mm_mfence */ +#else +#error Architecture needs an implementation +#endif + #else if (pthread_mutex_lock(&gAtomicLock)) log_error( @@ -703,7 +710,13 @@ void ThreadPool_Exit(void) // http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins __sync_synchronize(); #elif defined(_MSC_VER) +#if defined(_M_IX86) || defined(_M_X64) _mm_mfence(); +#elif defined(_M_ARM64) + __dmb(_ARM64_BARRIER_ISH); /* full barrier, like _mm_mfence */ +#else +#error Architecture needs an implementation +#endif #else #warning If this is a weakly ordered memory system, please add a memory barrier here to force this and everything else to memory before we proceed #endif diff --git a/test_common/harness/conversions.cpp b/test_common/harness/conversions.cpp index 18c2869d..e0e326ff 100644 --- a/test_common/harness/conversions.cpp +++ b/test_common/harness/conversions.cpp @@ -23,10 +23,10 @@ #include -#if defined(__SSE__) || defined(_MSC_VER) +#if defined(__SSE__) || _M_IX86_FP >= 1 || defined(_M_X64) #include #endif -#if defined(__SSE2__) || defined(_MSC_VER) +#if defined(__SSE2__) || _M_IX86_FP == 2 || defined(_M_X64) #include #endif @@ -110,7 +110,7 @@ static long lrintf_clamped(float f) volatile float x = f; float magicVal = magic[f < 0]; -#if
defined(__SSE__) || defined(_WIN32) +#if defined(__SSE__) || _M_IX86_FP >= 1 || defined(_M_X64) // Defeat x87 based arithmetic, which cant do FTZ, and will round this // incorrectly __m128 v = _mm_set_ss(x); @@ -150,7 +150,7 @@ static long lrint_clamped(double f) { volatile double x = f; double magicVal = magic[f < 0]; -#if defined(__SSE2__) || (defined(_MSC_VER)) +#if defined(__SSE2__) || _M_IX86_FP == 2 || defined(_M_X64) // Defeat x87 based arithmetic, which cant do FTZ, and will round this // incorrectly __m128d v = _mm_set_sd(x); diff --git a/test_common/harness/fpcontrol.h b/test_common/harness/fpcontrol.h index 12aba0a9..afb0f5a3 100644 --- a/test_common/harness/fpcontrol.h +++ b/test_common/harness/fpcontrol.h @@ -37,36 +37,44 @@ typedef int FPU_mode_type; #else typedef int64_t FPU_mode_type; #endif -#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) \ - || defined(__MINGW32__) +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) \ + || defined(_M_X64) || defined(__MINGW32__) #include +#elif defined(_M_ARM64) +#include #elif defined(__PPC__) #include extern __thread fpu_control_t fpu_control; #elif defined(__mips__) #include "mips/m32c1.h" #endif + // Set the reference hardware floating point unit to FTZ mode -inline void ForceFTZ(FPU_mode_type *mode) +inline void ForceFTZ(FPU_mode_type *oldMode) { -#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) \ - || defined(__MINGW32__) - *mode = _mm_getcsr(); - _mm_setcsr(*mode | 0x8040); +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) \ + || defined(_M_X64) || defined(__MINGW32__) + *oldMode = _mm_getcsr(); + _mm_setcsr(*oldMode | 0x8040); #elif defined(__PPC__) - *mode = fpu_control; + *oldMode = fpu_control; fpu_control |= _FPU_MASK_NI; #elif defined(__arm__) unsigned fpscr; __asm__ volatile("fmrx %0, fpscr" : "=r"(fpscr)); - *mode = fpscr; + *oldMode = fpscr; __asm__ volatile("fmxr fpscr, %0" ::"r"(fpscr | (1U << 24))); // Add 64 bit support -#elif defined(__aarch64__)
+#elif defined(__aarch64__) // Clang uint64_t fpscr; __asm__ volatile("mrs %0, fpcr" : "=r"(fpscr)); - *mode = fpscr; + *oldMode = fpscr; __asm__ volatile("msr fpcr, %0" ::"r"(fpscr | (1U << 24))); +#elif defined(_M_ARM64) // Visual Studio + uint64_t fpscr; + fpscr = _ReadStatusReg(ARM64_FPCR); /* FZ is in FPCR, not FPSR */ + *oldMode = fpscr; + _WriteStatusReg(ARM64_FPCR, fpscr | (1U << 24)); #elif defined(__mips__) fpa_bissr(FPA_CSR_FS); #else @@ -75,26 +83,31 @@ inline void ForceFTZ(FPU_mode_type *mode) } // Disable the denorm flush to zero -inline void DisableFTZ(FPU_mode_type *mode) +inline void DisableFTZ(FPU_mode_type *oldMode) { -#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) \ - || defined(__MINGW32__) - *mode = _mm_getcsr(); - _mm_setcsr(*mode & ~0x8040); +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) \ + || defined(_M_X64) || defined(__MINGW32__) + *oldMode = _mm_getcsr(); + _mm_setcsr(*oldMode & ~0x8040); #elif defined(__PPC__) - *mode = fpu_control; + *oldMode = fpu_control; fpu_control &= ~_FPU_MASK_NI; #elif defined(__arm__) unsigned fpscr; __asm__ volatile("fmrx %0, fpscr" : "=r"(fpscr)); - *mode = fpscr; + *oldMode = fpscr; __asm__ volatile("fmxr fpscr, %0" ::"r"(fpscr & ~(1U << 24))); // Add 64 bit support -#elif defined(__aarch64__) +#elif defined(__aarch64__) // Clang uint64_t fpscr; __asm__ volatile("mrs %0, fpcr" : "=r"(fpscr)); - *mode = fpscr; + *oldMode = fpscr; __asm__ volatile("msr fpcr, %0" ::"r"(fpscr & ~(1U << 24))); +#elif defined(_M_ARM64) // Visual Studio + uint64_t fpscr; + fpscr = _ReadStatusReg(ARM64_FPCR); /* FZ is in FPCR, not FPSR */ + *oldMode = fpscr; + _WriteStatusReg(ARM64_FPCR, fpscr & ~(1U << 24)); #elif defined(__mips__) fpa_bicsr(FPA_CSR_FS); #else @@ -105,16 +118,18 @@ inline void DisableFTZ(FPU_mode_type *mode) // Restore the reference hardware to floating point state indicated by *mode inline void RestoreFPState(FPU_mode_type *mode) { -#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) \ - || defined(__MINGW32__) +#if defined(__i386__) ||
defined(__x86_64__) || defined(_M_IX86) \ + || defined(_M_X64) || defined(__MINGW32__) _mm_setcsr(*mode); #elif defined(__PPC__) fpu_control = *mode; #elif defined(__arm__) __asm__ volatile("fmxr fpscr, %0" ::"r"(*mode)); // Add 64 bit support -#elif defined(__aarch64__) +#elif defined(__aarch64__) // Clang __asm__ volatile("msr fpcr, %0" ::"r"(*mode)); +#elif defined(_M_ARM64) // Visual Studio + _WriteStatusReg(ARM64_FPCR, *mode); #elif defined(__mips__) // Mips runs by default with DAZ=1 FTZ=1 #else diff --git a/test_common/harness/msvc9.c b/test_common/harness/msvc9.c index ef70035f..c0042928 100644 --- a/test_common/harness/msvc9.c +++ b/test_common/harness/msvc9.c @@ -786,7 +786,9 @@ int __builtin_clz(unsigned int pattern) #endif // !__has_builtin(__builtin_clz) #include +#if !defined(_M_ARM64) #include +#endif int usleep(int usec) { diff --git a/test_common/harness/rounding_mode.cpp b/test_common/harness/rounding_mode.cpp index b2e443b7..5aeb86f1 100644 --- a/test_common/harness/rounding_mode.cpp +++ b/test_common/harness/rounding_mode.cpp @@ -193,7 +193,8 @@ RoundingMode get_round(void) // basic_test_conversions.c in which case, these function are at // liberty to do nothing.
// -#if defined(__i386__) || defined(__x86_64__) || defined(_WIN32) +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) \ + || defined(_M_X64) #include #elif defined(__PPC__) #include @@ -203,18 +204,24 @@ RoundingMode get_round(void) void *FlushToZero(void) { #if defined(__APPLE__) || defined(__linux__) || defined(_WIN32) -#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) \ + || defined(_M_X64) union { unsigned int i; void *p; } u = { _mm_getcsr() }; _mm_setcsr(u.i | 0x8040); return u.p; -#elif defined(__arm__) || defined(__aarch64__) +#elif defined(__arm__) || defined(__aarch64__) // Clang int64_t fpscr; _FPU_GETCW(fpscr); _FPU_SETCW(fpscr | FPSCR_FZ); return NULL; +#elif defined(_M_ARM64) // Visual Studio + uint64_t fpscr; + fpscr = _ReadStatusReg(ARM64_FPCR); /* FZ is in FPCR, not FPSR */ + _WriteStatusReg(ARM64_FPCR, fpscr | (1U << 24)); + return NULL; #elif defined(__PPC__) fpu_control_t flags = 0; _FPU_GETCW(flags); @@ -237,16 +244,21 @@ void *FlushToZero(void) void UnFlushToZero(void *p) { #if defined(__APPLE__) || defined(__linux__) || defined(_WIN32) -#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) \ + || defined(_M_X64) union { void *p; unsigned int i; } u = { p }; _mm_setcsr(u.i); -#elif defined(__arm__) || defined(__aarch64__) +#elif defined(__arm__) || defined(__aarch64__) // Clang int64_t fpscr; _FPU_GETCW(fpscr); _FPU_SETCW(fpscr & ~FPSCR_FZ); +#elif defined(_M_ARM64) // Visual Studio + uint64_t fpscr; + fpscr = _ReadStatusReg(ARM64_FPCR); /* FZ is in FPCR, not FPSR */ + _WriteStatusReg(ARM64_FPCR, fpscr & ~(1U << 24)); #elif defined(__PPC__) fpu_control_t flags = 0; _FPU_GETCW(flags); diff --git a/test_conformance/conversions/basic_test_conversions.cpp b/test_conformance/conversions/basic_test_conversions.cpp index d4f6d366..4692c4b4 100644 --- a/test_conformance/conversions/basic_test_conversions.cpp +++
b/test_conformance/conversions/basic_test_conversions.cpp @@ -53,17 +53,17 @@ #include "basic_test_conversions.h" -#if defined(_WIN32) +#if defined(_M_IX86) || defined(_M_X64) #include #include -#else // !_WIN32 +#else #if defined(__SSE__) #include #endif #if defined(__SSE2__) #include #endif -#endif // _WIN32 +#endif cl_context gContext = NULL; cl_command_queue gQueue = NULL; diff --git a/test_conformance/conversions/conversions_data_info.h b/test_conformance/conversions/conversions_data_info.h index 9d2cbc60..46eb9c23 100644 --- a/test_conformance/conversions/conversions_data_info.h +++ b/test_conformance/conversions/conversions_data_info.h @@ -343,7 +343,7 @@ float DataInfoSpec::round_to_int(float f) volatile float x = f; float magicVal = magic[f < 0]; -#if defined(__SSE__) +#if defined(__SSE__) || _M_IX86_FP >= 1 || defined(_M_X64) // Defeat x87 based arithmetic, which cant do FTZ, and will round this // incorrectly __m128 v = _mm_set_ss(x); @@ -376,7 +376,7 @@ DataInfoSpec::round_to_int_and_clamp(double f) { volatile double x = f; double magicVal = magic[f < 0]; -#if defined(__SSE2__) || defined(_MSC_VER) +#if defined(__SSE2__) || _M_IX86_FP == 2 || defined(_M_X64) // Defeat x87 based arithmetic, which cant do FTZ, and will round this // incorrectly __m128d v = _mm_set_sd(x); @@ -479,7 +479,7 @@ void DataInfoSpec::conv(OutType *out, InType *in) { if (std::is_same::value) { -#if defined(_MSC_VER) +#if defined(_M_IX86) || defined(_M_X64) double result; if (std::is_same::value) diff --git a/test_conformance/math_brute_force/reference_math.cpp b/test_conformance/math_brute_force/reference_math.cpp index 45dd6526..a66e6f7e 100644 --- a/test_conformance/math_brute_force/reference_math.cpp +++ b/test_conformance/math_brute_force/reference_math.cpp @@ -25,12 +25,10 @@ #include "utility.h" -#if defined(__SSE__) \ - || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) +#if defined(__SSE__) || _M_IX86_FP >= 1 || defined(_M_X64) #include #endif -#if defined(__SSE2__) \ - || (defined(_MSC_VER) &&
(defined(_M_IX86) || defined(_M_X64))) +#if defined(__SSE2__) || _M_IX86_FP == 2 || defined(_M_X64) #include #endif @@ -855,8 +853,7 @@ double reference_add(double x, double y) volatile float a = (float)x; volatile float b = (float)y; -#if defined(__SSE__) \ - || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) +#if defined(__SSE__) || _M_IX86_FP >= 1 || defined(_M_X64) // defeat x87 __m128 va = _mm_set_ss((float)a); __m128 vb = _mm_set_ss((float)b); @@ -953,8 +950,7 @@ double reference_subtract(double x, double y) { volatile float a = (float)x; volatile float b = (float)y; -#if defined(__SSE__) \ - || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) +#if defined(__SSE__) || _M_IX86_FP >= 1 || defined(_M_X64) // defeat x87 __m128 va = _mm_set_ss((float)a); __m128 vb = _mm_set_ss((float)b); @@ -970,8 +966,7 @@ double reference_multiply(double x, double y) { volatile float a = (float)x; volatile float b = (float)y; -#if defined(__SSE__) \ - || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) +#if defined(__SSE__) || _M_IX86_FP >= 1 || defined(_M_X64) // defeat x87 __m128 va = _mm_set_ss((float)a); __m128 vb = _mm_set_ss((float)b);