Support building for Windows on 64-bit Arm (#2355)

Add support for building on Windows on Arm.
This commit is contained in:
Sreelakshmi Haridas Maruthur
2025-08-12 09:46:23 -06:00
committed by GitHub
parent 4115d04ae0
commit aef863afa2
9 changed files with 86 additions and 49 deletions

View File

@@ -10,7 +10,7 @@ jobs:
matrix: matrix:
build-type: [Release] build-type: [Release]
gl: [0] gl: [0]
os: [ubuntu-22.04, macos-latest, windows-latest] os: [ubuntu-22.04, macos-latest, windows-latest, windows-11-arm]
include: include:
- os: ubuntu-22.04 - os: ubuntu-22.04
gl: 1 gl: 1

View File

@@ -436,7 +436,14 @@ void *ThreadPool_WorkerFunc(void *p)
// drop run count to 0 // drop run count to 0
gRunCount = 0; gRunCount = 0;
#if defined(_M_IX86) || defined(_M_X64)
_mm_mfence(); _mm_mfence();
#elif defined(_M_ARM64)
__dmb(_ARM64_BARRIER_ISHST);
#else
#error Architecture needs an implementation
#endif
#else #else
if (pthread_mutex_lock(&gAtomicLock)) if (pthread_mutex_lock(&gAtomicLock))
log_error( log_error(
@@ -703,7 +710,13 @@ void ThreadPool_Exit(void)
// http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins // http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
__sync_synchronize(); __sync_synchronize();
#elif defined(_MSC_VER) #elif defined(_MSC_VER)
#if defined(_M_IX86) || defined(_M_X64)
_mm_mfence(); _mm_mfence();
#elif defined(_M_ARM64)
__dmb(_ARM64_BARRIER_ISHST);
#else
#error Architecture needs an implementation
#endif
#else #else
#warning If this is a weakly ordered memory system, please add a memory barrier here to force this and everything else to memory before we proceed #warning If this is a weakly ordered memory system, please add a memory barrier here to force this and everything else to memory before we proceed
#endif #endif

View File

@@ -23,10 +23,10 @@
#include <CL/cl_half.h> #include <CL/cl_half.h>
#if defined(__SSE__) || defined(_MSC_VER) #if defined(__SSE__) || _M_IX86_FP == 1
#include <xmmintrin.h> #include <xmmintrin.h>
#endif #endif
#if defined(__SSE2__) || defined(_MSC_VER) #if defined(__SSE2__) || _M_IX86_FP == 2 || defined(_M_X64)
#include <emmintrin.h> #include <emmintrin.h>
#endif #endif
@@ -110,7 +110,7 @@ static long lrintf_clamped(float f)
volatile float x = f; volatile float x = f;
float magicVal = magic[f < 0]; float magicVal = magic[f < 0];
#if defined(__SSE__) || defined(_WIN32) #if defined(__SSE__) || _M_IX86_FP == 1
// Defeat x87 based arithmetic, which cant do FTZ, and will round this // Defeat x87 based arithmetic, which cant do FTZ, and will round this
// incorrectly // incorrectly
__m128 v = _mm_set_ss(x); __m128 v = _mm_set_ss(x);
@@ -150,7 +150,7 @@ static long lrint_clamped(double f)
{ {
volatile double x = f; volatile double x = f;
double magicVal = magic[f < 0]; double magicVal = magic[f < 0];
#if defined(__SSE2__) || (defined(_MSC_VER)) #if defined(__SSE2__) || _M_IX86_FP == 2 || defined(_M_X64)
// Defeat x87 based arithmetic, which cant do FTZ, and will round this // Defeat x87 based arithmetic, which cant do FTZ, and will round this
// incorrectly // incorrectly
__m128d v = _mm_set_sd(x); __m128d v = _mm_set_sd(x);

View File

@@ -37,36 +37,44 @@ typedef int FPU_mode_type;
#else #else
typedef int64_t FPU_mode_type; typedef int64_t FPU_mode_type;
#endif #endif
#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) \ #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) \
|| defined(__MINGW32__) || defined(_M_X64) || defined(__MINGW32__)
#include <xmmintrin.h> #include <xmmintrin.h>
#elif defined(_M_ARM64)
#include <intrin.h>
#elif defined(__PPC__) #elif defined(__PPC__)
#include <fpu_control.h> #include <fpu_control.h>
extern __thread fpu_control_t fpu_control; extern __thread fpu_control_t fpu_control;
#elif defined(__mips__) #elif defined(__mips__)
#include "mips/m32c1.h" #include "mips/m32c1.h"
#endif #endif
// Set the reference hardware floating point unit to FTZ mode // Set the reference hardware floating point unit to FTZ mode
inline void ForceFTZ(FPU_mode_type *mode) inline void ForceFTZ(FPU_mode_type *oldMode)
{ {
#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) \ #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) \
|| defined(__MINGW32__) || defined(_M_X64) || defined(__MINGW32__)
*mode = _mm_getcsr(); *oldMode = _mm_getcsr();
_mm_setcsr(*mode | 0x8040); _mm_setcsr(*oldMode | 0x8040);
#elif defined(__PPC__) #elif defined(__PPC__)
*mode = fpu_control; *oldMode = fpu_control;
fpu_control |= _FPU_MASK_NI; fpu_control |= _FPU_MASK_NI;
#elif defined(__arm__) #elif defined(__arm__)
unsigned fpscr; unsigned fpscr;
__asm__ volatile("fmrx %0, fpscr" : "=r"(fpscr)); __asm__ volatile("fmrx %0, fpscr" : "=r"(fpscr));
*mode = fpscr; *oldMode = fpscr;
__asm__ volatile("fmxr fpscr, %0" ::"r"(fpscr | (1U << 24))); __asm__ volatile("fmxr fpscr, %0" ::"r"(fpscr | (1U << 24)));
// Add 64 bit support // Add 64 bit support
#elif defined(__aarch64__) #elif defined(__aarch64__) // Clang
uint64_t fpscr; uint64_t fpscr;
__asm__ volatile("mrs %0, fpcr" : "=r"(fpscr)); __asm__ volatile("mrs %0, fpcr" : "=r"(fpscr));
*mode = fpscr; *oldMode = fpscr;
__asm__ volatile("msr fpcr, %0" ::"r"(fpscr | (1U << 24))); __asm__ volatile("msr fpcr, %0" ::"r"(fpscr | (1U << 24)));
#elif defined(_M_ARM64) // Visual Studio
uint64_t fpscr;
fpscr = _ReadStatusReg(ARM64_FPSR);
*oldMode = fpscr;
_WriteStatusReg(ARM64_FPCR, fpscr | (1U << 24));
#elif defined(__mips__) #elif defined(__mips__)
fpa_bissr(FPA_CSR_FS); fpa_bissr(FPA_CSR_FS);
#else #else
@@ -75,26 +83,31 @@ inline void ForceFTZ(FPU_mode_type *mode)
} }
// Disable the denorm flush to zero // Disable the denorm flush to zero
inline void DisableFTZ(FPU_mode_type *mode) inline void DisableFTZ(FPU_mode_type *oldMode)
{ {
#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) \ #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) \
|| defined(__MINGW32__) || defined(_M_X64) || defined(__MINGW32__)
*mode = _mm_getcsr(); *oldMode = _mm_getcsr();
_mm_setcsr(*mode & ~0x8040); _mm_setcsr(*oldMode & ~0x8040);
#elif defined(__PPC__) #elif defined(__PPC__)
*mode = fpu_control; *mode = fpu_control;
fpu_control &= ~_FPU_MASK_NI; fpu_control &= ~_FPU_MASK_NI;
#elif defined(__arm__) #elif defined(__arm__)
unsigned fpscr; unsigned fpscr;
__asm__ volatile("fmrx %0, fpscr" : "=r"(fpscr)); __asm__ volatile("fmrx %0, fpscr" : "=r"(fpscr));
*mode = fpscr; *oldMode = fpscr;
__asm__ volatile("fmxr fpscr, %0" ::"r"(fpscr & ~(1U << 24))); __asm__ volatile("fmxr fpscr, %0" ::"r"(fpscr & ~(1U << 24)));
// Add 64 bit support // Add 64 bit support
#elif defined(__aarch64__) #elif defined(__aarch64__) // Clang
uint64_t fpscr; uint64_t fpscr;
__asm__ volatile("mrs %0, fpcr" : "=r"(fpscr)); __asm__ volatile("mrs %0, fpcr" : "=r"(fpscr));
*mode = fpscr; *oldMode = fpscr;
__asm__ volatile("msr fpcr, %0" ::"r"(fpscr & ~(1U << 24))); __asm__ volatile("msr fpcr, %0" ::"r"(fpscr & ~(1U << 24)));
#elif defined(_M_ARM64) // Visual Studio
uint64_t fpscr;
fpscr = _ReadStatusReg(ARM64_FPSR);
*oldMode = fpscr;
_WriteStatusReg(ARM64_FPCR, fpscr & ~(1U << 24));
#elif defined(__mips__) #elif defined(__mips__)
fpa_bicsr(FPA_CSR_FS); fpa_bicsr(FPA_CSR_FS);
#else #else
@@ -105,16 +118,18 @@ inline void DisableFTZ(FPU_mode_type *mode)
// Restore the reference hardware to floating point state indicated by *mode // Restore the reference hardware to floating point state indicated by *mode
inline void RestoreFPState(FPU_mode_type *mode) inline void RestoreFPState(FPU_mode_type *mode)
{ {
#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) \ #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) \
|| defined(__MINGW32__) || defined(_M_X64) || defined(__MINGW32__)
_mm_setcsr(*mode); _mm_setcsr(*mode);
#elif defined(__PPC__) #elif defined(__PPC__)
fpu_control = *mode; fpu_control = *mode;
#elif defined(__arm__) #elif defined(__arm__)
__asm__ volatile("fmxr fpscr, %0" ::"r"(*mode)); __asm__ volatile("fmxr fpscr, %0" ::"r"(*mode));
// Add 64 bit support // Add 64 bit support
#elif defined(__aarch64__) #elif defined(__aarch64__) // Clang
__asm__ volatile("msr fpcr, %0" ::"r"(*mode)); __asm__ volatile("msr fpcr, %0" ::"r"(*mode));
#elif defined(_M_ARM64) // Visual Studio
_WriteStatusReg(ARM64_FPCR, *mode);
#elif defined(__mips__) #elif defined(__mips__)
// Mips runs by default with DAZ=1 FTZ=1 // Mips runs by default with DAZ=1 FTZ=1
#else #else

View File

@@ -786,7 +786,9 @@ int __builtin_clz(unsigned int pattern)
#endif // !__has_builtin(__builtin_clz) #endif // !__has_builtin(__builtin_clz)
#include <intrin.h> #include <intrin.h>
#if !defined(_M_ARM64)
#include <emmintrin.h> #include <emmintrin.h>
#endif
int usleep(int usec) int usleep(int usec)
{ {

View File

@@ -193,7 +193,8 @@ RoundingMode get_round(void)
// basic_test_conversions.c in which case, these function are at // basic_test_conversions.c in which case, these function are at
// liberty to do nothing. // liberty to do nothing.
// //
#if defined(__i386__) || defined(__x86_64__) || defined(_WIN32) #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) \
|| defined(_M_X64)
#include <xmmintrin.h> #include <xmmintrin.h>
#elif defined(__PPC__) #elif defined(__PPC__)
#include <fpu_control.h> #include <fpu_control.h>
@@ -203,18 +204,24 @@ RoundingMode get_round(void)
void *FlushToZero(void) void *FlushToZero(void)
{ {
#if defined(__APPLE__) || defined(__linux__) || defined(_WIN32) #if defined(__APPLE__) || defined(__linux__) || defined(_WIN32)
#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) \
|| defined(_M_X64)
union { union {
unsigned int i; unsigned int i;
void *p; void *p;
} u = { _mm_getcsr() }; } u = { _mm_getcsr() };
_mm_setcsr(u.i | 0x8040); _mm_setcsr(u.i | 0x8040);
return u.p; return u.p;
#elif defined(__arm__) || defined(__aarch64__) #elif defined(__arm__) || defined(__aarch64__) // Clang
int64_t fpscr; int64_t fpscr;
_FPU_GETCW(fpscr); _FPU_GETCW(fpscr);
_FPU_SETCW(fpscr | FPSCR_FZ); _FPU_SETCW(fpscr | FPSCR_FZ);
return NULL; return NULL;
#elif defined(_M_ARM64) // Visual Studio
uint64_t fpscr;
fpscr = _ReadStatusReg(ARM64_FPSR);
_WriteStatusReg(ARM64_FPCR, fpscr | (1U << 24));
return NULL;
#elif defined(__PPC__) #elif defined(__PPC__)
fpu_control_t flags = 0; fpu_control_t flags = 0;
_FPU_GETCW(flags); _FPU_GETCW(flags);
@@ -237,16 +244,21 @@ void *FlushToZero(void)
void UnFlushToZero(void *p) void UnFlushToZero(void *p)
{ {
#if defined(__APPLE__) || defined(__linux__) || defined(_WIN32) #if defined(__APPLE__) || defined(__linux__) || defined(_WIN32)
#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) \
|| defined(_M_X64)
union { union {
void *p; void *p;
unsigned int i; unsigned int i;
} u = { p }; } u = { p };
_mm_setcsr(u.i); _mm_setcsr(u.i);
#elif defined(__arm__) || defined(__aarch64__) #elif defined(__arm__) || defined(__aarch64__) // Clang
int64_t fpscr; int64_t fpscr;
_FPU_GETCW(fpscr); _FPU_GETCW(fpscr);
_FPU_SETCW(fpscr & ~FPSCR_FZ); _FPU_SETCW(fpscr & ~FPSCR_FZ);
#elif defined(_M_ARM64) // Visual Studio
uint64_t fpscr;
fpscr = _ReadStatusReg(ARM64_FPSR);
_WriteStatusReg(ARM64_FPCR, fpscr & ~(1U << 24));
#elif defined(__PPC__) #elif defined(__PPC__)
fpu_control_t flags = 0; fpu_control_t flags = 0;
_FPU_GETCW(flags); _FPU_GETCW(flags);

View File

@@ -53,17 +53,17 @@
#include "basic_test_conversions.h" #include "basic_test_conversions.h"
#if defined(_WIN32) #if defined(_M_IX86) || defined(_M_X64)
#include <mmintrin.h> #include <mmintrin.h>
#include <emmintrin.h> #include <emmintrin.h>
#else // !_WIN32 #else
#if defined(__SSE__) #if defined(__SSE__)
#include <xmmintrin.h> #include <xmmintrin.h>
#endif #endif
#if defined(__SSE2__) #if defined(__SSE2__)
#include <emmintrin.h> #include <emmintrin.h>
#endif #endif
#endif // _WIN32 #endif
cl_context gContext = NULL; cl_context gContext = NULL;
cl_command_queue gQueue = NULL; cl_command_queue gQueue = NULL;

View File

@@ -343,7 +343,7 @@ float DataInfoSpec<InType, OutType, InFP, OutFP>::round_to_int(float f)
volatile float x = f; volatile float x = f;
float magicVal = magic[f < 0]; float magicVal = magic[f < 0];
#if defined(__SSE__) #if defined(__SSE__) || _M_IX86_FP == 1
// Defeat x87 based arithmetic, which cant do FTZ, and will round this // Defeat x87 based arithmetic, which cant do FTZ, and will round this
// incorrectly // incorrectly
__m128 v = _mm_set_ss(x); __m128 v = _mm_set_ss(x);
@@ -376,7 +376,7 @@ DataInfoSpec<InType, OutType, InFP, OutFP>::round_to_int_and_clamp(double f)
{ {
volatile double x = f; volatile double x = f;
double magicVal = magic[f < 0]; double magicVal = magic[f < 0];
#if defined(__SSE2__) || defined(_MSC_VER) #if defined(__SSE2__) || _M_IX86_FP == 2 || defined(_M_X64)
// Defeat x87 based arithmetic, which cant do FTZ, and will round this // Defeat x87 based arithmetic, which cant do FTZ, and will round this
// incorrectly // incorrectly
__m128d v = _mm_set_sd(x); __m128d v = _mm_set_sd(x);
@@ -479,7 +479,7 @@ void DataInfoSpec<InType, OutType, InFP, OutFP>::conv(OutType *out, InType *in)
{ {
if (std::is_same<cl_double, OutType>::value) if (std::is_same<cl_double, OutType>::value)
{ {
#if defined(_MSC_VER) #if defined(_M_IX86) || defined(_M_X64)
double result; double result;
if (std::is_same<cl_ulong, InType>::value) if (std::is_same<cl_ulong, InType>::value)

View File

@@ -25,12 +25,10 @@
#include "utility.h" #include "utility.h"
#if defined(__SSE__) \ #if defined(__SSE__) || _M_IX86_FP == 1
|| (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
#include <xmmintrin.h> #include <xmmintrin.h>
#endif #endif
#if defined(__SSE2__) \ #if defined(__SSE2__) || _M_IX86_FP == 2 || defined(_M_X64)
|| (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
#include <emmintrin.h> #include <emmintrin.h>
#endif #endif
@@ -855,8 +853,7 @@ double reference_add(double x, double y)
volatile float a = (float)x; volatile float a = (float)x;
volatile float b = (float)y; volatile float b = (float)y;
#if defined(__SSE__) \ #if defined(__SSE__) || _M_IX86_FP == 1
|| (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
// defeat x87 // defeat x87
__m128 va = _mm_set_ss((float)a); __m128 va = _mm_set_ss((float)a);
__m128 vb = _mm_set_ss((float)b); __m128 vb = _mm_set_ss((float)b);
@@ -953,8 +950,7 @@ double reference_subtract(double x, double y)
{ {
volatile float a = (float)x; volatile float a = (float)x;
volatile float b = (float)y; volatile float b = (float)y;
#if defined(__SSE__) \ #if defined(__SSE__) || _M_IX86_FP == 1
|| (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
// defeat x87 // defeat x87
__m128 va = _mm_set_ss((float)a); __m128 va = _mm_set_ss((float)a);
__m128 vb = _mm_set_ss((float)b); __m128 vb = _mm_set_ss((float)b);
@@ -970,8 +966,7 @@ double reference_multiply(double x, double y)
{ {
volatile float a = (float)x; volatile float a = (float)x;
volatile float b = (float)y; volatile float b = (float)y;
#if defined(__SSE__) \ #if defined(__SSE__) || _M_IX86_FP == 1
|| (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
// defeat x87 // defeat x87
__m128 va = _mm_set_ss((float)a); __m128 va = _mm_set_ss((float)a);
__m128 vb = _mm_set_ss((float)b); __m128 vb = _mm_set_ss((float)b);