initial RISC-V support (#2614)

Unlike the related PR #2344, which simply warns about unsupported FTZ, this PR
attempts to handle FTZ correctly on RISC-V.
The RISC-V 'f' extension does not provide any way to enable or disable flushing
subnormals to zero; implementations are required to always support
subnormals. Therefore this PR re-uses the FTZ handling code from PPC, where
flushing also has to be performed explicitly in software.
This commit is contained in:
Michal Babej
2026-03-17 18:25:59 +02:00
committed by GitHub
parent 6506421614
commit 4e3f16b2b9
7 changed files with 32 additions and 18 deletions

View File

@@ -89,6 +89,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
set(CLConform_TARGET_ARCH x86_64)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*")
set(CLConform_TARGET_ARCH x86)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv.*")
set(CLConform_TARGET_ARCH RISCV)
endif()
if(NOT DEFINED CLConform_TARGET_ARCH)

View File

@@ -45,6 +45,9 @@ typedef int64_t FPU_mode_type;
#elif defined(__PPC__)
#include <fpu_control.h>
extern __thread fpu_control_t fpu_control;
#elif defined(__riscv)
#define _FPU_MASK_NI 1
static FPU_mode_type fpu_control;
#elif defined(__mips__)
#include "mips/m32c1.h"
#endif
@@ -56,7 +59,7 @@ inline void ForceFTZ(FPU_mode_type *oldMode)
|| defined(_M_X64) || defined(__MINGW32__)
*oldMode = _mm_getcsr();
_mm_setcsr(*oldMode | 0x8040);
#elif defined(__PPC__)
#elif defined(__PPC__) || defined(__riscv)
*oldMode = fpu_control;
fpu_control |= _FPU_MASK_NI;
#elif defined(__arm__)
@@ -89,8 +92,8 @@ inline void DisableFTZ(FPU_mode_type *oldMode)
|| defined(_M_X64) || defined(__MINGW32__)
*oldMode = _mm_getcsr();
_mm_setcsr(*oldMode & ~0x8040);
#elif defined(__PPC__)
*mode = fpu_control;
#elif defined(__PPC__) || defined(__riscv)
*oldMode = fpu_control;
fpu_control &= ~_FPU_MASK_NI;
#elif defined(__arm__)
unsigned fpscr;
@@ -121,7 +124,7 @@ inline void RestoreFPState(FPU_mode_type *mode)
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) \
|| defined(_M_X64) || defined(__MINGW32__)
_mm_setcsr(*mode);
#elif defined(__PPC__)
#elif defined(__PPC__) || defined(__riscv)
fpu_control = *mode;
#elif defined(__arm__)
__asm__ volatile("fmxr fpscr, %0" ::"r"(*mode));

View File

@@ -201,6 +201,7 @@ RoundingMode get_round(void)
#elif defined(__mips__)
#include "mips/m32c1.h"
#endif
void *FlushToZero(void)
{
#if defined(__APPLE__) || defined(__linux__) || defined(_WIN32)
@@ -231,6 +232,8 @@ void *FlushToZero(void)
#elif defined(__mips__)
fpa_bissr(FPA_CSR_FS);
return NULL;
#elif defined(__riscv)
return NULL;
#else
#error Unknown arch
#endif
@@ -266,6 +269,8 @@ void UnFlushToZero(void *p)
_FPU_SETCW(flags);
#elif defined(__mips__)
fpa_bicsr(FPA_CSR_FS);
#elif defined(__riscv)
return;
#else
#error Unknown arch
#endif

View File

@@ -1409,6 +1409,8 @@ void PrintArch(void)
vlog("ARCH:\tWindows\n");
#elif defined(__mips__)
vlog("ARCH:\tmips\n");
#elif defined(__riscv)
vlog("ARCH:\tRISC-V\n");
#else
#error unknown arch
#endif

View File

@@ -191,7 +191,7 @@ double sse_mul_sd(double x, double y)
}
#endif
#ifdef __PPC__
#if defined(__PPC__) || defined(__riscv)
float ppc_mul(float a, float b)
{
float p;
@@ -630,9 +630,11 @@ test_status InitCL( cl_device_id device )
// turn that off
f3[i] = sse_mul(q, q2);
f4[i] = sse_mul(-q, q2);
#elif defined(__PPC__)
// None of the current generation PPC processors support HW
// FTZ, emulate it in sw.
#elif (defined(__PPC__) || defined(__riscv))
// RISC-V CPUs with default 'f' fp32 extension do not support
// enabling/disabling FTZ mode, subnormals are always handled
// without FTZ. None of the current generation PPC processors
// support HW FTZ, emulate it in sw.
f3[i] = ppc_mul(q, q2);
f4[i] = ppc_mul(-q, q2);
#else
@@ -721,9 +723,10 @@ test_status InitCL( cl_device_id device )
skipTest[j][i] = (bufSkip[i] ||
(gSkipNanInf && (FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)))));
#if defined(__PPC__)
// Since the current Power processors don't emulate flush to zero in HW,
// it must be emulated in SW instead.
#if defined(__PPC__) || defined(__riscv)
// Since the current Power processors don't emulate flush to
// zero in HW, it must be emulated in SW instead. (same for
// RISC-V CPUs with 'f' extension)
if (gForceFTZ)
{
if ((fabsf(correct[j][i]) < FLT_MIN) && (correct[j][i] != 0.0f))
@@ -760,7 +763,6 @@ test_status InitCL( cl_device_id device )
}
}
double *f = (double*) buf1;
double *f2 = (double*) buf2;
double *f3 = (double*) buf3_double;

View File

@@ -120,8 +120,6 @@ cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p);
uint64_t GetTime(void);
void WriteInputBufferComplete(void *);
void *FlushToZero(void);
void UnFlushToZero(void *);
}
struct CalcRefValsBase

View File

@@ -859,7 +859,9 @@ double reference_add(double x, double y)
__m128 vb = _mm_set_ss((float)b);
va = _mm_add_ss(va, vb);
_mm_store_ss((float *)&a, va);
#elif defined(__PPC__)
#elif defined(__PPC__) || defined(__riscv)
// RISC-V CPUs with default 'f' fp32 extension do not support any way to
// enable/disable FTZ mode, subnormals are always handled without flushing.
// Most Power host CPUs do not support the non-IEEE mode (NI) which flushes
// denorm's to zero. As such, the reference add with FTZ must be emulated in
// sw.
@@ -876,7 +878,7 @@ double reference_add(double x, double y)
} ub;
ub.d = b;
cl_uint mantA, mantB;
cl_ulong addendA, addendB, sum;
cl_ulong addendA, addendB;
int expA = extractf(a, &mantA);
int expB = extractf(b, &mantB);
cl_uint signA = ua.u & 0x80000000U;
@@ -972,7 +974,7 @@ double reference_multiply(double x, double y)
__m128 vb = _mm_set_ss((float)b);
va = _mm_mul_ss(va, vb);
_mm_store_ss((float *)&a, va);
#elif defined(__PPC__)
#elif defined(__PPC__) || defined(__riscv)
// Most Power host CPUs do not support the non-IEEE mode (NI) which flushes
// denorm's to zero. As such, the reference multiply with FTZ must be
// emulated in sw.
@@ -3351,7 +3353,7 @@ long double reference_cbrtl(long double x)
long double reference_rintl(long double x)
{
#if defined(__PPC__)
#if defined(__PPC__) || defined(__riscv)
// On PPC, long doubles are maintained as 2 doubles. Therefore, the combined
// mantissa can represent more than LDBL_MANT_DIG binary digits.
x = rintl(x);