Mirror of https://github.com/KhronosGroup/OpenCL-CTS.git
(synced 2026-03-19 06:09:01 +00:00)
math_brute_force: treat reciprocal as unary function (#2281)

Treat reciprocal as a unary function, instead of handling it through the
binary function testing mechanism and special-casing it there.
This addresses two shortcomings of the previous implementation:
- Testing took significantly longer as the entire input domain was
  tested many times (e.g. fp16 reciprocal has only 2^16 possible input
  values, but binary function testing iterates over 2^16 * 2^16 input
  values).
- The reciprocal test kernel was identical to the divide kernel. Thus
  the device compiler would see a regular divide operation instead of a
  reciprocal operation and would be unlikely to emit a specialized
  reciprocal sequence.
This reverts all of the changes in binary_operator*.cpp made by
bcfa1f7c2 ("Added corrections to re-enable reciprocal test in
math_brute_force suite for relaxed math mode (#2221)", 2025-02-04).
Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>
This commit is contained in:
Committed by: GitHub
Parent: 5167d7202b
Commit: 7feb93cdd7
@@ -214,12 +214,6 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
cl_double *s;
|
||||
cl_double *s2;
|
||||
|
||||
bool reciprocal = strcmp(name, "reciprocal") == 0;
|
||||
const double reciprocalArrayX[] = { 1.0 };
|
||||
const double *specialValuesX =
|
||||
reciprocal ? reciprocalArrayX : specialValues;
|
||||
size_t specialValuesCountX = reciprocal ? 1 : specialValuesCount;
|
||||
|
||||
Force64BitFPUPrecision();
|
||||
|
||||
cl_event e[VECTOR_SIZE_COUNT];
|
||||
@@ -248,7 +242,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
|
||||
cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
|
||||
cl_uint idx = 0;
|
||||
int totalSpecialValueCount = specialValuesCountX * specialValuesCount;
|
||||
int totalSpecialValueCount = specialValuesCount * specialValuesCount;
|
||||
int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
|
||||
|
||||
// Test edge cases
|
||||
@@ -258,15 +252,14 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
cl_double *fp2 = (cl_double *)p2;
|
||||
uint32_t x, y;
|
||||
|
||||
x = (job_id * buffer_elements) % specialValuesCountX;
|
||||
x = (job_id * buffer_elements) % specialValuesCount;
|
||||
y = (job_id * buffer_elements) / specialValuesCount;
|
||||
|
||||
for (; idx < buffer_elements; idx++)
|
||||
{
|
||||
fp[idx] = specialValuesX[x];
|
||||
fp[idx] = specialValues[x];
|
||||
fp2[idx] = specialValues[y];
|
||||
++x;
|
||||
if (x >= specialValuesCountX)
|
||||
if (++x >= specialValuesCount)
|
||||
{
|
||||
x = 0;
|
||||
y++;
|
||||
@@ -278,8 +271,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
// Init any remaining values
|
||||
for (; idx < buffer_elements; idx++)
|
||||
{
|
||||
p[idx] =
|
||||
reciprocal ? ((cl_ulong *)specialValuesX)[0] : genrand_int64(d);
|
||||
p[idx] = genrand_int64(d);
|
||||
p2[idx] = genrand_int64(d);
|
||||
}
|
||||
|
||||
@@ -372,13 +364,8 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
|
||||
s = (cl_double *)gIn + thread_id * buffer_elements;
|
||||
s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
|
||||
|
||||
if (reciprocal)
|
||||
for (size_t j = 0; j < buffer_elements; j++)
|
||||
r[j] = (float)func.f_f(s2[j]);
|
||||
else
|
||||
for (size_t j = 0; j < buffer_elements; j++)
|
||||
r[j] = (cl_double)func.f_ff(s[j], s2[j]);
|
||||
for (size_t j = 0; j < buffer_elements; j++)
|
||||
r[j] = (cl_double)func.f_ff(s[j], s2[j]);
|
||||
|
||||
// Read the data back -- no need to wait for the first N-1 buffers but wait
|
||||
// for the last buffer. This is an in order queue.
|
||||
@@ -408,9 +395,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
if (t[j] != q[j])
|
||||
{
|
||||
cl_double test = ((cl_double *)q)[j];
|
||||
long double correct =
|
||||
reciprocal ? func.f_f(s2[j]) : func.f_ff(s[j], s2[j]);
|
||||
|
||||
long double correct = func.f_ff(s[j], s2[j]);
|
||||
float err = Bruteforce_Ulp_Error_Double(test, correct);
|
||||
int fail = !(fabsf(err) <= ulps);
|
||||
|
||||
@@ -483,11 +468,8 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
}
|
||||
else if (IsDoubleSubnormal(s2[j]))
|
||||
{
|
||||
long double correct2 =
|
||||
reciprocal ? func.f_f(0.0) : func.f_ff(s[j], 0.0);
|
||||
long double correct3 =
|
||||
reciprocal ? func.f_f(-0.0) : func.f_ff(s[j], -0.0);
|
||||
|
||||
long double correct2 = func.f_ff(s[j], 0.0);
|
||||
long double correct3 = func.f_ff(s[j], -0.0);
|
||||
float err2 =
|
||||
Bruteforce_Ulp_Error_Double(test, correct2);
|
||||
float err3 =
|
||||
|
||||
@@ -208,11 +208,6 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
cl_float *s2 = 0;
|
||||
RoundingMode oldRoundMode;
|
||||
|
||||
bool reciprocal = strcmp(name, "reciprocal") == 0;
|
||||
const float reciprocalArrayX[] = { 1.f };
|
||||
const float *specialValuesX = reciprocal ? reciprocalArrayX : specialValues;
|
||||
size_t specialValuesCountX = reciprocal ? 1 : specialValuesCount;
|
||||
|
||||
if (relaxedMode)
|
||||
{
|
||||
func = job->f->rfunc;
|
||||
@@ -244,7 +239,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
|
||||
cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
|
||||
cl_uint idx = 0;
|
||||
int totalSpecialValueCount = specialValuesCountX * specialValuesCount;
|
||||
int totalSpecialValueCount = specialValuesCount * specialValuesCount;
|
||||
int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
|
||||
|
||||
if (job_id <= (cl_uint)lastSpecialJobIndex)
|
||||
@@ -252,15 +247,15 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
// Insert special values
|
||||
uint32_t x, y;
|
||||
|
||||
x = (job_id * buffer_elements) % specialValuesCountX;
|
||||
x = (job_id * buffer_elements) % specialValuesCount;
|
||||
y = (job_id * buffer_elements) / specialValuesCount;
|
||||
|
||||
for (; idx < buffer_elements; idx++)
|
||||
{
|
||||
p[idx] = ((cl_uint *)specialValuesX)[x];
|
||||
p[idx] = ((cl_uint *)specialValues)[x];
|
||||
p2[idx] = ((cl_uint *)specialValues)[y];
|
||||
++x;
|
||||
if (x >= specialValuesCountX)
|
||||
if (x >= specialValuesCount)
|
||||
{
|
||||
x = 0;
|
||||
y++;
|
||||
@@ -274,19 +269,13 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
if (pj < 0x20800000 || pj > 0x5e800000) p[idx] = 0x7fc00000;
|
||||
if (p2j < 0x20800000 || p2j > 0x5e800000) p2[idx] = 0x7fc00000;
|
||||
}
|
||||
else if (relaxedMode && reciprocal)
|
||||
{
|
||||
cl_uint p2j = p2[idx] & 0x7fffffff;
|
||||
// Replace values outside [2^-126, 2^126] with QNaN
|
||||
if (p2j < 0x00807d99 || p2j > 0x7e800000) p2[idx] = 0x7fc00000;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Init any remaining values
|
||||
for (; idx < buffer_elements; idx++)
|
||||
{
|
||||
p[idx] = reciprocal ? ((cl_uint *)specialValuesX)[0] : genrand_int32(d);
|
||||
p[idx] = genrand_int32(d);
|
||||
p2[idx] = genrand_int32(d);
|
||||
|
||||
if (relaxedMode && strcmp(name, "divide") == 0)
|
||||
@@ -297,12 +286,6 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
if (pj < 0x20800000 || pj > 0x5e800000) p[idx] = 0x7fc00000;
|
||||
if (p2j < 0x20800000 || p2j > 0x5e800000) p2[idx] = 0x7fc00000;
|
||||
}
|
||||
else if (relaxedMode && reciprocal)
|
||||
{
|
||||
cl_uint p2j = p2[idx] & 0x7fffffff;
|
||||
// Replace values outside [2^-126, 2^126] with QNaN
|
||||
if (p2j < 0x00807d99 || p2j > 0x7e800000) p2[idx] = 0x7fc00000;
|
||||
}
|
||||
}
|
||||
|
||||
if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
|
||||
@@ -408,31 +391,18 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
s2 = (float *)gIn2 + thread_id * buffer_elements;
|
||||
if (gInfNanSupport)
|
||||
{
|
||||
if (reciprocal)
|
||||
for (size_t j = 0; j < buffer_elements; j++)
|
||||
r[j] = (float)func.f_f(s2[j]);
|
||||
else
|
||||
for (size_t j = 0; j < buffer_elements; j++)
|
||||
r[j] = (float)func.f_ff(s[j], s2[j]);
|
||||
for (size_t j = 0; j < buffer_elements; j++)
|
||||
r[j] = (float)func.f_ff(s[j], s2[j]);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (reciprocal)
|
||||
for (size_t j = 0; j < buffer_elements; j++)
|
||||
{
|
||||
feclearexcept(FE_OVERFLOW);
|
||||
r[j] = (float)func.f_f(s2[j]);
|
||||
overflow[j] =
|
||||
FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
|
||||
}
|
||||
else
|
||||
for (size_t j = 0; j < buffer_elements; j++)
|
||||
{
|
||||
feclearexcept(FE_OVERFLOW);
|
||||
r[j] = (float)func.f_ff(s[j], s2[j]);
|
||||
overflow[j] =
|
||||
FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
|
||||
}
|
||||
for (size_t j = 0; j < buffer_elements; j++)
|
||||
{
|
||||
feclearexcept(FE_OVERFLOW);
|
||||
r[j] = (float)func.f_ff(s[j], s2[j]);
|
||||
overflow[j] =
|
||||
FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
|
||||
}
|
||||
}
|
||||
|
||||
if (gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
|
||||
@@ -467,8 +437,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
if (t[j] != q[j])
|
||||
{
|
||||
float test = ((float *)q)[j];
|
||||
double correct =
|
||||
reciprocal ? func.f_f(s2[j]) : func.f_ff(s[j], s2[j]);
|
||||
double correct = func.f_ff(s[j], s2[j]);
|
||||
|
||||
// Per section 10 paragraph 6, accept any result if an input or
|
||||
// output is a infinity or NaN or overflow
|
||||
@@ -505,7 +474,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
}
|
||||
|
||||
// retry per section 6.5.3.3
|
||||
if (!reciprocal && IsFloatSubnormal(s[j]))
|
||||
if (IsFloatSubnormal(s[j]))
|
||||
{
|
||||
double correct2, correct3;
|
||||
float err2, err3;
|
||||
@@ -611,10 +580,8 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
|
||||
if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
|
||||
|
||||
correct2 =
|
||||
reciprocal ? func.f_f(0.0) : func.f_ff(s[j], 0.0);
|
||||
correct3 =
|
||||
reciprocal ? func.f_f(-0.0) : func.f_ff(s[j], -0.0);
|
||||
correct2 = func.f_ff(s[j], 0.0);
|
||||
correct3 = func.f_ff(s[j], -0.0);
|
||||
|
||||
// Per section 10 paragraph 6, accept any result if an
|
||||
// input or output is a infinity or NaN or overflow
|
||||
@@ -647,6 +614,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (fabsf(err) > tinfo->maxError)
|
||||
{
|
||||
tinfo->maxError = fabsf(err);
|
||||
|
||||
@@ -120,12 +120,6 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
std::vector<float> s(0), s2(0);
|
||||
RoundingMode oldRoundMode;
|
||||
|
||||
bool reciprocal = strcmp(name, "reciprocal") == 0;
|
||||
const cl_half reciprocalArrayHalfX[] = { 0x3c00 };
|
||||
const cl_half *specialValuesHalfX =
|
||||
reciprocal ? reciprocalArrayHalfX : specialValuesHalf;
|
||||
size_t specialValuesHalfCountX = reciprocal ? 1 : specialValuesHalfCount;
|
||||
|
||||
cl_event e[VECTOR_SIZE_COUNT];
|
||||
cl_half *out[VECTOR_SIZE_COUNT];
|
||||
|
||||
@@ -154,7 +148,7 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
cl_half *p2 = (cl_half *)gIn2 + thread_id * buffer_elements;
|
||||
cl_uint idx = 0;
|
||||
int totalSpecialValueCount =
|
||||
specialValuesHalfCountX * specialValuesHalfCount;
|
||||
specialValuesHalfCount * specialValuesHalfCount;
|
||||
int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
|
||||
|
||||
if (job_id <= (cl_uint)lastSpecialJobIndex)
|
||||
@@ -162,15 +156,14 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
// Insert special values
|
||||
uint32_t x, y;
|
||||
|
||||
x = (job_id * buffer_elements) % specialValuesHalfCountX;
|
||||
x = (job_id * buffer_elements) % specialValuesHalfCount;
|
||||
y = (job_id * buffer_elements) / specialValuesHalfCount;
|
||||
|
||||
for (; idx < buffer_elements; idx++)
|
||||
{
|
||||
p[idx] = specialValuesHalfX[x];
|
||||
p[idx] = specialValuesHalf[x];
|
||||
p2[idx] = specialValuesHalf[y];
|
||||
++x;
|
||||
if (x >= specialValuesHalfCountX)
|
||||
if (++x >= specialValuesHalfCount)
|
||||
{
|
||||
x = 0;
|
||||
y++;
|
||||
@@ -182,8 +175,7 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
// Init any remaining values
|
||||
for (; idx < buffer_elements; idx++)
|
||||
{
|
||||
p[idx] = reciprocal ? ((cl_half *)specialValuesHalfX)[0]
|
||||
: (cl_half)genrand_int32(d);
|
||||
p[idx] = (cl_half)genrand_int32(d);
|
||||
p2[idx] = (cl_half)genrand_int32(d);
|
||||
}
|
||||
if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
|
||||
@@ -280,23 +272,11 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
s.resize(buffer_elements);
|
||||
s2.resize(buffer_elements);
|
||||
|
||||
if (reciprocal)
|
||||
for (size_t j = 0; j < buffer_elements; j++)
|
||||
{
|
||||
for (size_t j = 0; j < buffer_elements; j++)
|
||||
{
|
||||
s[j] = HTF(p[j]);
|
||||
s2[j] = HTF(p2[j]);
|
||||
r[j] = HFF(func.f_f(s2[j]));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (size_t j = 0; j < buffer_elements; j++)
|
||||
{
|
||||
s[j] = HTF(p[j]);
|
||||
s2[j] = HTF(p2[j]);
|
||||
r[j] = HFF(func.f_ff(s[j], s2[j]));
|
||||
}
|
||||
s[j] = HTF(p[j]);
|
||||
s2[j] = HTF(p2[j]);
|
||||
r[j] = HFF(func.f_ff(s[j], s2[j]));
|
||||
}
|
||||
|
||||
if (ftz) RestoreFPState(&oldMode);
|
||||
@@ -329,8 +309,7 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
if (r[j] != q[j])
|
||||
{
|
||||
float test = HTF(q[j]);
|
||||
float correct =
|
||||
reciprocal ? func.f_f(s2[j]) : func.f_ff(s[j], s2[j]);
|
||||
float correct = func.f_ff(s[j], s2[j]);
|
||||
|
||||
// Per section 10 paragraph 6, accept any result if an input or
|
||||
// output is a infinity or NaN or overflow
|
||||
@@ -456,10 +435,9 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
|
||||
double correct2, correct3;
|
||||
float err2, err3;
|
||||
|
||||
correct2 =
|
||||
reciprocal ? func.f_f(0.0) : func.f_ff(s[j], 0.0);
|
||||
correct3 =
|
||||
reciprocal ? func.f_f(-0.0) : func.f_ff(s[j], -0.0);
|
||||
correct2 = func.f_ff(s[j], 0.0);
|
||||
correct3 = func.f_ff(s[j], -0.0);
|
||||
|
||||
|
||||
// Per section 10 paragraph 6, accept any result if an
|
||||
// input or output is a infinity or NaN or overflow
|
||||
|
||||
@@ -427,9 +427,8 @@ const Func functionList[] = {
|
||||
// basic operations
|
||||
OPERATOR_ENTRY(add, "+", 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
|
||||
OPERATOR_ENTRY(subtract, "-", 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
|
||||
//ENTRY(reciprocal, 1.0f, 1.0f, FTZ_OFF, unaryF),
|
||||
{ "reciprocal",
|
||||
"/",
|
||||
"reciprocal",
|
||||
{ (void*)reference_reciprocal },
|
||||
{ (void*)reference_reciprocall },
|
||||
{ (void*)reference_relaxed_reciprocal },
|
||||
@@ -442,7 +441,7 @@ const Func functionList[] = {
|
||||
INFINITY,
|
||||
FTZ_OFF,
|
||||
RELAXED_ON,
|
||||
binaryOperatorF },
|
||||
unaryF},
|
||||
{ "divide",
|
||||
"/",
|
||||
{ (void*)reference_divide },
|
||||
|
||||
@@ -29,7 +29,12 @@ cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
|
||||
BuildKernelInfo &info = *(BuildKernelInfo *)p;
|
||||
auto generator = [](const std::string &kernel_name, const char *builtin,
|
||||
cl_uint vector_size_index) {
|
||||
return GetUnaryKernel(kernel_name, builtin, ParameterType::Double,
|
||||
const char *builtinCall = builtin;
|
||||
if (strcmp(builtin, "reciprocal") == 0)
|
||||
{
|
||||
builtinCall = "((RETTYPE)(1.0))/";
|
||||
}
|
||||
return GetUnaryKernel(kernel_name, builtinCall, ParameterType::Double,
|
||||
ParameterType::Double, vector_size_index);
|
||||
};
|
||||
return BuildKernels(info, job_id, generator);
|
||||
|
||||
@@ -28,7 +28,12 @@ cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
|
||||
BuildKernelInfo &info = *(BuildKernelInfo *)p;
|
||||
auto generator = [](const std::string &kernel_name, const char *builtin,
|
||||
cl_uint vector_size_index) {
|
||||
return GetUnaryKernel(kernel_name, builtin, ParameterType::Float,
|
||||
const char *builtinCall = builtin;
|
||||
if (strcmp(builtin, "reciprocal") == 0)
|
||||
{
|
||||
builtinCall = "((RETTYPE)(1.0f))/";
|
||||
}
|
||||
return GetUnaryKernel(kernel_name, builtinCall, ParameterType::Float,
|
||||
ParameterType::Float, vector_size_index);
|
||||
};
|
||||
return BuildKernels(info, job_id, generator);
|
||||
|
||||
@@ -28,7 +28,12 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
|
||||
BuildKernelInfo &info = *(BuildKernelInfo *)p;
|
||||
auto generator = [](const std::string &kernel_name, const char *builtin,
|
||||
cl_uint vector_size_index) {
|
||||
return GetUnaryKernel(kernel_name, builtin, ParameterType::Half,
|
||||
const char *builtinCall = builtin;
|
||||
if (strcmp(builtin, "reciprocal") == 0)
|
||||
{
|
||||
builtinCall = "((RETTYPE)(1.0h))/";
|
||||
}
|
||||
return GetUnaryKernel(kernel_name, builtinCall, ParameterType::Half,
|
||||
ParameterType::Half, vector_size_index);
|
||||
};
|
||||
return BuildKernels(info, job_id, generator);
|
||||
|
||||
Reference in New Issue
Block a user