mirror of
https://github.com/KhronosGroup/OpenCL-CTS.git
synced 2026-03-19 06:09:01 +00:00
Simplify code by returning directly instead of using goto statements. Although intended as an NFC commit, this changes the behaviour around clFlush calls. Before this commit, failure of the third clFlush call would print "clFlush 3 failed" and return the clFlush error code. This behaviour is inconsistent with the other clFlush calls in math_brute_force, which are not fatal. The lack of a `goto exit` makes me suspect that this 3rd clFlush call was intended to be non-fatal too. As such, this commit makes all clFlush calls non-fatal by returning `CL_SUCCESS` even when the third clFlush call fails. Original patch by Marco Antognini. Signed-off-by: Marco Antognini <marco.antognini@arm.com> Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>
442 lines
15 KiB
C++
442 lines
15 KiB
C++
//
|
|
// Copyright (c) 2017 The Khronos Group Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
|
|
#include "common.h"
|
|
#include "function_list.h"
|
|
#include "test_functions.h"
|
|
#include "utility.h"
|
|
|
|
#include <cinttypes>
|
|
#include <cstring>
|
|
|
|
namespace {
|
|
|
|
cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
|
|
{
|
|
BuildKernelInfo &info = *(BuildKernelInfo *)p;
|
|
auto generator = [](const std::string &kernel_name, const char *builtin,
|
|
cl_uint vector_size_index) {
|
|
return GetUnaryKernel(kernel_name, builtin, ParameterType::Double,
|
|
ParameterType::Double, vector_size_index);
|
|
};
|
|
return BuildKernels(info, job_id, generator);
|
|
}
|
|
|
|
// Thread specific data for a worker thread
|
|
struct ThreadInfo
|
|
{
|
|
// Input and output buffers for the thread
|
|
clMemWrapper inBuf;
|
|
Buffers outBuf;
|
|
|
|
float maxError; // max error value. Init to 0.
|
|
double maxErrorValue; // position of the max error value. Init to 0.
|
|
|
|
// Per thread command queue to improve performance
|
|
clCommandQueueWrapper tQueue;
|
|
};
|
|
|
|
struct TestInfo
|
|
{
|
|
size_t subBufferSize; // Size of the sub-buffer in elements
|
|
const Func *f; // A pointer to the function info
|
|
|
|
// Programs for various vector sizes.
|
|
Programs programs;
|
|
|
|
// Thread-specific kernels for each vector size:
|
|
// k[vector_size][thread_id]
|
|
KernelMatrix k;
|
|
|
|
// Array of thread specific information
|
|
std::vector<ThreadInfo> tinfo;
|
|
|
|
cl_uint threadCount; // Number of worker threads
|
|
cl_uint jobCount; // Number of jobs
|
|
cl_uint step; // step between each chunk and the next.
|
|
cl_uint scale; // stride between individual test values
|
|
float ulps; // max_allowed ulps
|
|
int ftz; // non-zero if running in flush to zero mode
|
|
|
|
int isRangeLimited; // 1 if the function is only to be evaluated over a
|
|
// range
|
|
float half_sin_cos_tan_limit;
|
|
bool relaxedMode; // True if test is running in relaxed mode, false
|
|
// otherwise.
|
|
};
|
|
|
|
cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
|
|
{
|
|
TestInfo *job = (TestInfo *)data;
|
|
size_t buffer_elements = job->subBufferSize;
|
|
size_t buffer_size = buffer_elements * sizeof(cl_double);
|
|
cl_uint scale = job->scale;
|
|
cl_uint base = job_id * (cl_uint)job->step;
|
|
ThreadInfo *tinfo = &(job->tinfo[thread_id]);
|
|
float ulps = job->ulps;
|
|
dptr func = job->f->dfunc;
|
|
cl_int error;
|
|
int ftz = job->ftz;
|
|
bool relaxedMode = job->relaxedMode;
|
|
|
|
Force64BitFPUPrecision();
|
|
|
|
cl_event e[VECTOR_SIZE_COUNT];
|
|
cl_ulong *out[VECTOR_SIZE_COUNT];
|
|
if (gHostFill)
|
|
{
|
|
// start the map of the output arrays
|
|
for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
|
|
{
|
|
out[j] = (cl_ulong *)clEnqueueMapBuffer(
|
|
tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
|
|
buffer_size, 0, NULL, e + j, &error);
|
|
if (error || NULL == out[j])
|
|
{
|
|
vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
|
|
error);
|
|
return error;
|
|
}
|
|
}
|
|
|
|
// Get that moving
|
|
if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
|
|
}
|
|
|
|
// Write the new values to the input array
|
|
cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
|
|
for (size_t j = 0; j < buffer_elements; j++)
|
|
p[j] = DoubleFromUInt32(base + j * scale);
|
|
|
|
if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
|
|
buffer_size, p, 0, NULL, NULL)))
|
|
{
|
|
vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
|
|
return error;
|
|
}
|
|
|
|
for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
|
|
{
|
|
if (gHostFill)
|
|
{
|
|
// Wait for the map to finish
|
|
if ((error = clWaitForEvents(1, e + j)))
|
|
{
|
|
vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
|
|
return error;
|
|
}
|
|
if ((error = clReleaseEvent(e[j])))
|
|
{
|
|
vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
|
|
return error;
|
|
}
|
|
}
|
|
|
|
// Fill the result buffer with garbage, so that old results don't carry
|
|
// over
|
|
uint32_t pattern = 0xffffdead;
|
|
if (gHostFill)
|
|
{
|
|
memset_pattern4(out[j], &pattern, buffer_size);
|
|
if ((error = clEnqueueUnmapMemObject(
|
|
tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)))
|
|
{
|
|
vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
|
|
error);
|
|
return error;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if ((error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
|
|
&pattern, sizeof(pattern), 0,
|
|
buffer_size, 0, NULL, NULL)))
|
|
{
|
|
vlog_error("Error: clEnqueueFillBuffer failed! err: %d\n",
|
|
error);
|
|
return error;
|
|
}
|
|
}
|
|
|
|
// Run the kernel
|
|
size_t vectorCount =
|
|
(buffer_elements + sizeValues[j] - 1) / sizeValues[j];
|
|
cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
|
|
// own copy of the cl_kernel
|
|
cl_program program = job->programs[j];
|
|
|
|
if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
|
|
&tinfo->outBuf[j])))
|
|
{
|
|
LogBuildError(program);
|
|
return error;
|
|
}
|
|
if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
|
|
&tinfo->inBuf)))
|
|
{
|
|
LogBuildError(program);
|
|
return error;
|
|
}
|
|
|
|
if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
|
|
&vectorCount, NULL, 0, NULL, NULL)))
|
|
{
|
|
vlog_error("FAILED -- could not execute kernel\n");
|
|
return error;
|
|
}
|
|
}
|
|
|
|
|
|
// Get that moving
|
|
if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
|
|
|
|
if (gSkipCorrectnessTesting) return CL_SUCCESS;
|
|
|
|
// Calculate the correctly rounded reference result
|
|
cl_double *r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
|
|
cl_double *s = (cl_double *)p;
|
|
for (size_t j = 0; j < buffer_elements; j++)
|
|
r[j] = (cl_double)func.f_f(s[j]);
|
|
|
|
// Read the data back -- no need to wait for the first N-1 buffers but wait
|
|
// for the last buffer. This is an in order queue.
|
|
for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
|
|
{
|
|
cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
|
|
out[j] = (cl_ulong *)clEnqueueMapBuffer(
|
|
tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
|
|
buffer_size, 0, NULL, NULL, &error);
|
|
if (error || NULL == out[j])
|
|
{
|
|
vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
|
|
error);
|
|
return error;
|
|
}
|
|
}
|
|
|
|
// Verify data
|
|
cl_ulong *t = (cl_ulong *)r;
|
|
for (size_t j = 0; j < buffer_elements; j++)
|
|
{
|
|
for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
|
|
{
|
|
cl_ulong *q = out[k];
|
|
|
|
// If we aren't getting the correctly rounded result
|
|
if (t[j] != q[j])
|
|
{
|
|
cl_double test = ((cl_double *)q)[j];
|
|
long double correct = func.f_f(s[j]);
|
|
float err = Bruteforce_Ulp_Error_Double(test, correct);
|
|
int fail = !(fabsf(err) <= ulps);
|
|
|
|
if (fail)
|
|
{
|
|
if (ftz || relaxedMode)
|
|
{
|
|
// retry per section 6.5.3.2
|
|
if (IsDoubleResultSubnormal(correct, ulps))
|
|
{
|
|
fail = fail && (test != 0.0f);
|
|
if (!fail) err = 0.0f;
|
|
}
|
|
|
|
// retry per section 6.5.3.3
|
|
if (IsDoubleSubnormal(s[j]))
|
|
{
|
|
long double correct2 = func.f_f(0.0L);
|
|
long double correct3 = func.f_f(-0.0L);
|
|
float err2 =
|
|
Bruteforce_Ulp_Error_Double(test, correct2);
|
|
float err3 =
|
|
Bruteforce_Ulp_Error_Double(test, correct3);
|
|
fail = fail
|
|
&& ((!(fabsf(err2) <= ulps))
|
|
&& (!(fabsf(err3) <= ulps)));
|
|
if (fabsf(err2) < fabsf(err)) err = err2;
|
|
if (fabsf(err3) < fabsf(err)) err = err3;
|
|
|
|
// retry per section 6.5.3.4
|
|
if (IsDoubleResultSubnormal(correct2, ulps)
|
|
|| IsDoubleResultSubnormal(correct3, ulps))
|
|
{
|
|
fail = fail && (test != 0.0f);
|
|
if (!fail) err = 0.0f;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (fabsf(err) > tinfo->maxError)
|
|
{
|
|
tinfo->maxError = fabsf(err);
|
|
tinfo->maxErrorValue = s[j];
|
|
}
|
|
if (fail)
|
|
{
|
|
vlog_error("\nERROR: %s%s: %f ulp error at %.13la "
|
|
"(0x%16.16" PRIx64 "): *%.13la vs. %.13la\n",
|
|
job->f->name, sizeNames[k], err,
|
|
((cl_double *)gIn)[j], ((cl_ulong *)gIn)[j],
|
|
((cl_double *)gOut_Ref)[j], test);
|
|
return -1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
|
|
{
|
|
if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
|
|
out[j], 0, NULL, NULL)))
|
|
{
|
|
vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
|
|
j, error);
|
|
return error;
|
|
}
|
|
}
|
|
|
|
if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
|
|
|
|
|
|
if (0 == (base & 0x0fffffff))
|
|
{
|
|
if (gVerboseBruteForce)
|
|
{
|
|
vlog("base:%14u step:%10u scale:%10zd buf_elements:%10u ulps:%5.3f "
|
|
"ThreadCount:%2u\n",
|
|
base, job->step, buffer_elements, job->scale, job->ulps,
|
|
job->threadCount);
|
|
}
|
|
else
|
|
{
|
|
vlog(".");
|
|
}
|
|
fflush(stdout);
|
|
}
|
|
|
|
return CL_SUCCESS;
|
|
}
|
|
|
|
} // anonymous namespace
|
|
|
|
int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
|
|
{
|
|
TestInfo test_info{};
|
|
cl_int error;
|
|
float maxError = 0.0f;
|
|
double maxErrorVal = 0.0;
|
|
|
|
logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
|
|
// Init test_info
|
|
test_info.threadCount = GetThreadCount();
|
|
test_info.subBufferSize = BUFFER_SIZE
|
|
/ (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
|
|
test_info.scale = getTestScale(sizeof(cl_double));
|
|
|
|
test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
|
|
if (test_info.step / test_info.subBufferSize != test_info.scale)
|
|
{
|
|
// there was overflow
|
|
test_info.jobCount = 1;
|
|
}
|
|
else
|
|
{
|
|
test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
|
|
}
|
|
|
|
test_info.f = f;
|
|
test_info.ulps = f->double_ulps;
|
|
test_info.ftz = f->ftz || gForceFTZ;
|
|
test_info.relaxedMode = relaxedMode;
|
|
|
|
test_info.tinfo.resize(test_info.threadCount);
|
|
for (cl_uint i = 0; i < test_info.threadCount; i++)
|
|
{
|
|
cl_buffer_region region = {
|
|
i * test_info.subBufferSize * sizeof(cl_double),
|
|
test_info.subBufferSize * sizeof(cl_double)
|
|
};
|
|
test_info.tinfo[i].inBuf =
|
|
clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
|
|
CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
|
|
if (error || NULL == test_info.tinfo[i].inBuf)
|
|
{
|
|
vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
|
|
"region {%zd, %zd}\n",
|
|
region.origin, region.size);
|
|
return error;
|
|
}
|
|
|
|
for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
|
|
{
|
|
test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
|
|
gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
|
|
®ion, &error);
|
|
if (error || NULL == test_info.tinfo[i].outBuf[j])
|
|
{
|
|
vlog_error("Error: Unable to create sub-buffer of "
|
|
"gOutBuffer[%d] for region {%zd, %zd}\n",
|
|
(int)j, region.origin, region.size);
|
|
return error;
|
|
}
|
|
}
|
|
test_info.tinfo[i].tQueue =
|
|
clCreateCommandQueue(gContext, gDevice, 0, &error);
|
|
if (NULL == test_info.tinfo[i].tQueue || error)
|
|
{
|
|
vlog_error("clCreateCommandQueue failed. (%d)\n", error);
|
|
return error;
|
|
}
|
|
}
|
|
|
|
// Init the kernels
|
|
BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
|
|
test_info.programs, f->nameInCode,
|
|
relaxedMode };
|
|
if ((error = ThreadPool_Do(BuildKernelFn,
|
|
gMaxVectorSizeIndex - gMinVectorSizeIndex,
|
|
&build_info)))
|
|
return error;
|
|
|
|
// Run the kernels
|
|
if (!gSkipCorrectnessTesting)
|
|
{
|
|
error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
|
|
if (error) return error;
|
|
|
|
// Accumulate the arithmetic errors
|
|
for (cl_uint i = 0; i < test_info.threadCount; i++)
|
|
{
|
|
if (test_info.tinfo[i].maxError > maxError)
|
|
{
|
|
maxError = test_info.tinfo[i].maxError;
|
|
maxErrorVal = test_info.tinfo[i].maxErrorValue;
|
|
}
|
|
}
|
|
|
|
if (gWimpyMode)
|
|
vlog("Wimp pass");
|
|
else
|
|
vlog("passed");
|
|
|
|
vlog("\t%8.2f @ %a", maxError, maxErrorVal);
|
|
}
|
|
|
|
vlog("\n");
|
|
|
|
return CL_SUCCESS;
|
|
}
|