Fix ODR violations in math_brute_force (#1255)

A program having a type (such as ThreadInfo) defined differently in
multiple translation units exhibits undefined behaviour.

This commit fixes such issues in the math_brute_force component by
ensuring most types are local to their translation unit with the help of
anonymous namespaces. Later refactoring will be able to extract common
definitions to a single place.

This patch also removes unnecessary static and typedef keywords.
Otherwise, code is only moved around with no change.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
This commit is contained in:
Marco Antognini
2021-05-21 10:07:54 +01:00
committed by GitHub
parent ce1687a408
commit ba9312e4a2
27 changed files with 2400 additions and 2321 deletions

View File

@@ -20,10 +20,12 @@
#include <cstring>
namespace {
const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022);
static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
cl_kernel *k, cl_program *p, bool relaxedMode)
int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
cl_kernel *k, cl_program *p, bool relaxedMode)
{
const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
"__kernel void math_kernel",
@@ -109,7 +111,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
relaxedMode);
}
typedef struct BuildKernelInfo
struct BuildKernelInfo
{
cl_uint offset; // the first vector size to build
cl_uint kernel_count;
@@ -117,9 +119,9 @@ typedef struct BuildKernelInfo
cl_program *programs;
const char *nameInCode;
bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
} BuildKernelInfo;
};
static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
{
BuildKernelInfo *info = (BuildKernelInfo *)p;
cl_uint i = info->offset + job_id;
@@ -128,7 +130,7 @@ static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
}
// Thread specific data for a worker thread
typedef struct ThreadInfo
struct ThreadInfo
{
cl_mem inBuf; // input buffer for the thread
cl_mem inBuf2; // input buffer for the thread
@@ -140,9 +142,9 @@ typedef struct ThreadInfo
// to 0.
MTdata d;
cl_command_queue tQueue; // per thread command queue to improve performance
} ThreadInfo;
};
typedef struct TestInfo
struct TestInfo
{
size_t subBufferSize; // Size of the sub-buffer in elements
const Func *f; // A pointer to the function info
@@ -164,10 +166,10 @@ typedef struct TestInfo
int isNextafter;
bool relaxedMode; // True if test is running in relaxed mode, false
// otherwise.
} TestInfo;
};
// A table of more difficult cases to get right
static const double specialValues[] = {
const double specialValues[] = {
-NAN,
-INFINITY,
-DBL_MAX,
@@ -277,195 +279,10 @@ static const double specialValues[] = {
+0.0,
};
static size_t specialValuesCount =
constexpr size_t specialValuesCount =
sizeof(specialValues) / sizeof(specialValues[0]);
static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data);
// Test driver for the function described by `f`, working on cl_double data
// taken from two input buffers (gInBuffer and gInBuffer2).
//
// The function:
//   1. fills in a TestInfo (thread count, sub-buffer size, step, job count,
//      per-function flags),
//   2. allocates per-vector-size kernel arrays and per-thread ThreadInfo
//      records, creating one sub-buffer of each global buffer and one command
//      queue per thread,
//   3. builds the kernels through ThreadPool_Do(BuildKernelFn, ...),
//   4. unless gSkipCorrectnessTesting is set, runs Test through ThreadPool_Do
//      and logs the largest error reported by any thread,
//   5. releases everything it created at the `exit` label.
//
// Parameters:
//   f           - function descriptor; name, nameInCode, double_ulps and ftz
//                 are read here and the pointer is stored in test_info.f.
//   d           - random generator state; one value is drawn from it per
//                 thread to seed that thread's own generator.
//   relaxedMode - forwarded to logFunctionInfo and to the kernel build info.
//
// Returns the value left in `error`: CL_OUT_OF_HOST_MEMORY when a malloc
// fails, otherwise whatever the last OpenCL / ThreadPool_Do call stored.
// Nonzero values are treated as failure throughout this function.
int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
{
TestInfo test_info;
cl_int error;
float maxError = 0.0f;
double maxErrorVal = 0.0;
double maxErrorVal2 = 0.0;
logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
// Init test_info
// Zeroing first matters: the cleanup code at `exit` tests test_info.k[i] and
// test_info.tinfo against NULL to decide what has to be freed.
memset(&test_info, 0, sizeof(test_info));
test_info.threadCount = GetThreadCount();
// Elements per thread: BUFFER_SIZE divided by the element size times the
// thread count rounded up to a power of two.
test_info.subBufferSize = BUFFER_SIZE
/ (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
test_info.scale = getTestScale(sizeof(cl_double));
test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
// Dividing the product back detects a multiplication that did not fit.
// NOTE(review): this divides by subBufferSize; assumes it is never 0 - confirm.
if (test_info.step / test_info.subBufferSize != test_info.scale)
{
// there was overflow
test_info.jobCount = 1;
}
else
{
// Number of `step`-sized jobs in a range of 2^32 values.
test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
}
test_info.f = f;
test_info.ulps = f->double_ulps;
test_info.ftz = f->ftz || gForceFTZ;
// Flags set only when nameInCode is exactly "fdim" / "nextafter".
test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
test_info.skipNanInf = 0;
test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode);
// cl_kernels aren't thread safe, so we make one for each vector size for
// every thread
for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
{
size_t array_size = test_info.threadCount * sizeof(cl_kernel);
test_info.k[i] = (cl_kernel *)malloc(array_size);
if (NULL == test_info.k[i])
{
vlog_error("Error: Unable to allocate storage for kernels!\n");
error = CL_OUT_OF_HOST_MEMORY;
goto exit;
}
// Zeroed so every slot holds a null handle until the build fills it in.
memset(test_info.k[i], 0, array_size);
}
// One ThreadInfo record per thread, zero-filled before use.
test_info.tinfo =
(ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
if (NULL == test_info.tinfo)
{
vlog_error(
"Error: Unable to allocate storage for thread specific data.\n");
error = CL_OUT_OF_HOST_MEMORY;
goto exit;
}
memset(test_info.tinfo, 0,
test_info.threadCount * sizeof(*test_info.tinfo));
// Per-thread resources: sub-buffers of both inputs and of every output
// buffer, a command queue, and a private random generator.
for (cl_uint i = 0; i < test_info.threadCount; i++)
{
// Thread i gets the i-th slice; origin and size are expressed in bytes.
cl_buffer_region region = {
i * test_info.subBufferSize * sizeof(cl_double),
test_info.subBufferSize * sizeof(cl_double)
};
test_info.tinfo[i].inBuf =
clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
// NOTE(review): if the call returned NULL while leaving `error` at 0, this
// path would reach `exit` and return 0 - confirm that cannot happen.
// NOTE(review): region.origin/size are presumably size_t; "%zd" is the signed
// length specifier, "%zu" would be the matching one - confirm.
if (error || NULL == test_info.tinfo[i].inBuf)
{
vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
"region {%zd, %zd}\n",
region.origin, region.size);
goto exit;
}
test_info.tinfo[i].inBuf2 =
clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
if (error || NULL == test_info.tinfo[i].inBuf2)
{
vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
"region {%zd, %zd}\n",
region.origin, region.size);
goto exit;
}
// The same region is carved out of each output buffer in the tested range.
for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
{
test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
&region, &error);
if (error || NULL == test_info.tinfo[i].outBuf[j])
{
vlog_error("Error: Unable to create sub-buffer of "
"gOutBuffer[%d] for region {%zd, %zd}\n",
(int)j, region.origin, region.size);
goto exit;
}
}
test_info.tinfo[i].tQueue =
clCreateCommandQueue(gContext, gDevice, 0, &error);
if (NULL == test_info.tinfo[i].tQueue || error)
{
vlog_error("clCreateCommandQueue failed. (%d)\n", error);
goto exit;
}
// Seed this thread's generator with a value drawn from the caller's `d`.
test_info.tinfo[i].d = init_genrand(genrand_int32(d));
}
// Init the kernels
{
// First vector size to build is gMinVectorSizeIndex; kernel_count is the
// thread count, and the pool is given one job per vector size.
BuildKernelInfo build_info = {
gMinVectorSizeIndex, test_info.threadCount, test_info.k,
test_info.programs, f->nameInCode, relaxedMode
};
if ((error = ThreadPool_Do(BuildKernelFn,
gMaxVectorSizeIndex - gMinVectorSizeIndex,
&build_info)))
goto exit;
}
// Run the kernels
if (!gSkipCorrectnessTesting)
{
error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
// Accumulate the arithmetic errors
// Keeps the values from the thread with the largest maxError; done before
// the error check, but only logged when no error occurred.
for (cl_uint i = 0; i < test_info.threadCount; i++)
{
if (test_info.tinfo[i].maxError > maxError)
{
maxError = test_info.tinfo[i].maxError;
maxErrorVal = test_info.tinfo[i].maxErrorValue;
maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
}
}
if (error) goto exit;
if (gWimpyMode)
vlog("Wimp pass");
else
vlog("passed");
vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
}
vlog("\n");
// Reached on success and on every failure path above.
exit:
// Release
// NOTE(review): after an early failure some handles below are still zero;
// assumes the release functions and free_mtdata tolerate that - confirm.
for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
{
clReleaseProgram(test_info.programs[i]);
if (test_info.k[i])
{
for (cl_uint j = 0; j < test_info.threadCount; j++)
clReleaseKernel(test_info.k[i][j]);
free(test_info.k[i]);
}
}
if (test_info.tinfo)
{
for (cl_uint i = 0; i < test_info.threadCount; i++)
{
free_mtdata(test_info.tinfo[i].d);
clReleaseMemObject(test_info.tinfo[i].inBuf);
clReleaseMemObject(test_info.tinfo[i].inBuf2);
for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
clReleaseCommandQueue(test_info.tinfo[i].tQueue);
}
free(test_info.tinfo);
}
return error;
}
static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
{
const TestInfo *job = (const TestInfo *)data;
size_t buffer_elements = job->subBufferSize;
@@ -825,3 +642,188 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
exit:
return error;
}
} // anonymous namespace
// Test driver for the function described by `f`, working on cl_double data
// taken from two input buffers (gInBuffer and gInBuffer2).
//
// The function:
//   1. fills in a TestInfo (thread count, sub-buffer size, step, job count,
//      per-function flags),
//   2. allocates per-vector-size kernel arrays and per-thread ThreadInfo
//      records, creating one sub-buffer of each global buffer and one command
//      queue per thread,
//   3. builds the kernels through ThreadPool_Do(BuildKernelFn, ...),
//   4. unless gSkipCorrectnessTesting is set, runs Test through ThreadPool_Do
//      and logs the largest error reported by any thread,
//   5. releases everything it created at the `exit` label.
//
// Parameters:
//   f           - function descriptor; name, nameInCode, double_ulps and ftz
//                 are read here and the pointer is stored in test_info.f.
//   d           - random generator state; one value is drawn from it per
//                 thread to seed that thread's own generator.
//   relaxedMode - forwarded to logFunctionInfo and to the kernel build info.
//
// Returns the value left in `error`: CL_OUT_OF_HOST_MEMORY when a malloc
// fails, otherwise whatever the last OpenCL / ThreadPool_Do call stored.
// Nonzero values are treated as failure throughout this function.
int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
{
TestInfo test_info;
cl_int error;
float maxError = 0.0f;
double maxErrorVal = 0.0;
double maxErrorVal2 = 0.0;
logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
// Init test_info
// Zeroing first matters: the cleanup code at `exit` tests test_info.k[i] and
// test_info.tinfo against NULL to decide what has to be freed.
memset(&test_info, 0, sizeof(test_info));
test_info.threadCount = GetThreadCount();
// Elements per thread: BUFFER_SIZE divided by the element size times the
// thread count rounded up to a power of two.
test_info.subBufferSize = BUFFER_SIZE
/ (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
test_info.scale = getTestScale(sizeof(cl_double));
test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
// Dividing the product back detects a multiplication that did not fit.
// NOTE(review): this divides by subBufferSize; assumes it is never 0 - confirm.
if (test_info.step / test_info.subBufferSize != test_info.scale)
{
// there was overflow
test_info.jobCount = 1;
}
else
{
// Number of `step`-sized jobs in a range of 2^32 values.
test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
}
test_info.f = f;
test_info.ulps = f->double_ulps;
test_info.ftz = f->ftz || gForceFTZ;
// Flags set only when nameInCode is exactly "fdim" / "nextafter".
test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
test_info.skipNanInf = 0;
test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode);
// cl_kernels aren't thread safe, so we make one for each vector size for
// every thread
for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
{
size_t array_size = test_info.threadCount * sizeof(cl_kernel);
test_info.k[i] = (cl_kernel *)malloc(array_size);
if (NULL == test_info.k[i])
{
vlog_error("Error: Unable to allocate storage for kernels!\n");
error = CL_OUT_OF_HOST_MEMORY;
goto exit;
}
// Zeroed so every slot holds a null handle until the build fills it in.
memset(test_info.k[i], 0, array_size);
}
// One ThreadInfo record per thread, zero-filled before use.
test_info.tinfo =
(ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
if (NULL == test_info.tinfo)
{
vlog_error(
"Error: Unable to allocate storage for thread specific data.\n");
error = CL_OUT_OF_HOST_MEMORY;
goto exit;
}
memset(test_info.tinfo, 0,
test_info.threadCount * sizeof(*test_info.tinfo));
// Per-thread resources: sub-buffers of both inputs and of every output
// buffer, a command queue, and a private random generator.
for (cl_uint i = 0; i < test_info.threadCount; i++)
{
// Thread i gets the i-th slice; origin and size are expressed in bytes.
cl_buffer_region region = {
i * test_info.subBufferSize * sizeof(cl_double),
test_info.subBufferSize * sizeof(cl_double)
};
test_info.tinfo[i].inBuf =
clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
// NOTE(review): if the call returned NULL while leaving `error` at 0, this
// path would reach `exit` and return 0 - confirm that cannot happen.
// NOTE(review): region.origin/size are presumably size_t; "%zd" is the signed
// length specifier, "%zu" would be the matching one - confirm.
if (error || NULL == test_info.tinfo[i].inBuf)
{
vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
"region {%zd, %zd}\n",
region.origin, region.size);
goto exit;
}
test_info.tinfo[i].inBuf2 =
clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
if (error || NULL == test_info.tinfo[i].inBuf2)
{
vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
"region {%zd, %zd}\n",
region.origin, region.size);
goto exit;
}
// The same region is carved out of each output buffer in the tested range.
for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
{
test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
&region, &error);
if (error || NULL == test_info.tinfo[i].outBuf[j])
{
vlog_error("Error: Unable to create sub-buffer of "
"gOutBuffer[%d] for region {%zd, %zd}\n",
(int)j, region.origin, region.size);
goto exit;
}
}
test_info.tinfo[i].tQueue =
clCreateCommandQueue(gContext, gDevice, 0, &error);
if (NULL == test_info.tinfo[i].tQueue || error)
{
vlog_error("clCreateCommandQueue failed. (%d)\n", error);
goto exit;
}
// Seed this thread's generator with a value drawn from the caller's `d`.
test_info.tinfo[i].d = init_genrand(genrand_int32(d));
}
// Init the kernels
{
// First vector size to build is gMinVectorSizeIndex; kernel_count is the
// thread count, and the pool is given one job per vector size.
BuildKernelInfo build_info = {
gMinVectorSizeIndex, test_info.threadCount, test_info.k,
test_info.programs, f->nameInCode, relaxedMode
};
if ((error = ThreadPool_Do(BuildKernelFn,
gMaxVectorSizeIndex - gMinVectorSizeIndex,
&build_info)))
goto exit;
}
// Run the kernels
if (!gSkipCorrectnessTesting)
{
error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
// Accumulate the arithmetic errors
// Keeps the values from the thread with the largest maxError; done before
// the error check, but only logged when no error occurred.
for (cl_uint i = 0; i < test_info.threadCount; i++)
{
if (test_info.tinfo[i].maxError > maxError)
{
maxError = test_info.tinfo[i].maxError;
maxErrorVal = test_info.tinfo[i].maxErrorValue;
maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
}
}
if (error) goto exit;
if (gWimpyMode)
vlog("Wimp pass");
else
vlog("passed");
vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
}
vlog("\n");
// Reached on success and on every failure path above.
exit:
// Release
// NOTE(review): after an early failure some handles below are still zero;
// assumes the release functions and free_mtdata tolerate that - confirm.
for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
{
clReleaseProgram(test_info.programs[i]);
if (test_info.k[i])
{
for (cl_uint j = 0; j < test_info.threadCount; j++)
clReleaseKernel(test_info.k[i][j]);
free(test_info.k[i]);
}
}
if (test_info.tinfo)
{
for (cl_uint i = 0; i < test_info.threadCount; i++)
{
free_mtdata(test_info.tinfo[i].d);
clReleaseMemObject(test_info.tinfo[i].inBuf);
clReleaseMemObject(test_info.tinfo[i].inBuf2);
for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
clReleaseCommandQueue(test_info.tinfo[i].tQueue);
}
free(test_info.tinfo);
}
return error;
}