diff --git a/test_conformance/math_brute_force/CMakeLists.txt b/test_conformance/math_brute_force/CMakeLists.txt index 957233ca..96433945 100644 --- a/test_conformance/math_brute_force/CMakeLists.txt +++ b/test_conformance/math_brute_force/CMakeLists.txt @@ -1,23 +1,36 @@ set(MODULE_NAME BRUTEFORCE) set(${MODULE_NAME}_SOURCES - binary.cpp - binary_i.cpp - binary_operator.cpp - binary_two_results_i.cpp + binary_double.cpp + binary_float.cpp + binary_i_double.cpp + binary_i_float.cpp + binary_operator_double.cpp + binary_operator_float.cpp + binary_two_results_i_double.cpp + binary_two_results_i_float.cpp function_list.cpp - i_unary.cpp - macro_binary.cpp - macro_unary.cpp - mad.cpp + i_unary_double.cpp + i_unary_float.cpp + macro_binary_double.cpp + macro_binary_float.cpp + macro_unary_double.cpp + macro_unary_float.cpp + mad_double.cpp + mad_float.cpp main.cpp reference_math.cpp sleep.cpp - ternary.cpp - unary.cpp - unary_two_results.cpp - unary_two_results_i.cpp - unary_u.cpp + ternary_double.cpp + ternary_float.cpp + unary_double.cpp + unary_float.cpp + unary_two_results_double.cpp + unary_two_results_float.cpp + unary_two_results_i_double.cpp + unary_two_results_i_float.cpp + unary_u_double.cpp + unary_u_float.cpp utility.cpp ) diff --git a/test_conformance/math_brute_force/binary_double.cpp b/test_conformance/math_brute_force/binary_double.cpp new file mode 100644 index 00000000..7bff9aca --- /dev/null +++ b/test_conformance/math_brute_force/binary_double.cpp @@ -0,0 +1,947 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <string.h> // strcmp / memset used below (header name restored; TODO confirm against pre-image)
+
+const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022);
+
+static int BuildKernelDouble(const char *name, int vectorSize,
+                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
+                             bool relaxedMode)
+{
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in1, __global double",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        " size_t i = get_global_id(0);\n"
+                        " out[i] = ",
+                        name,
+                        "( in1[i], in2[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* in, __global double* in2)\n"
+        "{\n"
+        " size_t i = get_global_id(0);\n"
+        " if( i + 1 < get_global_size(0) )\n"
+        " {\n"
+        " double3 d0 = vload3( 0, in + 3 * i );\n"
+        " double3 d1 = vload3( 0, in2 + 3 * i );\n"
+        " d0 = ",
+        name,
+        "( d0, d1 );\n"
+        " vstore3( d0, 0, out + 3*i );\n"
+        " }\n"
+        " else\n"
+        " {\n"
+        " size_t parity = i & 1; // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " double3 d0;\n" + " double3 d1;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " d0 = (double3)( in[3*i], NAN, NAN ); \n" + " d1 = (double3)( in2[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" + " d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " d0 = ", + name, + "( d0, d1 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = d0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = d0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); + + return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, + relaxedMode); +} + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_uint kernel_count; + cl_kernel **kernels; + cl_program *programs; + const char *nameInCode; + bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. +} BuildKernelInfo; + +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernelDouble(info->nameInCode, i, info->kernel_count, + info->kernels[i], info->programs + i, + info->relaxedMode); +} + +// Thread specific data for a worker thread +typedef struct ThreadInfo +{ + cl_mem inBuf; // input buffer for the thread + cl_mem inBuf2; // input buffer for the thread + cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + float maxError; // max error value. Init to 0. + double + maxErrorValue; // position of the max error value (param 1). Init to 0. 
+ double maxErrorValue2; // position of the max error value (param 2). Init + // to 0. + MTdata d; + cl_command_queue tQueue; // per thread command queue to improve performance +} ThreadInfo; + +typedef struct TestInfo +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes + cl_kernel + *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each + // worker thread: k[vector_size][thread_id] + ThreadInfo * + tinfo; // An array of thread specific information for each worker thread + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + float ulps; // max_allowed ulps + int ftz; // non-zero if running in flush to zero mode + + int isFDim; + int skipNanInf; + int isNextafter; + bool relaxedMode; // True if test is running in relaxed mode, false + // otherwise. 
+} TestInfo; + +// A table of more difficult cases to get right +static const double specialValuesDouble[] = { + -NAN, + -INFINITY, + -DBL_MAX, + MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), + MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11), + MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), + MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10), + MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), + MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), + MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), + MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), + -1000., + -100., + -4.0, + -3.5, + -3.0, + MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), + -2.5, + MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), + -2.0, + MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), + -1.5, + MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52), + MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), + -1.0, + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53), + MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), + -0.5, + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54), + MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), + -0.25, + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55), + MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), + -DBL_MIN, + MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), + MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), + MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), + 
MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
+    -0.0,
+
+    +NAN,
+    +INFINITY,
+    +DBL_MAX,
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
+    +1000.,
+    +100.,
+    +4.0,
+    +3.5,
+    +3.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
+    +2.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
+    +2.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
+    +1.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p0, +0x10000000000001LL, -52), // 1 + 1ulp; was a duplicate of the negative entry
+    +1.0,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, 
+0x10000000000001LL, -53), + +0.5, + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54), + MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), + +0.25, + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55), + MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), + +DBL_MIN, + MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), + MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), + +0.0, +}; + +static size_t specialValuesDoubleCount = + sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]); + +static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p); + +static int TestFunc_Double_Double_Double_common(const Func *f, MTdata d, + int isNextafter, + bool relaxedMode) +{ + TestInfo test_info; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; + double maxErrorVal2 = 0.0; + + logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); + + // Init test_info + memset(&test_info, 0, sizeof(test_info)); + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize 
= BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_double));
+
+    if (gWimpyMode)
+    {
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_double)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    }
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->double_ulps;
+    test_info.ftz = f->ftz || gForceFTZ;
+
+    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
+    test_info.skipNanInf = 0;
+    test_info.isNextafter = isNextafter;
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zu, %zu}\n",
+                           region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input arrays
+        double *p = (double *)gIn;
+        double *p2 = (double *)gIn2;
+        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
+        {
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+            p2[j] = DoubleFromUInt32(genrand_int32(d));
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // go through cleanup so kernels, sub-buffers and queues are released
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            goto exit; // same as above: do not bypass the release code
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if 
((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 2, + sizeof(gInBuffer2), &gInBuffer2))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + + double sum = 0.0; + double bestTime = INFINITY; + for (i = 0; i < PERF_LOOP_COUNT; i++) + { + uint64_t startTime = GetTime(); + if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], + 1, NULL, &localCount, NULL, + 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + + // Make sure OpenCL is done + if ((error = clFinish(gQueue))) + { + vlog_error("Error %d at clFinish\n", error); + goto exit; + } + + uint64_t endTime = GetTime(); + double time = SubtractTime(endTime, startTime); + sum += time; + if (time < bestTime) bestTime = time; + } + + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (BUFFER_SIZE / sizeof(double)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", + f->name, sizeNames[j]); + } + for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); + } + + if (!gSkipCorrectnessTesting) + vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); + vlog("\n"); + +exit: + // Release + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + clReleaseProgram(test_info.programs[i]); + if (test_info.k[i]) + { + for (j = 0; j < test_info.threadCount; j++) + clReleaseKernel(test_info.k[i][j]); + + free(test_info.k[i]); + } + } + if (test_info.tinfo) + { + for (i = 0; i < test_info.threadCount; i++) + { + free_mtdata(test_info.tinfo[i].d); + clReleaseMemObject(test_info.tinfo[i].inBuf); + clReleaseMemObject(test_info.tinfo[i].inBuf2); + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + clReleaseMemObject(test_info.tinfo[i].outBuf[j]); + 
clReleaseCommandQueue(test_info.tinfo[i].tQueue); + } + + free(test_info.tinfo); + } + + return error; +} + +static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data) +{ + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_double); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = job->tinfo + thread_id; + float ulps = job->ulps; + dptr func = job->f->dfunc; + int ftz = job->ftz; + MTdata d = tinfo->d; + cl_uint j, k; + cl_int error; + const char *name = job->f->name; + + int isNextafter = job->isNextafter; + cl_ulong *t; + cl_double *r; + cl_double *s; + cl_double *s2; + + Force64BitFPUPrecision(); + + // start the map of the output arrays + cl_event e[VECTOR_SIZE_COUNT]; + cl_ulong *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_ulong *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + + // Init input array + cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements; + cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements; + j = 0; + int totalSpecialValueCount = + specialValuesDoubleCount * specialValuesDoubleCount; + int indx = (totalSpecialValueCount - 1) / buffer_elements; + + if (job_id <= (cl_uint)indx) + { // test edge cases + cl_double *fp = (cl_double *)p; + cl_double *fp2 = (cl_double *)p2; + uint32_t x, y; + + x = (job_id * buffer_elements) % specialValuesDoubleCount; + y = (job_id * buffer_elements) / specialValuesDoubleCount; + + for (; j < buffer_elements; j++) + { + fp[j] = specialValuesDouble[x]; + fp2[j] = specialValuesDouble[y]; + if (++x >= specialValuesDoubleCount) + { + x = 0; + y++; + if (y >= specialValuesDoubleCount) break; + } + } + } + + // Init any remaining values. + for (; j < buffer_elements; j++) + { + p[j] = genrand_int64(d); + p2[j] = genrand_int64(d); + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + goto exit; + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size, p2, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + goto exit; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + goto exit; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! 
err: %d\n", error); + goto exit; + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint32_t pattern = 0xffffdead; + memset_pattern4(out[j], &pattern, buffer_size); + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + goto exit; + } + + // run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) return CL_SUCCESS; + + // Calculate the correctly rounded reference result + r = (cl_double *)gOut_Ref + thread_id * buffer_elements; + s = (cl_double *)gIn + thread_id * buffer_elements; + s2 = (cl_double *)gIn2 + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) + r[j] = (cl_double)func.f_ff(s[j], s2[j]); + + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. 
+ for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_ulong *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + goto exit; + } + } + + // Wait for the last buffer + out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); + goto exit; + } + + // Verify data + t = (cl_ulong *)r; + for (j = 0; j < buffer_elements; j++) + { + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_ulong *q = out[k]; + + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + cl_double test = ((cl_double *)q)[j]; + long double correct = func.f_ff(s[j], s2[j]); + float err = Bruteforce_Ulp_Error_Double(test, correct); + int fail = !(fabsf(err) <= ulps); + + if (fail && ftz) + { + // retry per section 6.5.3.2 + if (IsDoubleResultSubnormal(correct, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // nextafter on FTZ platforms may return the smallest + // normal float (2^-126) given a denormal or a zero + // as the first argument. The rationale here is that + // nextafter flushes the argument to zero and then + // returns the next representable number in the + // direction of the second argument, and since + // denorms are considered as zero, the smallest + // normal number is the next representable number. + // In which case, it should have the same sign as the + // second argument. 
+ if (isNextafter) + { + if (IsDoubleSubnormal(s[j]) || s[j] == 0.0f) + { + cl_double value = copysign(twoToMinus1022, s2[j]); + fail = fail && (test != value); + if (!fail) err = 0.0f; + } + } + else + { + // retry per section 6.5.3.3 + if (IsDoubleSubnormal(s[j])) + { + long double correct2 = func.f_ff(0.0, s2[j]); + long double correct3 = func.f_ff(-0.0, s2[j]); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsDoubleResultSubnormal(correct2, ulps) + || IsDoubleResultSubnormal(correct3, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // try with both args as zero + if (IsDoubleSubnormal(s2[j])) + { + correct2 = func.f_ff(0.0, 0.0); + correct3 = func.f_ff(-0.0, 0.0); + long double correct4 = func.f_ff(0.0, -0.0); + long double correct5 = func.f_ff(-0.0, -0.0); + err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + float err4 = + Bruteforce_Ulp_Error_Double(test, correct4); + float err5 = + Bruteforce_Ulp_Error_Double(test, correct5); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps)) + && (!(fabsf(err4) <= ulps)) + && (!(fabsf(err5) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + + // retry per section 6.5.3.4 + if (IsDoubleResultSubnormal(correct2, ulps) + || IsDoubleResultSubnormal(correct3, ulps) + || IsDoubleResultSubnormal(correct4, ulps) + || IsDoubleResultSubnormal(correct5, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + else if (IsDoubleSubnormal(s2[j])) + { + long double correct2 = 
func.f_ff(s[j], 0.0); + long double correct3 = func.f_ff(s[j], -0.0); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsDoubleResultSubnormal(correct2, ulps) + || IsDoubleResultSubnormal(correct3, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + } + + if (fabsf(err) > tinfo->maxError) + { + tinfo->maxError = fabsf(err); + tinfo->maxErrorValue = s[j]; + tinfo->maxErrorValue2 = s2[j]; + } + if (fail) + { + vlog_error("\nERROR: %s%s: %f ulp error at {%.13la, " + "%.13la}: *%.13la vs. %.13la\n", + name, sizeNames[k], err, s[j], s2[j], r[j], + test); + error = -1; + goto exit; + } + } + } + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! 
err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount); // scale is cl_uint (%u); buffer_elements is size_t (%zu)
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+exit:
+    return error;
+}
+
+int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
+{
+    return TestFunc_Double_Double_Double_common(f, d, 0, relaxedMode);
+}
+
+int TestFunc_Double_Double_Double_nextafter(const Func *f, MTdata d,
+                                            bool relaxedMode)
+{
+    return TestFunc_Double_Double_Double_common(f, d, 1, relaxedMode);
+}
diff --git a/test_conformance/math_brute_force/binary.cpp b/test_conformance/math_brute_force/binary_float.cpp
similarity index 55%
rename from test_conformance/math_brute_force/binary.cpp
rename to test_conformance/math_brute_force/binary_float.cpp
index 699c0944..0ad7b87a 100644
--- a/test_conformance/math_brute_force/binary.cpp
+++ b/test_conformance/math_brute_force/binary_float.cpp
@@ -21,7 +21,6 @@
 #include
 
 const float twoToMinus126 = MAKE_HEX_FLOAT(0x1p-126f, 1, -126);
-const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022);
 
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
@@ -108,94 +107,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        relaxedMode);
 }
 
-static int BuildKernelDouble(const char *name, int vectorSize,
-                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                             bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global double",
-                        sizeNames[vectorSize],
-                        "* out, __global double",
-                        sizeNames[vectorSize],
-                        "* in1, __global double",
-                        sizeNames[vectorSize],
-                        "* in2 )\n"
-
"{\n" - " size_t i = get_global_id(0);\n" - " out[i] = ", - name, - "( in1[i], in2[i] );\n" - "}\n" }; - - const char *c3[] = { - "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global double* out, __global double* in, __global double* in2)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " double3 d0 = vload3( 0, in + 3 * i );\n" - " double3 d1 = vload3( 0, in2 + 3 * i );\n" - " d0 = ", - name, - "( d0, d1 );\n" - " vstore3( d0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two " - "buffer size \n" - " double3 d0;\n" - " double3 d1;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " d0 = (double3)( in[3*i], NAN, NAN ); \n" - " d1 = (double3)( in2[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" - " d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " d0 = ", - name, - "( d0, d1 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = d0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = d0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, - relaxedMode); -} - typedef struct BuildKernelInfo { cl_uint offset; // the first vector size to build @@ -215,16 +126,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, info->kernels[i], info->programs + i, info->relaxedMode); } -static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id 
UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernelDouble(info->nameInCode, i, info->kernel_count, - info->kernels[i], info->programs + i, - info->relaxedMode); -} - // A table of more difficult cases to get right static const float specialValuesFloat[] = { -NAN, @@ -1194,790 +1095,13 @@ exit: return error; } -// A table of more difficult cases to get right -static const double specialValuesDouble[] = { - -NAN, - -INFINITY, - -DBL_MAX, - MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), - MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), - MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11), - MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), - MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), - MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10), - MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), - MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), - MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), - MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), - MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), - MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), - -1000., - -100., - -4.0, - -3.5, - -3.0, - MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), - -2.5, - MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), - -2.0, - MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), - -1.5, - MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52), - MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), - -1.0, - MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53), - MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), - -0.5, - MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54), - MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), - -0.25, - MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55), - 
MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), - -DBL_MIN, - MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), - MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), - MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), - MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), - MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), - -0.0, - - +NAN, - +INFINITY, - +DBL_MAX, - MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), - MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), - MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11), - MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), - MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), - MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10), - MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8), - MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), - MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21), - MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21), - MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), - MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22), - +1000., - +100., - +4.0, - +3.5, - +3.0, - MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), - +2.5, - MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, 
+0x17ffffffffffffLL, -51), - +2.0, - MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), - +1.5, - MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52), - MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), - +1.0, - MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53), - MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53), - +0.5, - MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54), - MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), - +0.25, - MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55), - MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), - +DBL_MIN, - MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), - MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), - MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), - MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), - MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), - +0.0, -}; - -static size_t specialValuesDoubleCount = - sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]); - -static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p); - -static int TestFunc_Double_Double_Double_common(const Func *f, MTdata d, - int 
isNextafter, - bool relaxedMode) -{ - TestInfo test_info; - cl_int error; - size_t i, j; - float maxError = 0.0f; - double maxErrorVal = 0.0; - double maxErrorVal2 = 0.0; - - logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); - - // Init test_info - memset(&test_info, 0, sizeof(test_info)); - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_double)); - - if (gWimpyMode) - { - test_info.subBufferSize = gWimpyBufferSize - / (sizeof(cl_double) - * RoundUpToNextPowerOfTwo(test_info.threadCount)); - } - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ulps = f->double_ulps; - test_info.ftz = f->ftz || gForceFTZ; - - test_info.isFDim = 0 == strcmp("fdim", f->nameInCode); - test_info.skipNanInf = 0; - test_info.isNextafter = isNextafter; - // cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: Unable to allocate storage for thread specific data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); - for (i = 0; i < 
test_info.threadCount; i++) - { - cl_buffer_region region = { - i * test_info.subBufferSize * sizeof(cl_double), - test_info.subBufferSize * sizeof(cl_double) - }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - test_info.tinfo[i].inBuf2 = - clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if (error || NULL == test_info.tinfo[i].inBuf2) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - &region, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer " - "for region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed.
(%d)\n", error); - goto exit; - } - - test_info.tinfo[i].d = init_genrand(genrand_int32(d)); - } - - // Init the kernels - { - BuildKernelInfo build_info = { - gMinVectorSizeIndex, test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode, relaxedMode - }; - if ((error = ThreadPool_Do(BuildKernel_DoubleFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - goto exit; - } - - // Run the kernels - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info); - - // Accumulate the arithmetic errors - for (i = 0; i < test_info.threadCount; i++) - { - if (test_info.tinfo[i].maxError > maxError) - { - maxError = test_info.tinfo[i].maxError; - maxErrorVal = test_info.tinfo[i].maxErrorValue; - maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; - } - } - - if (error) goto exit; - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - if (gMeasureTimes) - { - // Init input arrays - double *p = (double *)gIn; - double *p2 = (double *)gIn2; - for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++) - { - p[j] = DoubleFromUInt32(genrand_int32(d)); - p2[j] = DoubleFromUInt32(genrand_int32(d)); - } - - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - BUFFER_SIZE, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); - return error; - } - - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, - BUFFER_SIZE, gIn2, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); - return error; - } - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - size_t vectorSize = sizeof(cl_double) * sizeValues[j]; - size_t localCount = (BUFFER_SIZE + vectorSize - 1) - / vectorSize; // BUFFER_SIZE / vectorSize rounded up - if ((error = clSetKernelArg(test_info.k[j][0], 0, - sizeof(gOutBuffer[j]), &gOutBuffer[j]))) - { - LogBuildError(test_info.programs[j]); - goto exit; - } - if 
((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), - &gInBuffer))) - { - LogBuildError(test_info.programs[j]); - goto exit; - } - if ((error = clSetKernelArg(test_info.k[j][0], 2, - sizeof(gInBuffer2), &gInBuffer2))) - { - LogBuildError(test_info.programs[j]); - goto exit; - } - - double sum = 0.0; - double bestTime = INFINITY; - for (i = 0; i < PERF_LOOP_COUNT; i++) - { - uint64_t startTime = GetTime(); - if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], - 1, NULL, &localCount, NULL, - 0, NULL, NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - - // Make sure OpenCL is done - if ((error = clFinish(gQueue))) - { - vlog_error("Error %d at clFinish\n", error); - goto exit; - } - - uint64_t endTime = GetTime(); - double time = SubtractTime(endTime, startTime); - sum += time; - if (time < bestTime) bestTime = time; - } - - if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double)gDeviceFrequency - * gComputeDevices * gSimdSize * 1e6 - / (BUFFER_SIZE / sizeof(double)); - vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", - f->name, sizeNames[j]); - } - for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); - } - - if (!gSkipCorrectnessTesting) - vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); - vlog("\n"); - -exit: - // Release - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (i = 0; i < test_info.threadCount; i++) - { - free_mtdata(test_info.tinfo[i].d); - clReleaseMemObject(test_info.tinfo[i].inBuf); - clReleaseMemObject(test_info.tinfo[i].inBuf2); - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - 
clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } - - return error; -} - -static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data) -{ - const TestInfo *job = (const TestInfo *)data; - size_t buffer_elements = job->subBufferSize; - size_t buffer_size = buffer_elements * sizeof(cl_double); - cl_uint base = job_id * (cl_uint)job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; - float ulps = job->ulps; - dptr func = job->f->dfunc; - int ftz = job->ftz; - MTdata d = tinfo->d; - cl_uint j, k; - cl_int error; - const char *name = job->f->name; - - int isNextafter = job->isNextafter; - cl_ulong *t; - cl_double *r; - cl_double *s; - cl_double *s2; - - Force64BitFPUPrecision(); - - // start the map of the output arrays - cl_event e[VECTOR_SIZE_COUNT]; - cl_ulong *out[VECTOR_SIZE_COUNT]; - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - out[j] = (cl_ulong *)clEnqueueMapBuffer( - tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, - buffer_size, 0, NULL, e + j, &error); - if (error || NULL == out[j]) - { - vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, - error); - return error; - } - } - - // Get that moving - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); - - // Init input array - cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements; - cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements; - j = 0; - int totalSpecialValueCount = - specialValuesDoubleCount * specialValuesDoubleCount; - int indx = (totalSpecialValueCount - 1) / buffer_elements; - - if (job_id <= (cl_uint)indx) - { // test edge cases - cl_double *fp = (cl_double *)p; - cl_double *fp2 = (cl_double *)p2; - uint32_t x, y; - - x = (job_id * buffer_elements) % specialValuesDoubleCount; - y = (job_id * buffer_elements) / specialValuesDoubleCount; - - for (; j < buffer_elements; j++) - { - fp[j] = specialValuesDouble[x]; - fp2[j] = specialValuesDouble[y]; - if (++x >= specialValuesDoubleCount) - { - x = 0; - y++; - if (y >= specialValuesDoubleCount) break; - } - } - } - - // Init any remaining values. - for (; j < buffer_elements; j++) - { - p[j] = genrand_int64(d); - p2[j] = genrand_int64(d); - } - - if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, - buffer_size, p, 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); - goto exit; - } - - if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, - buffer_size, p2, 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); - goto exit; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - // Wait for the map to finish - if ((error = clWaitForEvents(1, e + j))) - { - vlog_error("Error: clWaitForEvents failed! err: %d\n", error); - goto exit; - } - if ((error = clReleaseEvent(e[j]))) - { - vlog_error("Error: clReleaseEvent failed! 
err: %d\n", error); - goto exit; - } - - // Fill the result buffer with garbage, so that old results don't carry - // over - uint32_t pattern = 0xffffdead; - memset_pattern4(out[j], &pattern, buffer_size); - if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], - out[j], 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); - goto exit; - } - - // run the kernel - size_t vectorCount = - (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; - cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its - // own copy of the cl_kernel - cl_program program = job->programs[j]; - - if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), - &tinfo->outBuf[j]))) - { - LogBuildError(program); - return error; - } - if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), - &tinfo->inBuf))) - { - LogBuildError(program); - return error; - } - if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), - &tinfo->inBuf2))) - { - LogBuildError(program); - return error; - } - - if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, - &vectorCount, NULL, 0, NULL, NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - } - - // Get that moving - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); - - if (gSkipCorrectnessTesting) return CL_SUCCESS; - - // Calculate the correctly rounded reference result - r = (cl_double *)gOut_Ref + thread_id * buffer_elements; - s = (cl_double *)gIn + thread_id * buffer_elements; - s2 = (cl_double *)gIn2 + thread_id * buffer_elements; - for (j = 0; j < buffer_elements; j++) - r[j] = (cl_double)func.f_ff(s[j], s2[j]); - - // Read the data back -- no need to wait for the first N-1 buffers. This is - // an in order queue. 
- for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) - { - out[j] = (cl_ulong *)clEnqueueMapBuffer( - tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, - buffer_size, 0, NULL, NULL, &error); - if (error || NULL == out[j]) - { - vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, - error); - goto exit; - } - } - - // Wait for the last buffer - out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], - CL_TRUE, CL_MAP_READ, 0, - buffer_size, 0, NULL, NULL, &error); - if (error || NULL == out[j]) - { - vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); - goto exit; - } - - // Verify data - t = (cl_ulong *)r; - for (j = 0; j < buffer_elements; j++) - { - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - cl_ulong *q = out[k]; - - // If we aren't getting the correctly rounded result - if (t[j] != q[j]) - { - cl_double test = ((cl_double *)q)[j]; - long double correct = func.f_ff(s[j], s2[j]); - float err = Bruteforce_Ulp_Error_Double(test, correct); - int fail = !(fabsf(err) <= ulps); - - if (fail && ftz) - { - // retry per section 6.5.3.2 - if (IsDoubleResultSubnormal(correct, ulps)) - { - fail = fail && (test != 0.0f); - if (!fail) err = 0.0f; - } - - // nextafter on FTZ platforms may return the smallest - // normal float (2^-126) given a denormal or a zero - // as the first argument. The rationale here is that - // nextafter flushes the argument to zero and then - // returns the next representable number in the - // direction of the second argument, and since - // denorms are considered as zero, the smallest - // normal number is the next representable number. - // In which case, it should have the same sign as the - // second argument. 
- if (isNextafter) - { - if (IsDoubleSubnormal(s[j]) || s[j] == 0.0f) - { - cl_double value = copysign(twoToMinus1022, s2[j]); - fail = fail && (test != value); - if (!fail) err = 0.0f; - } - } - else - { - // retry per section 6.5.3.3 - if (IsDoubleSubnormal(s[j])) - { - long double correct2 = func.f_ff(0.0, s2[j]); - long double correct3 = func.f_ff(-0.0, s2[j]); - float err2 = - Bruteforce_Ulp_Error_Double(test, correct2); - float err3 = - Bruteforce_Ulp_Error_Double(test, correct3); - fail = fail - && ((!(fabsf(err2) <= ulps)) - && (!(fabsf(err3) <= ulps))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - - // retry per section 6.5.3.4 - if (IsDoubleResultSubnormal(correct2, ulps) - || IsDoubleResultSubnormal(correct3, ulps)) - { - fail = fail && (test != 0.0f); - if (!fail) err = 0.0f; - } - - // try with both args as zero - if (IsDoubleSubnormal(s2[j])) - { - correct2 = func.f_ff(0.0, 0.0); - correct3 = func.f_ff(-0.0, 0.0); - long double correct4 = func.f_ff(0.0, -0.0); - long double correct5 = func.f_ff(-0.0, -0.0); - err2 = - Bruteforce_Ulp_Error_Double(test, correct2); - err3 = - Bruteforce_Ulp_Error_Double(test, correct3); - float err4 = - Bruteforce_Ulp_Error_Double(test, correct4); - float err5 = - Bruteforce_Ulp_Error_Double(test, correct5); - fail = fail - && ((!(fabsf(err2) <= ulps)) - && (!(fabsf(err3) <= ulps)) - && (!(fabsf(err4) <= ulps)) - && (!(fabsf(err5) <= ulps))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - if (fabsf(err4) < fabsf(err)) err = err4; - if (fabsf(err5) < fabsf(err)) err = err5; - - // retry per section 6.5.3.4 - if (IsDoubleResultSubnormal(correct2, ulps) - || IsDoubleResultSubnormal(correct3, ulps) - || IsDoubleResultSubnormal(correct4, ulps) - || IsDoubleResultSubnormal(correct5, ulps)) - { - fail = fail && (test != 0.0f); - if (!fail) err = 0.0f; - } - } - } - else if (IsDoubleSubnormal(s2[j])) - { - long double correct2 = 
func.f_ff(s[j], 0.0); - long double correct3 = func.f_ff(s[j], -0.0); - float err2 = - Bruteforce_Ulp_Error_Double(test, correct2); - float err3 = - Bruteforce_Ulp_Error_Double(test, correct3); - fail = fail - && ((!(fabsf(err2) <= ulps)) - && (!(fabsf(err3) <= ulps))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - - // retry per section 6.5.3.4 - if (IsDoubleResultSubnormal(correct2, ulps) - || IsDoubleResultSubnormal(correct3, ulps)) - { - fail = fail && (test != 0.0f); - if (!fail) err = 0.0f; - } - } - } - } - - if (fabsf(err) > tinfo->maxError) - { - tinfo->maxError = fabsf(err); - tinfo->maxErrorValue = s[j]; - tinfo->maxErrorValue2 = s2[j]; - } - if (fail) - { - vlog_error("\nERROR: %s%s: %f ulp error at {%.13la, " - "%.13la}: *%.13la vs. %.13la\n", - name, sizeNames[k], err, s[j], s2[j], r[j], - test); - error = -1; - goto exit; - } - } - } - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], - out[j], 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! 
err: %d\n", - j, error); - return error; - } - } - - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); - - - if (0 == (base & 0x0fffffff)) - { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f " - "ThreadCount:%2u\n", - base, job->step, job->scale, buffer_elements, job->ulps, - job->threadCount); - } - else - { - vlog("."); - } - fflush(stdout); - } - -exit: - return error; -} - int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode) { return TestFunc_Float_Float_Float_common(f, d, 0, relaxedMode); } -int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode) -{ - return TestFunc_Double_Double_Double_common(f, d, 0, relaxedMode); -} - int TestFunc_Float_Float_Float_nextafter(const Func *f, MTdata d, bool relaxedMode) { return TestFunc_Float_Float_Float_common(f, d, 1, relaxedMode); } - -int TestFunc_Double_Double_Double_nextafter(const Func *f, MTdata d, - bool relaxedMode) -{ - return TestFunc_Double_Double_Double_common(f, d, 1, relaxedMode); -} diff --git a/test_conformance/math_brute_force/binary_i.cpp b/test_conformance/math_brute_force/binary_i_double.cpp similarity index 54% rename from test_conformance/math_brute_force/binary_i.cpp rename to test_conformance/math_brute_force/binary_i_double.cpp index 50d14f33..4d6cb860 100644 --- a/test_conformance/math_brute_force/binary_i.cpp +++ b/test_conformance/math_brute_force/binary_i_double.cpp @@ -21,91 +21,6 @@ #include #include -static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, - cl_kernel *k, cl_program *p, bool relaxedMode) -{ - const char *c[] = { "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global float", - sizeNames[vectorSize], - "* out, __global float", - sizeNames[vectorSize], - "* in1, __global int", - sizeNames[vectorSize], - "* in2 )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " out[i] = ", - name, - "( in1[i], in2[i] );\n" - "}\n" }; - - const char 
*c3[] = { - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global float* out, __global float* in, __global int* in2)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " float3 f0 = vload3( 0, in + 3 * i );\n" - " int3 i0 = vload3( 0, in2 + 3 * i );\n" - " f0 = ", - name, - "( f0, i0 );\n" - " vstore3( f0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two " - "buffer size \n" - " float3 f0;\n" - " int3 i0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (float3)( in[3*i], NAN, NAN ); \n" - " i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n" - " break;\n" - " case 0:\n" - " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" - " i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n" - " break;\n" - " }\n" - " f0 = ", - name, - "( f0, i0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, - relaxedMode); -} - static int BuildKernelDouble(const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p, bool relaxedMode) @@ -204,15 +119,6 @@ typedef struct BuildKernelInfo bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
} BuildKernelInfo; -static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernel_count, - info->kernels[i], info->programs + i, info->relaxedMode); -} - static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { @@ -223,112 +129,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, info->relaxedMode); } -// A table of more difficult cases to get right -static const float specialValuesFloat[] = { - -NAN, - -INFINITY, - -FLT_MAX, - MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), - MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), - MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39), - MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), - MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), - MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38), - MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), - MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), - MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), - MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), - MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), - MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), - -1000.f, - -100.f, - -4.0f, - -3.5f, - -3.0f, - MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), - -2.5f, - MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), - -2.0f, - MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), - -1.5f, - MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24), - MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), - -1.0f, - MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25), - MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), - -0.5f, - MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26), - MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), - -0.25f, - MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27), - MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), - -FLT_MIN, - MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), - 
MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), - MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), - MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), - MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), - MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150), - MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), - MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), - MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), - MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), - -0.0f, - - +NAN, - +INFINITY, - +FLT_MAX, - MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), - MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), - MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), - MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), - MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), - MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38), - MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), - MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), - MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), - MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), - MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), - MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), - +1000.f, - +100.f, - +4.0f, - +3.5f, - +3.0f, - MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), - 2.5f, - MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23), - +2.0f, - MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), - 1.5f, - MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), - MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), - +1.0f, - MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25), - MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), - +0.5f, - MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), - MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), - +0.25f, - MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27), - MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), - +FLT_MIN, - MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), - MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), - MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), - 
MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), - MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), - MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150), - MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), - MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), - MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), - MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), - +0.0f -}; - -static const size_t specialValuesFloatCount = - sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]); - static const int specialValuesInt[] = { 0, 1, 2, 3, 126, 127, 128, 0x02000001, 0x04000001, 1465264071, 1488522147, -1, @@ -373,576 +173,6 @@ typedef struct TestInfo // no special values } TestInfo; -static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p); - -int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode) -{ - TestInfo test_info; - cl_int error; - size_t i, j; - float maxError = 0.0f; - double maxErrorVal = 0.0; - cl_int maxErrorVal2 = 0; - - logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); - - // Init test_info - memset(&test_info, 0, sizeof(test_info)); - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_float)); - - if (gWimpyMode) - { - test_info.subBufferSize = gWimpyBufferSize - / (sizeof(cl_float) - * RoundUpToNextPowerOfTwo(test_info.threadCount)); - } - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ulps = gIsEmbedded ? 
f->float_embedded_ulps : f->float_ulps; - test_info.ftz = - f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); - - // cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: Unable to allocate storage for thread specific data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); - for (i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { - i * test_info.subBufferSize * sizeof(cl_float), - test_info.subBufferSize * sizeof(cl_float) - }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - cl_buffer_region region2 = { i * test_info.subBufferSize - * sizeof(cl_int), - test_info.subBufferSize * sizeof(cl_int) }; - test_info.tinfo[i].inBuf2 = - clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion2, &error); - if (error || NULL == test_info.tinfo[i].inBuf2) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = 
clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - ®ion, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of " - "gInBuffer for region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. (%d)\n", error); - goto exit; - } - - test_info.tinfo[i].d = init_genrand(genrand_int32(d)); - } - - // Init the kernels - { - BuildKernelInfo build_info = { - gMinVectorSizeIndex, test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode, relaxedMode - }; - if ((error = ThreadPool_Do(BuildKernel_FloatFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - goto exit; - } - - // Run the kernels - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info); - - // Accumulate the arithmetic errors - for (i = 0; i < test_info.threadCount; i++) - { - if (test_info.tinfo[i].maxError > maxError) - { - maxError = test_info.tinfo[i].maxError; - maxErrorVal = test_info.tinfo[i].maxErrorValue; - maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; - } - } - - if (error) goto exit; - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - if (gMeasureTimes) - { - // Init input arrays - uint32_t *p = (uint32_t *)gIn; - uint32_t *p2 = (uint32_t *)gIn2; - for (j = 0; j < BUFFER_SIZE / sizeof(float); j++) - { - p[j] = (genrand_int32(d) & ~0x40000000) | 0x38000000; - p2[j] = 3; - } - - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - BUFFER_SIZE, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); - return error; - } - - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, - BUFFER_SIZE, gIn2, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in 
clEnqueueWriteBuffer2 ***\n", error); - return error; - } - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - size_t vectorSize = sizeof(cl_float) * sizeValues[j]; - size_t localCount = (BUFFER_SIZE + vectorSize - 1) - / vectorSize; // BUFFER_SIZE / vectorSize rounded up - if ((error = clSetKernelArg(test_info.k[j][0], 0, - sizeof(gOutBuffer[j]), &gOutBuffer[j]))) - { - LogBuildError(test_info.programs[j]); - goto exit; - } - if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), - &gInBuffer))) - { - LogBuildError(test_info.programs[j]); - goto exit; - } - if ((error = clSetKernelArg(test_info.k[j][0], 2, - sizeof(gInBuffer2), &gInBuffer2))) - { - LogBuildError(test_info.programs[j]); - goto exit; - } - - double sum = 0.0; - double bestTime = INFINITY; - for (i = 0; i < PERF_LOOP_COUNT; i++) - { - uint64_t startTime = GetTime(); - if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], - 1, NULL, &localCount, NULL, - 0, NULL, NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - - // Make sure OpenCL is done - if ((error = clFinish(gQueue))) - { - vlog_error("Error %d at clFinish\n", error); - goto exit; - } - - uint64_t endTime = GetTime(); - double time = SubtractTime(endTime, startTime); - sum += time; - if (time < bestTime) bestTime = time; - } - - if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double)gDeviceFrequency - * gComputeDevices * gSimdSize * 1e6 - / (BUFFER_SIZE / sizeof(float)); - vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", - f->name, sizeNames[j]); - } - } - - if (!gSkipCorrectnessTesting) - vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2); - vlog("\n"); - -exit: - // Release - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (j = 0; j < test_info.threadCount; j++) - 
clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (i = 0; i < test_info.threadCount; i++) - { - free_mtdata(test_info.tinfo[i].d); - clReleaseMemObject(test_info.tinfo[i].inBuf); - clReleaseMemObject(test_info.tinfo[i].inBuf2); - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } - - return error; -} - -static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data) -{ - const TestInfo *job = (const TestInfo *)data; - size_t buffer_elements = job->subBufferSize; - size_t buffer_size = buffer_elements * sizeof(cl_float); - cl_uint base = job_id * (cl_uint)job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; - fptr func = job->f->func; - int ftz = job->ftz; - float ulps = job->ulps; - MTdata d = tinfo->d; - cl_uint j, k; - cl_int error; - const char *name = job->f->name; - cl_uint *t = 0; - cl_float *r = 0; - cl_float *s = 0; - cl_int *s2 = 0; - - // start the map of the output arrays - cl_event e[VECTOR_SIZE_COUNT]; - cl_uint *out[VECTOR_SIZE_COUNT]; - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - out[j] = (cl_uint *)clEnqueueMapBuffer( - tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, - buffer_size, 0, NULL, e + j, &error); - if (error || NULL == out[j]) - { - vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, - error); - return error; - } - } - - // Get that moving - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); - - // Init input array - cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements; - cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements; - j = 0; - - int totalSpecialValueCount = - specialValuesFloatCount * specialValuesIntCount; - int indx = (totalSpecialValueCount - 1) / buffer_elements; - - if (job_id <= (cl_uint)indx) - { // test edge cases - float *fp = (float *)p; - cl_int *ip2 = (cl_int *)p2; - uint32_t x, y; - - x = (job_id * buffer_elements) % specialValuesFloatCount; - y = (job_id * buffer_elements) / specialValuesFloatCount; - - for (; j < buffer_elements; j++) - { - fp[j] = specialValuesFloat[x]; - ip2[j] = specialValuesInt[y]; - ++x; - if (x >= specialValuesFloatCount) - { - x = 0; - y++; - if (y >= specialValuesIntCount) break; - } - } - } - - // Init any remaining values. - for (; j < buffer_elements; j++) - { - p[j] = genrand_int32(d); - p2[j] = genrand_int32(d); - } - - if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, - buffer_size, p, 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); - goto exit; - } - - if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, - buffer_size, p2, 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); - goto exit; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - // Wait for the map to finish - if ((error = clWaitForEvents(1, e + j))) - { - vlog_error("Error: clWaitForEvents failed! err: %d\n", error); - goto exit; - } - if ((error = clReleaseEvent(e[j]))) - { - vlog_error("Error: clReleaseEvent failed! 
err: %d\n", error); - goto exit; - } - - // Fill the result buffer with garbage, so that old results don't carry - // over - uint32_t pattern = 0xffffdead; - memset_pattern4(out[j], &pattern, buffer_size); - if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], - out[j], 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); - goto exit; - } - - // run the kernel - size_t vectorCount = - (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; - cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its - // own copy of the cl_kernel - cl_program program = job->programs[j]; - - if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), - &tinfo->outBuf[j]))) - { - LogBuildError(program); - return error; - } - if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), - &tinfo->inBuf))) - { - LogBuildError(program); - return error; - } - if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), - &tinfo->inBuf2))) - { - LogBuildError(program); - return error; - } - - if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, - &vectorCount, NULL, 0, NULL, NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - } - - // Get that moving - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); - - if (gSkipCorrectnessTesting) return CL_SUCCESS; - - // Calculate the correctly rounded reference result - r = (float *)gOut_Ref + thread_id * buffer_elements; - s = (float *)gIn + thread_id * buffer_elements; - s2 = (cl_int *)gIn2 + thread_id * buffer_elements; - for (j = 0; j < buffer_elements; j++) r[j] = (float)func.f_fi(s[j], s2[j]); - - // Read the data back -- no need to wait for the first N-1 buffers. This is - // an in order queue. 
- for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) - { - out[j] = (cl_uint *)clEnqueueMapBuffer( - tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, - buffer_size, 0, NULL, NULL, &error); - if (error || NULL == out[j]) - { - vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, - error); - goto exit; - } - } - - // Wait for the last buffer - out[j] = (cl_uint *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], - CL_TRUE, CL_MAP_READ, 0, buffer_size, - 0, NULL, NULL, &error); - if (error || NULL == out[j]) - { - vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); - goto exit; - } - - // Verify data - t = (cl_uint *)r; - for (j = 0; j < buffer_elements; j++) - { - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - cl_uint *q = out[k]; - - // If we aren't getting the correctly rounded result - if (t[j] != q[j]) - { - float test = ((float *)q)[j]; - double correct = func.f_fi(s[j], s2[j]); - float err = Ulp_Error(test, correct); - int fail = !(fabsf(err) <= ulps); - - if (fail && ftz) - { - // retry per section 6.5.3.2 - if (IsFloatResultSubnormal(correct, ulps)) - { - fail = fail && (test != 0.0f); - if (!fail) err = 0.0f; - } - - // retry per section 6.5.3.3 - if (IsFloatSubnormal(s[j])) - { - double correct2, correct3; - float err2, err3; - correct2 = func.f_fi(0.0, s2[j]); - correct3 = func.f_fi(-0.0, s2[j]); - err2 = Ulp_Error(test, correct2); - err3 = Ulp_Error(test, correct3); - fail = fail - && ((!(fabsf(err2) <= ulps)) - && (!(fabsf(err3) <= ulps))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - - // retry per section 6.5.3.4 - if (IsFloatResultSubnormal(correct2, ulps) - || IsFloatResultSubnormal(correct3, ulps)) - { - fail = fail && (test != 0.0f); - if (!fail) err = 0.0f; - } - } - } - - if (fabsf(err) > tinfo->maxError) - { - tinfo->maxError = fabsf(err); - tinfo->maxErrorValue = s[j]; - tinfo->maxErrorValue2 = s2[j]; - } - if (fail) - { - 
vlog_error( - "\nERROR: %s%s: %f ulp error at {%a (0x%8.8x), %d}: " - "*%a (0x%8.8x) vs. %a (0x%8.8x) at index: %d\n", - name, sizeNames[k], err, s[j], ((uint32_t *)s)[j], - s2[j], r[j], ((uint32_t *)r)[j], test, - ((cl_uint *)&test)[0], j); - error = -1; - goto exit; - } - } - } - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], - out[j], 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", - j, error); - return error; - } - } - - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); - - - if (0 == (base & 0x0fffffff)) - { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f " - "ThreadCount:%2u\n", - base, job->step, job->scale, buffer_elements, job->ulps, - job->threadCount); - } - else - { - vlog("."); - } - fflush(stdout); - } - -exit: - return error; -} - - // A table of more difficult cases to get right static const double specialValuesDouble[] = { -NAN, diff --git a/test_conformance/math_brute_force/binary_i_float.cpp b/test_conformance/math_brute_force/binary_i_float.cpp new file mode 100644 index 00000000..0ff9b57f --- /dev/null +++ b/test_conformance/math_brute_force/binary_i_float.cpp @@ -0,0 +1,845 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include +#include + +static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, + cl_kernel *k, cl_program *p, bool relaxedMode) +{ + const char *c[] = { "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float", + sizeNames[vectorSize], + "* out, __global float", + sizeNames[vectorSize], + "* in1, __global int", + sizeNames[vectorSize], + "* in2 )\n" + "{\n" + " size_t i = get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i] );\n" + "}\n" }; + + const char *c3[] = { + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float* out, __global float* in, __global int* in2)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " float3 f0 = vload3( 0, in + 3 * i );\n" + " int3 i0 = vload3( 0, in2 + 3 * i );\n" + " f0 = ", + name, + "( f0, i0 );\n" + " vstore3( f0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " float3 f0;\n" + " int3 i0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (float3)( in[3*i], NAN, NAN ); \n" + " i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n" + " break;\n" + " case 0:\n" + " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" + " i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n" + " break;\n" + " }\n" + " f0 = ", + name, + "( f0, i0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); + + return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, + relaxedMode); +} + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_uint kernel_count; + cl_kernel **kernels; + cl_program *programs; + const char *nameInCode; + bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
+} BuildKernelInfo; + +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernel(info->nameInCode, i, info->kernel_count, + info->kernels[i], info->programs + i, info->relaxedMode); +} + +// A table of more difficult cases to get right +static const float specialValuesFloat[] = { + -NAN, + -INFINITY, + -FLT_MAX, + MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), + MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), + MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39), + MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), + MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), + MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38), + MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), + MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), + MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), + MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), + MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), + MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), + -1000.f, + -100.f, + -4.0f, + -3.5f, + -3.0f, + MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), + -2.5f, + MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), + -2.0f, + MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), + -1.5f, + MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24), + MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), + -1.0f, + MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25), + MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), + -0.5f, + MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26), + MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), + -0.25f, + MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27), + MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), + -FLT_MIN, + MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), + MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), + MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), + MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), + MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, 
-150), + MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150), + MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), + MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), + MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), + MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), + -0.0f, + + +NAN, + +INFINITY, + +FLT_MAX, + MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), + MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), + MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), + MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), + MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), + MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38), + MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), + MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), + MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), + MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), + MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), + MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), + +1000.f, + +100.f, + +4.0f, + +3.5f, + +3.0f, + MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), + 2.5f, + MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23), + +2.0f, + MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), + 1.5f, + MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), + MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), + +1.0f, + MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25), + MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), + +0.5f, + MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), + MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), + +0.25f, + MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27), + MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), + +FLT_MIN, + MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), + MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), + MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), + MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), + MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), + MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150), + MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), + 
MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), + MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), + MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), + +0.0f +}; + +static const size_t specialValuesFloatCount = + sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]); + +static const int specialValuesInt[] = { + 0, 1, 2, 3, 126, 127, + 128, 0x02000001, 0x04000001, 1465264071, 1488522147, -1, + -2, -3, -126, -127, -128, -0x02000001, + -0x04000001, -1465264071, -1488522147 +}; +static size_t specialValuesIntCount = + sizeof(specialValuesInt) / sizeof(specialValuesInt[0]); + +// Thread specific data for a worker thread +typedef struct ThreadInfo +{ + cl_mem inBuf; // input buffer for the thread + cl_mem inBuf2; // input buffer for the thread + cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + float maxError; // max error value. Init to 0. + double + maxErrorValue; // position of the max error value (param 1). Init to 0. + cl_int maxErrorValue2; // position of the max error value (param 2). Init + // to 0. + MTdata d; + cl_command_queue tQueue; // per thread command queue to improve performance +} ThreadInfo; + +typedef struct TestInfo +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes + cl_kernel + *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each + // worker thread: k[vector_size][thread_id] + ThreadInfo * + tinfo; // An array of thread specific information for each worker thread + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. 
+ cl_uint scale; // stride between individual test values + float ulps; // max_allowed ulps + int ftz; // non-zero if running in flush to zero mode + + // no special values +} TestInfo; + +static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p); + +int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; + cl_int maxErrorVal2 = 0; + + logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); + + // Init test_info + memset(&test_info, 0, sizeof(test_info)); + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_float)); + + if (gWimpyMode) + { + test_info.subBufferSize = gWimpyBufferSize + / (sizeof(cl_float) + * RoundUpToNextPowerOfTwo(test_info.threadCount)); + } + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = gIsEmbedded ? 
f->float_embedded_ulps : f->float_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); + + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel *)malloc(array_size); + if (NULL == test_info.k[i]) + { + vlog_error("Error: Unable to allocate storage for kernels!\n"); + error = CL_OUT_OF_HOST_MEMORY; + goto exit; + } + memset(test_info.k[i], 0, array_size); + } + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) + { + vlog_error( + "Error: Unable to allocate storage for thread specific data.\n"); + error = CL_OUT_OF_HOST_MEMORY; + goto exit; + } + memset(test_info.tinfo, 0, + test_info.threadCount * sizeof(*test_info.tinfo)); + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_float), + test_info.subBufferSize * sizeof(cl_float) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + cl_buffer_region region2 = { i * test_info.subBufferSize + * sizeof(cl_int), + test_info.subBufferSize * sizeof(cl_int) }; + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion2, &error); + if (error || NULL == test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = 
clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of " + "gInBuffer for region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + goto exit; + } + + test_info.tinfo[i].d = init_genrand(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info = { + gMinVectorSizeIndex, test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode, relaxedMode + }; + if ((error = ThreadPool_Do(BuildKernel_FloatFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + + // Run the kernels + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + if (error) goto exit; + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + if (gMeasureTimes) + { + // Init input arrays + uint32_t *p = (uint32_t *)gIn; + uint32_t *p2 = (uint32_t *)gIn2; + for (j = 0; j < BUFFER_SIZE / sizeof(float); j++) + { + p[j] = (genrand_int32(d) & ~0x40000000) | 0x38000000; + p2[j] = 3; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + BUFFER_SIZE, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + BUFFER_SIZE, gIn2, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in 
clEnqueueWriteBuffer2 ***\n", error); + return error; + } + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeof(cl_float) * sizeValues[j]; + size_t localCount = (BUFFER_SIZE + vectorSize - 1) + / vectorSize; // BUFFER_SIZE / vectorSize rounded up + if ((error = clSetKernelArg(test_info.k[j][0], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 2, + sizeof(gInBuffer2), &gInBuffer2))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + + double sum = 0.0; + double bestTime = INFINITY; + for (i = 0; i < PERF_LOOP_COUNT; i++) + { + uint64_t startTime = GetTime(); + if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], + 1, NULL, &localCount, NULL, + 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + + // Make sure OpenCL is done + if ((error = clFinish(gQueue))) + { + vlog_error("Error %d at clFinish\n", error); + goto exit; + } + + uint64_t endTime = GetTime(); + double time = SubtractTime(endTime, startTime); + sum += time; + if (time < bestTime) bestTime = time; + } + + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (BUFFER_SIZE / sizeof(float)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", + f->name, sizeNames[j]); + } + } + + if (!gSkipCorrectnessTesting) + vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2); + vlog("\n"); + +exit: + // Release + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + clReleaseProgram(test_info.programs[i]); + if (test_info.k[i]) + { + for (j = 0; j < test_info.threadCount; j++) + 
clReleaseKernel(test_info.k[i][j]); + + free(test_info.k[i]); + } + } + if (test_info.tinfo) + { + for (i = 0; i < test_info.threadCount; i++) + { + free_mtdata(test_info.tinfo[i].d); + clReleaseMemObject(test_info.tinfo[i].inBuf); + clReleaseMemObject(test_info.tinfo[i].inBuf2); + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + clReleaseMemObject(test_info.tinfo[i].outBuf[j]); + clReleaseCommandQueue(test_info.tinfo[i].tQueue); + } + + free(test_info.tinfo); + } + + return error; +} + +static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data) +{ + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_float); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = job->tinfo + thread_id; + fptr func = job->f->func; + int ftz = job->ftz; + float ulps = job->ulps; + MTdata d = tinfo->d; + cl_uint j, k; + cl_int error; + const char *name = job->f->name; + cl_uint *t = 0; + cl_float *r = 0; + cl_float *s = 0; + cl_int *s2 = 0; + + // start the map of the output arrays + cl_event e[VECTOR_SIZE_COUNT]; + cl_uint *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_uint *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + + // Init input array + cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements; + cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements; + j = 0; + + int totalSpecialValueCount = + specialValuesFloatCount * specialValuesIntCount; + int indx = (totalSpecialValueCount - 1) / buffer_elements; + + if (job_id <= (cl_uint)indx) + { // test edge cases + float *fp = (float *)p; + cl_int *ip2 = (cl_int *)p2; + uint32_t x, y; + + x = (job_id * buffer_elements) % specialValuesFloatCount; + y = (job_id * buffer_elements) / specialValuesFloatCount; + + for (; j < buffer_elements; j++) + { + fp[j] = specialValuesFloat[x]; + ip2[j] = specialValuesInt[y]; + ++x; + if (x >= specialValuesFloatCount) + { + x = 0; + y++; + if (y >= specialValuesIntCount) break; + } + } + } + + // Init any remaining values. + for (; j < buffer_elements; j++) + { + p[j] = genrand_int32(d); + p2[j] = genrand_int32(d); + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + goto exit; + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size, p2, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + goto exit; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + goto exit; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! 
err: %d\n", error); + goto exit; + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint32_t pattern = 0xffffdead; + memset_pattern4(out[j], &pattern, buffer_size); + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + goto exit; + } + + // run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) return CL_SUCCESS; + + // Calculate the correctly rounded reference result + r = (float *)gOut_Ref + thread_id * buffer_elements; + s = (float *)gIn + thread_id * buffer_elements; + s2 = (cl_int *)gIn2 + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) r[j] = (float)func.f_fi(s[j], s2[j]); + + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. 
+ for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_uint *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + goto exit; + } + } + + // Wait for the last buffer + out[j] = (cl_uint *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, buffer_size, + 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); + goto exit; + } + + // Verify data + t = (cl_uint *)r; + for (j = 0; j < buffer_elements; j++) + { + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_uint *q = out[k]; + + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + float test = ((float *)q)[j]; + double correct = func.f_fi(s[j], s2[j]); + float err = Ulp_Error(test, correct); + int fail = !(fabsf(err) <= ulps); + + if (fail && ftz) + { + // retry per section 6.5.3.2 + if (IsFloatResultSubnormal(correct, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsFloatSubnormal(s[j])) + { + double correct2, correct3; + float err2, err3; + correct2 = func.f_fi(0.0, s2[j]); + correct3 = func.f_fi(-0.0, s2[j]); + err2 = Ulp_Error(test, correct2); + err3 = Ulp_Error(test, correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsFloatResultSubnormal(correct2, ulps) + || IsFloatResultSubnormal(correct3, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + + if (fabsf(err) > tinfo->maxError) + { + tinfo->maxError = fabsf(err); + tinfo->maxErrorValue = s[j]; + tinfo->maxErrorValue2 = s2[j]; + } + if (fail) + { + 
vlog_error( + "\nERROR: %s%s: %f ulp error at {%a (0x%8.8x), %d}: " + "*%a (0x%8.8x) vs. %a (0x%8.8x) at index: %d\n", + name, sizeNames[k], err, s[j], ((uint32_t *)s)[j], + s2[j], r[j], ((uint32_t *)r)[j], test, + ((cl_uint *)&test)[0], j); + error = -1; + goto exit; + } + } + } + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); + return error; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } + +exit: + return error; +} diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp new file mode 100644 index 00000000..7f86afde --- /dev/null +++ b/test_conformance/math_brute_force/binary_operator_double.cpp @@ -0,0 +1,911 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include <string.h> + +static int BuildKernelDouble(const char *name, const char *operator_symbol, + int vectorSize, cl_uint kernel_count, cl_kernel *k, + cl_program *p, bool relaxedMode) +{ + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void ", + name, + "_kernel", + sizeNames[vectorSize], + "( __global double", + sizeNames[vectorSize], + "* out, __global double", + sizeNames[vectorSize], + "* in1, __global double", + sizeNames[vectorSize], + "* in2 )\n" + "{\n" + " size_t i = get_global_id(0);\n" + " out[i] = in1[i] ", + operator_symbol, + " in2[i];\n" + "}\n" }; + + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void ", + name, + "_kernel", + sizeNames[vectorSize], + "( __global double* out, __global double* in, __global double* in2)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " double3 d0 = vload3( 0, in + 3 * i );\n" + " double3 d1 = vload3( 0, in2 + 3 * i );\n" + " d0 = d0 ", + operator_symbol, + " d1;\n" + " vstore3( d0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " double3 d0;\n" + " double3 d1;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " d0 = (double3)( in[3*i], NAN, NAN ); \n" + " d1 = (double3)( in2[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" + " d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " d0 = d0 ", + operator_symbol, + " d1;\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = d0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = d0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "%s_kernel%s", name, + sizeNames[vectorSize]); + + return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, + relaxedMode); +} + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_uint kernel_count; + cl_kernel **kernels; + cl_program *programs; + const char *name; + const char *operator_symbol; + bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. +} BuildKernelInfo; + +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernelDouble(info->name, info->operator_symbol, i, + info->kernel_count, info->kernels[i], + info->programs + i, info->relaxedMode); +} + +// Thread specific data for a worker thread +typedef struct ThreadInfo +{ + cl_mem inBuf; // input buffer for the thread + cl_mem inBuf2; // input buffer for the thread + cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + float maxError; // max error value. Init to 0. + double + maxErrorValue; // position of the max error value (param 1). Init to 0. 
+ double maxErrorValue2; // position of the max error value (param 2). Init + // to 0. + MTdata d; + cl_command_queue tQueue; // per thread command queue to improve performance +} ThreadInfo; + +typedef struct TestInfo +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes + cl_kernel + *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each + // worker thread: k[vector_size][thread_id] + ThreadInfo * + tinfo; // An array of thread specific information for each worker thread + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + float ulps; // max_allowed ulps + int ftz; // non-zero if running in flush to zero mode + bool relaxedMode; // True if the test is being run in relaxed mode, false + // otherwise. 
+ + // no special fields +} TestInfo; + +// A table of more difficult cases to get right +static const double specialValuesDouble[] = { + -NAN, + -INFINITY, + -DBL_MAX, + MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), + MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11), + MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), + MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10), + MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), + MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), + MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), + MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), + -1000., + -100., + -4.0, + -3.5, + -3.0, + MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), + -2.5, + MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), + -2.0, + MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), + -1.5, + MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52), + MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), + -1.0, + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53), + MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), + -0.5, + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54), + MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), + -0.25, + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55), + MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), + -DBL_MIN, + MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), + MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), + MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, 
-1074), + MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), + MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), + -0.0, + + +NAN, + +INFINITY, + +DBL_MAX, + MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), + MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11), + MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), + MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10), + MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8), + MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21), + MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21), + MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22), + +1000., + +100., + +4.0, + +3.5, + +3.0, + MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), + +2.5, + MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), + +2.0, + MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), + +1.5, + MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52), + MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), + +1.0, + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53), + MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, 
+0x10000000000001LL, -53), + +0.5, + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54), + MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), + +0.25, + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55), + MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), + +DBL_MIN, + MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), + MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), + +0.0, +}; + +static size_t specialValuesDoubleCount = + sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]); + +static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p); + +int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d, + bool relaxedMode) +{ + TestInfo test_info; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; + double maxErrorVal2 = 0.0; + + logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); + + // Init test_info + memset(&test_info, 0, sizeof(test_info)); + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / 
(sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_double)); + + if (gWimpyMode) + { + test_info.subBufferSize = gWimpyBufferSize + / (sizeof(cl_double) + * RoundUpToNextPowerOfTwo(test_info.threadCount)); + } + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = f->double_ulps; + test_info.ftz = f->ftz || gForceFTZ; + + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel *)malloc(array_size); + if (NULL == test_info.k[i]) + { + vlog_error("Error: Unable to allocate storage for kernels!\n"); + error = CL_OUT_OF_HOST_MEMORY; + goto exit; + } + memset(test_info.k[i], 0, array_size); + } + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) + { + vlog_error( + "Error: Unable to allocate storage for thread specific data.\n"); + error = CL_OUT_OF_HOST_MEMORY; + goto exit; + } + memset(test_info.tinfo, 0, + test_info.threadCount * sizeof(*test_info.tinfo)); + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_double), + test_info.subBufferSize * sizeof(cl_double) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + test_info.tinfo[i].inBuf2 = + 
clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + &region, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + goto exit; + } + + test_info.tinfo[i].d = init_genrand(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info = { gMinVectorSizeIndex, + test_info.threadCount, + test_info.k, + test_info.programs, + f->name, + f->nameInCode, + relaxedMode }; + if ((error = ThreadPool_Do(BuildKernel_DoubleFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + + // Run the kernels + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + if (error) goto exit; + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + if (gMeasureTimes) + { + // Init input arrays + double *p = (double *)gIn; + double *p2 = (double *)gIn2; + for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++) + { + p[j] = 
DoubleFromUInt32(genrand_int32(d)); + p2[j] = DoubleFromUInt32(genrand_int32(d)); + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + BUFFER_SIZE, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + BUFFER_SIZE, gIn2, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); + return error; + } + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeof(cl_double) * sizeValues[j]; + size_t localCount = (BUFFER_SIZE + vectorSize - 1) + / vectorSize; // BUFFER_SIZE / vectorSize rounded up + if ((error = clSetKernelArg(test_info.k[j][0], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 2, + sizeof(gInBuffer2), &gInBuffer2))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + + double sum = 0.0; + double bestTime = INFINITY; + for (i = 0; i < PERF_LOOP_COUNT; i++) + { + uint64_t startTime = GetTime(); + if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], + 1, NULL, &localCount, NULL, + 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + + // Make sure OpenCL is done + if ((error = clFinish(gQueue))) + { + vlog_error("Error %d at clFinish\n", error); + goto exit; + } + + uint64_t endTime = GetTime(); + double time = SubtractTime(endTime, startTime); + sum += time; + if (time < bestTime) bestTime = time; + } + + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (BUFFER_SIZE / sizeof(double)); + 
vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", + f->name, sizeNames[j]); + } + for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); + } + + if (!gSkipCorrectnessTesting) + vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); + vlog("\n"); + +exit: + // Release + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + clReleaseProgram(test_info.programs[i]); + if (test_info.k[i]) + { + for (j = 0; j < test_info.threadCount; j++) + clReleaseKernel(test_info.k[i][j]); + + free(test_info.k[i]); + } + } + if (test_info.tinfo) + { + for (i = 0; i < test_info.threadCount; i++) + { + free_mtdata(test_info.tinfo[i].d); + clReleaseMemObject(test_info.tinfo[i].inBuf); + clReleaseMemObject(test_info.tinfo[i].inBuf2); + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + clReleaseMemObject(test_info.tinfo[i].outBuf[j]); + clReleaseCommandQueue(test_info.tinfo[i].tQueue); + } + + free(test_info.tinfo); + } + + return error; +} + +static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data) +{ + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_double); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = job->tinfo + thread_id; + float ulps = job->ulps; + dptr func = job->f->dfunc; + int ftz = job->ftz; + bool relaxedMode = job->relaxedMode; + MTdata d = tinfo->d; + cl_uint j, k; + cl_int error; + const char *name = job->f->name; + cl_ulong *t; + cl_double *r; + cl_double *s; + cl_double *s2; + + Force64BitFPUPrecision(); + + // start the map of the output arrays + cl_event e[VECTOR_SIZE_COUNT]; + cl_ulong *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_ulong *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: 
clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + + // Init input array + cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements; + cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements; + j = 0; + int totalSpecialValueCount = + specialValuesDoubleCount * specialValuesDoubleCount; + int indx = (totalSpecialValueCount - 1) / buffer_elements; + + if (job_id <= (cl_uint)indx) + { // test edge cases + cl_double *fp = (cl_double *)p; + cl_double *fp2 = (cl_double *)p2; + uint32_t x, y; + + x = (job_id * buffer_elements) % specialValuesDoubleCount; + y = (job_id * buffer_elements) / specialValuesDoubleCount; + + for (; j < buffer_elements; j++) + { + fp[j] = specialValuesDouble[x]; + fp2[j] = specialValuesDouble[y]; + if (++x >= specialValuesDoubleCount) + { + x = 0; + y++; + if (y >= specialValuesDoubleCount) break; + } + } + } + + // Init any remaining values. + for (; j < buffer_elements; j++) + { + p[j] = genrand_int64(d); + p2[j] = genrand_int64(d); + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + goto exit; + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size, p2, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + goto exit; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + goto exit; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! 
err: %d\n", error); + goto exit; + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint32_t pattern = 0xffffdead; + memset_pattern4(out[j], &pattern, buffer_size); + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + goto exit; + } + + // run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) return CL_SUCCESS; + + // Calculate the correctly rounded reference result + r = (cl_double *)gOut_Ref + thread_id * buffer_elements; + s = (cl_double *)gIn + thread_id * buffer_elements; + s2 = (cl_double *)gIn2 + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) + r[j] = (cl_double)func.f_ff(s[j], s2[j]); + + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. 
+ for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_ulong *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + goto exit; + } + } + + // Wait for the last buffer + out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); + goto exit; + } + + // Verify data + t = (cl_ulong *)r; + for (j = 0; j < buffer_elements; j++) + { + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_ulong *q = out[k]; + + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + cl_double test = ((cl_double *)q)[j]; + long double correct = func.f_ff(s[j], s2[j]); + float err = Bruteforce_Ulp_Error_Double(test, correct); + int fail = !(fabsf(err) <= ulps); + + if (fail && ftz) + { + // retry per section 6.5.3.2 + if (IsDoubleResultSubnormal(correct, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + + // retry per section 6.5.3.3 + if (IsDoubleSubnormal(s[j])) + { + long double correct2 = func.f_ff(0.0, s2[j]); + long double correct3 = func.f_ff(-0.0, s2[j]); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsDoubleResultSubnormal(correct2, ulps) + || IsDoubleResultSubnormal(correct3, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // try with both args as zero + if (IsDoubleSubnormal(s2[j])) + { + correct2 = func.f_ff(0.0, 0.0); 
+ correct3 = func.f_ff(-0.0, 0.0); + long double correct4 = func.f_ff(0.0, -0.0); + long double correct5 = func.f_ff(-0.0, -0.0); + err2 = Bruteforce_Ulp_Error_Double(test, correct2); + err3 = Bruteforce_Ulp_Error_Double(test, correct3); + float err4 = + Bruteforce_Ulp_Error_Double(test, correct4); + float err5 = + Bruteforce_Ulp_Error_Double(test, correct5); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps)) + && (!(fabsf(err4) <= ulps)) + && (!(fabsf(err5) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + + // retry per section 6.5.3.4 + if (IsDoubleResultSubnormal(correct2, ulps) + || IsDoubleResultSubnormal(correct3, ulps) + || IsDoubleResultSubnormal(correct4, ulps) + || IsDoubleResultSubnormal(correct5, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + else if (IsDoubleSubnormal(s2[j])) + { + long double correct2 = func.f_ff(s[j], 0.0); + long double correct3 = func.f_ff(s[j], -0.0); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsDoubleResultSubnormal(correct2, ulps) + || IsDoubleResultSubnormal(correct3, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + + if (fabsf(err) > tinfo->maxError) + { + tinfo->maxError = fabsf(err); + tinfo->maxErrorValue = s[j]; + tinfo->maxErrorValue2 = s2[j]; + } + if (fail) + { + vlog_error( + "\nERROR: %s%s: %f ulp error at {%a, %a}: *%a vs. 
%a\n", + name, sizeNames[k], err, s[j], s2[j], r[j], test); + error = -1; + goto exit; + } + } + } + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); + return error; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } + +exit: + return error; +} diff --git a/test_conformance/math_brute_force/binary_operator.cpp b/test_conformance/math_brute_force/binary_operator_float.cpp similarity index 54% rename from test_conformance/math_brute_force/binary_operator.cpp rename to test_conformance/math_brute_force/binary_operator_float.cpp index 65756901..56b0280c 100644 --- a/test_conformance/math_brute_force/binary_operator.cpp +++ b/test_conformance/math_brute_force/binary_operator_float.cpp @@ -110,98 +110,6 @@ static int BuildKernel(const char *name, const char *operator_symbol, relaxedMode); } -static int BuildKernelDouble(const char *name, const char *operator_symbol, - int vectorSize, cl_uint kernel_count, cl_kernel *k, - cl_program *p, bool relaxedMode) -{ - const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void ", - name, - "_kernel", - sizeNames[vectorSize], - "( __global double", - sizeNames[vectorSize], - "* out, __global double", - sizeNames[vectorSize], - "* in1, __global double", - sizeNames[vectorSize], - "* in2 )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " out[i] = in1[i] ", - operator_symbol, - " in2[i];\n" - "}\n" }; - - const char *c3[] = { - "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - 
"__kernel void ", - name, - "_kernel", - sizeNames[vectorSize], - "( __global double* out, __global double* in, __global double* in2)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " double3 d0 = vload3( 0, in + 3 * i );\n" - " double3 d1 = vload3( 0, in2 + 3 * i );\n" - " d0 = d0 ", - operator_symbol, - " d1;\n" - " vstore3( d0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two " - "buffer size \n" - " double3 d0;\n" - " double3 d1;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " d0 = (double3)( in[3*i], NAN, NAN ); \n" - " d1 = (double3)( in2[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" - " d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " d0 = d0 ", - operator_symbol, - " d1;\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = d0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = d0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "%s_kernel%s", name, - sizeNames[vectorSize]); - - return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, - relaxedMode); -} - typedef struct BuildKernelInfo { cl_uint offset; // the first vector size to build @@ -222,16 +130,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, info->kernels[i], info->programs + i, info->relaxedMode); } -static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernelDouble(info->name, 
info->operator_symbol, i, - info->kernel_count, info->kernels[i], - info->programs + i, info->relaxedMode); -} - // A table of more difficult cases to get right static const float specialValuesFloat[] = { -NAN, @@ -1139,743 +1037,3 @@ exit: if (overflow) free(overflow); return error; } - -// A table of more difficult cases to get right -static const double specialValuesDouble[] = { - -NAN, - -INFINITY, - -DBL_MAX, - MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), - MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), - MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11), - MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), - MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), - MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10), - MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), - MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), - MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), - MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), - MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), - MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), - -1000., - -100., - -4.0, - -3.5, - -3.0, - MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), - -2.5, - MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), - -2.0, - MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), - -1.5, - MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52), - MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), - -1.0, - MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53), - MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), - -0.5, - MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54), - MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), - -0.25, - MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55), - MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), - -DBL_MIN, - 
MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), - MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), - MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), - MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), - MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), - -0.0, - - +NAN, - +INFINITY, - +DBL_MAX, - MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), - MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), - MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11), - MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), - MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), - MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10), - MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8), - MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), - MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21), - MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21), - MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), - MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22), - +1000., - +100., - +4.0, - +3.5, - +3.0, - MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), - +2.5, - MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), - +2.0, - MAKE_HEX_DOUBLE(+0x1.8000000000001p0, 
+0x18000000000001LL, -52), - +1.5, - MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52), - MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), - +1.0, - MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53), - MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53), - +0.5, - MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54), - MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), - +0.25, - MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55), - MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), - +DBL_MIN, - MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), - MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), - MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), - MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), - MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), - +0.0, -}; - -static size_t specialValuesDoubleCount = - sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]); - -static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p); - -int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d, - bool relaxedMode) -{ - TestInfo test_info; - cl_int error; - size_t i, j; - float maxError = 
0.0f; - double maxErrorVal = 0.0; - double maxErrorVal2 = 0.0; - - logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); - - // Init test_info - memset(&test_info, 0, sizeof(test_info)); - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_double)); - - if (gWimpyMode) - { - test_info.subBufferSize = gWimpyBufferSize - / (sizeof(cl_double) - * RoundUpToNextPowerOfTwo(test_info.threadCount)); - } - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ulps = f->double_ulps; - test_info.ftz = f->ftz || gForceFTZ; - - // cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: Unable to allocate storage for thread specific data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); - for (i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { - i * test_info.subBufferSize * sizeof(cl_double), - test_info.subBufferSize * sizeof(cl_double) - }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, 
CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - test_info.tinfo[i].inBuf2 = - clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf2) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - ®ion, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer " - "for region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); - goto exit; - } - - test_info.tinfo[i].d = init_genrand(genrand_int32(d)); - } - - // Init the kernels - { - BuildKernelInfo build_info = { gMinVectorSizeIndex, - test_info.threadCount, - test_info.k, - test_info.programs, - f->name, - f->nameInCode, - relaxedMode }; - if ((error = ThreadPool_Do(BuildKernel_DoubleFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - goto exit; - } - - // Run the kernels - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info); - - // Accumulate the arithmetic errors - for (i = 0; i < test_info.threadCount; i++) - { - if (test_info.tinfo[i].maxError > maxError) - { - maxError = test_info.tinfo[i].maxError; - maxErrorVal = test_info.tinfo[i].maxErrorValue; - maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; - } - } - - if (error) goto exit; - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - if (gMeasureTimes) - { - // Init input arrays - double *p = (double *)gIn; - double *p2 = (double *)gIn2; - for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++) - { - p[j] = DoubleFromUInt32(genrand_int32(d)); - p2[j] = DoubleFromUInt32(genrand_int32(d)); - } - - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - BUFFER_SIZE, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); - return error; - } - - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, - BUFFER_SIZE, gIn2, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); - return error; - } - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - size_t vectorSize = sizeof(cl_double) * sizeValues[j]; - size_t localCount = (BUFFER_SIZE + vectorSize - 1) - / vectorSize; // BUFFER_SIZE / vectorSize rounded up - if ((error = clSetKernelArg(test_info.k[j][0], 0, - sizeof(gOutBuffer[j]), &gOutBuffer[j]))) - { - LogBuildError(test_info.programs[j]); - goto 
exit; - } - if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), - &gInBuffer))) - { - LogBuildError(test_info.programs[j]); - goto exit; - } - if ((error = clSetKernelArg(test_info.k[j][0], 2, - sizeof(gInBuffer2), &gInBuffer2))) - { - LogBuildError(test_info.programs[j]); - goto exit; - } - - double sum = 0.0; - double bestTime = INFINITY; - for (i = 0; i < PERF_LOOP_COUNT; i++) - { - uint64_t startTime = GetTime(); - if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], - 1, NULL, &localCount, NULL, - 0, NULL, NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - - // Make sure OpenCL is done - if ((error = clFinish(gQueue))) - { - vlog_error("Error %d at clFinish\n", error); - goto exit; - } - - uint64_t endTime = GetTime(); - double time = SubtractTime(endTime, startTime); - sum += time; - if (time < bestTime) bestTime = time; - } - - if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double)gDeviceFrequency - * gComputeDevices * gSimdSize * 1e6 - / (BUFFER_SIZE / sizeof(double)); - vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", - f->name, sizeNames[j]); - } - for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); - } - - if (!gSkipCorrectnessTesting) - vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); - vlog("\n"); - -exit: - // Release - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (i = 0; i < test_info.threadCount; i++) - { - free_mtdata(test_info.tinfo[i].d); - clReleaseMemObject(test_info.tinfo[i].inBuf); - clReleaseMemObject(test_info.tinfo[i].inBuf2); - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - 
clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } - - return error; -} - -static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data) -{ - const TestInfo *job = (const TestInfo *)data; - size_t buffer_elements = job->subBufferSize; - size_t buffer_size = buffer_elements * sizeof(cl_double); - cl_uint base = job_id * (cl_uint)job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; - float ulps = job->ulps; - dptr func = job->f->dfunc; - int ftz = job->ftz; - bool relaxedMode = job->relaxedMode; - MTdata d = tinfo->d; - cl_uint j, k; - cl_int error; - const char *name = job->f->name; - cl_ulong *t; - cl_double *r; - cl_double *s; - cl_double *s2; - - Force64BitFPUPrecision(); - - // start the map of the output arrays - cl_event e[VECTOR_SIZE_COUNT]; - cl_ulong *out[VECTOR_SIZE_COUNT]; - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - out[j] = (cl_ulong *)clEnqueueMapBuffer( - tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, - buffer_size, 0, NULL, e + j, &error); - if (error || NULL == out[j]) - { - vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, - error); - return error; - } - } - - // Get that moving - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); - - // Init input array - cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements; - cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements; - j = 0; - int totalSpecialValueCount = - specialValuesDoubleCount * specialValuesDoubleCount; - int indx = (totalSpecialValueCount - 1) / buffer_elements; - - if (job_id <= (cl_uint)indx) - { // test edge cases - cl_double *fp = (cl_double *)p; - cl_double *fp2 = (cl_double *)p2; - uint32_t x, y; - - x = (job_id * buffer_elements) % specialValuesDoubleCount; - y = (job_id * buffer_elements) / specialValuesDoubleCount; - - for (; j < buffer_elements; j++) - { - fp[j] = specialValuesDouble[x]; - fp2[j] = specialValuesDouble[y]; - if (++x >= specialValuesDoubleCount) - { - x = 0; - y++; - if (y >= specialValuesDoubleCount) break; - } - } - } - - // Init any remaining values. - for (; j < buffer_elements; j++) - { - p[j] = genrand_int64(d); - p2[j] = genrand_int64(d); - } - - if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, - buffer_size, p, 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); - goto exit; - } - - if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, - buffer_size, p2, 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); - goto exit; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - // Wait for the map to finish - if ((error = clWaitForEvents(1, e + j))) - { - vlog_error("Error: clWaitForEvents failed! err: %d\n", error); - goto exit; - } - if ((error = clReleaseEvent(e[j]))) - { - vlog_error("Error: clReleaseEvent failed! 
err: %d\n", error); - goto exit; - } - - // Fill the result buffer with garbage, so that old results don't carry - // over - uint32_t pattern = 0xffffdead; - memset_pattern4(out[j], &pattern, buffer_size); - if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], - out[j], 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); - goto exit; - } - - // run the kernel - size_t vectorCount = - (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; - cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its - // own copy of the cl_kernel - cl_program program = job->programs[j]; - - if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), - &tinfo->outBuf[j]))) - { - LogBuildError(program); - return error; - } - if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), - &tinfo->inBuf))) - { - LogBuildError(program); - return error; - } - if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), - &tinfo->inBuf2))) - { - LogBuildError(program); - return error; - } - - if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, - &vectorCount, NULL, 0, NULL, NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - } - - // Get that moving - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); - - if (gSkipCorrectnessTesting) return CL_SUCCESS; - - // Calculate the correctly rounded reference result - r = (cl_double *)gOut_Ref + thread_id * buffer_elements; - s = (cl_double *)gIn + thread_id * buffer_elements; - s2 = (cl_double *)gIn2 + thread_id * buffer_elements; - for (j = 0; j < buffer_elements; j++) - r[j] = (cl_double)func.f_ff(s[j], s2[j]); - - // Read the data back -- no need to wait for the first N-1 buffers. This is - // an in order queue. 
- for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) - { - out[j] = (cl_ulong *)clEnqueueMapBuffer( - tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, - buffer_size, 0, NULL, NULL, &error); - if (error || NULL == out[j]) - { - vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, - error); - goto exit; - } - } - - // Wait for the last buffer - out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], - CL_TRUE, CL_MAP_READ, 0, - buffer_size, 0, NULL, NULL, &error); - if (error || NULL == out[j]) - { - vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); - goto exit; - } - - // Verify data - t = (cl_ulong *)r; - for (j = 0; j < buffer_elements; j++) - { - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - cl_ulong *q = out[k]; - - // If we aren't getting the correctly rounded result - if (t[j] != q[j]) - { - cl_double test = ((cl_double *)q)[j]; - long double correct = func.f_ff(s[j], s2[j]); - float err = Bruteforce_Ulp_Error_Double(test, correct); - int fail = !(fabsf(err) <= ulps); - - if (fail && ftz) - { - // retry per section 6.5.3.2 - if (IsDoubleResultSubnormal(correct, ulps)) - { - fail = fail && (test != 0.0f); - if (!fail) err = 0.0f; - } - - - // retry per section 6.5.3.3 - if (IsDoubleSubnormal(s[j])) - { - long double correct2 = func.f_ff(0.0, s2[j]); - long double correct3 = func.f_ff(-0.0, s2[j]); - float err2 = - Bruteforce_Ulp_Error_Double(test, correct2); - float err3 = - Bruteforce_Ulp_Error_Double(test, correct3); - fail = fail - && ((!(fabsf(err2) <= ulps)) - && (!(fabsf(err3) <= ulps))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - - // retry per section 6.5.3.4 - if (IsDoubleResultSubnormal(correct2, ulps) - || IsDoubleResultSubnormal(correct3, ulps)) - { - fail = fail && (test != 0.0f); - if (!fail) err = 0.0f; - } - - // try with both args as zero - if (IsDoubleSubnormal(s2[j])) - { - correct2 = func.f_ff(0.0, 0.0); 
- correct3 = func.f_ff(-0.0, 0.0); - long double correct4 = func.f_ff(0.0, -0.0); - long double correct5 = func.f_ff(-0.0, -0.0); - err2 = Bruteforce_Ulp_Error_Double(test, correct2); - err3 = Bruteforce_Ulp_Error_Double(test, correct3); - float err4 = - Bruteforce_Ulp_Error_Double(test, correct4); - float err5 = - Bruteforce_Ulp_Error_Double(test, correct5); - fail = fail - && ((!(fabsf(err2) <= ulps)) - && (!(fabsf(err3) <= ulps)) - && (!(fabsf(err4) <= ulps)) - && (!(fabsf(err5) <= ulps))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - if (fabsf(err4) < fabsf(err)) err = err4; - if (fabsf(err5) < fabsf(err)) err = err5; - - // retry per section 6.5.3.4 - if (IsDoubleResultSubnormal(correct2, ulps) - || IsDoubleResultSubnormal(correct3, ulps) - || IsDoubleResultSubnormal(correct4, ulps) - || IsDoubleResultSubnormal(correct5, ulps)) - { - fail = fail && (test != 0.0f); - if (!fail) err = 0.0f; - } - } - } - else if (IsDoubleSubnormal(s2[j])) - { - long double correct2 = func.f_ff(s[j], 0.0); - long double correct3 = func.f_ff(s[j], -0.0); - float err2 = - Bruteforce_Ulp_Error_Double(test, correct2); - float err3 = - Bruteforce_Ulp_Error_Double(test, correct3); - fail = fail - && ((!(fabsf(err2) <= ulps)) - && (!(fabsf(err3) <= ulps))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - - // retry per section 6.5.3.4 - if (IsDoubleResultSubnormal(correct2, ulps) - || IsDoubleResultSubnormal(correct3, ulps)) - { - fail = fail && (test != 0.0f); - if (!fail) err = 0.0f; - } - } - } - - if (fabsf(err) > tinfo->maxError) - { - tinfo->maxError = fabsf(err); - tinfo->maxErrorValue = s[j]; - tinfo->maxErrorValue2 = s2[j]; - } - if (fail) - { - vlog_error( - "\nERROR: %s%s: %f ulp error at {%a, %a}: *%a vs. 
%a\n", - name, sizeNames[k], err, s[j], s2[j], r[j], test); - error = -1; - goto exit; - } - } - } - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], - out[j], 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", - j, error); - return error; - } - } - - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); - - - if (0 == (base & 0x0fffffff)) - { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f " - "ThreadCount:%2u\n", - base, job->step, job->scale, buffer_elements, job->ulps, - job->threadCount); - } - else - { - vlog("."); - } - fflush(stdout); - } - -exit: - return error; -} diff --git a/test_conformance/math_brute_force/binary_two_results_i.cpp b/test_conformance/math_brute_force/binary_two_results_i.cpp deleted file mode 100644 index a20c0571..00000000 --- a/test_conformance/math_brute_force/binary_two_results_i.cpp +++ /dev/null @@ -1,1298 +0,0 @@ -// -// Copyright (c) 2017 The Khronos Group Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// - -#include "function_list.h" -#include "test_functions.h" -#include "utility.h" - -#include -#include - -static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) -{ - const char *c[] = { "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global float", - sizeNames[vectorSize], - "* out, __global int", - sizeNames[vectorSize], - "* out2, __global float", - sizeNames[vectorSize], - "* in1, __global float", - sizeNames[vectorSize], - "* in2 )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " out[i] = ", - name, - "( in1[i], in2[i], out2 + i );\n" - "}\n" }; - - const char *c3[] = { - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global float* out, __global int* out2, __global float* in, " - "__global float* in2)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " float3 f0 = vload3( 0, in + 3 * i );\n" - " float3 f1 = vload3( 0, in2 + 3 * i );\n" - " int3 i0 = 0xdeaddead;\n" - " f0 = ", - name, - "( f0, f1, &i0 );\n" - " vstore3( f0, 0, out + 3*i );\n" - " vstore3( i0, 0, out2 + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " - "buffer size \n" - " float3 f0;\n" - " float3 f1;\n" - " int3 i0 = 0xdeaddead;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (float3)( in[3*i], NAN, NAN ); \n" - " f1 = (float3)( in2[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" - " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " f0 = ", - name, - "( f0, f1, &i0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " out2[3*i+1] = i0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " out2[3*i] = i0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); -} - -static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) -{ - const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global double", - sizeNames[vectorSize], - "* out, __global int", - sizeNames[vectorSize], - "* out2, __global double", - sizeNames[vectorSize], - "* in1, __global double", - sizeNames[vectorSize], - "* in2 )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " out[i] = ", - name, - "( in1[i], in2[i], out2 + i );\n" - "}\n" }; - - const char *c3[] = { - "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global double* out, __global int* out2, __global double* in, " - "__global double* in2)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " double3 d0 = vload3( 0, in + 3 * i );\n" - " 
double3 d1 = vload3( 0, in2 + 3 * i );\n" - " int3 i0 = 0xdeaddead;\n" - " d0 = ", - name, - "( d0, d1, &i0 );\n" - " vstore3( d0, 0, out + 3*i );\n" - " vstore3( i0, 0, out2 + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two " - "buffer size \n" - " double3 d0;\n" - " double3 d1;\n" - " int3 i0 = 0xdeaddead;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " d0 = (double3)( in[3*i], NAN, NAN ); \n" - " d1 = (double3)( in2[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" - " d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " d0 = ", - name, - "( d0, d1, &i0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = d0.y; \n" - " out2[3*i+1] = i0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = d0.x; \n" - " out2[3*i] = i0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); -} - -typedef struct BuildKernelInfo -{ - cl_uint offset; // the first vector size to build - cl_kernel *kernels; - cl_program *programs; - const char *nameInCode; - bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-} BuildKernelInfo; - -static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); -} - -static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernelDouble(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); -} - -typedef struct ComputeReferenceInfoF_ -{ - const float *x; - const float *y; - float *r; - int *i; - double (*f_ffpI)(double, double, int *); - cl_uint lim; - cl_uint count; -} ComputeReferenceInfoF; - -typedef struct ComputeReferenceInfoD_ -{ - const double *x; - const double *y; - double *r; - int *i; - long double (*f_ffpI)(long double, long double, int *); - cl_uint lim; - cl_uint count; -} ComputeReferenceInfoD; - -static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo) -{ - ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo; - cl_uint lim = cri->lim; - cl_uint count = cri->count; - cl_uint off = jid * count; - const float *x = cri->x + off; - const float *y = cri->y + off; - float *r = cri->r + off; - int *i = cri->i + off; - double (*f)(double, double, int *) = cri->f_ffpI; - cl_uint j; - - if (off + count > lim) count = lim - off; - - for (j = 0; j < count; ++j) - r[j] = (float)f((double)x[j], (double)y[j], i + j); - - return CL_SUCCESS; -} - -static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo) -{ - ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo; - cl_uint lim = cri->lim; - cl_uint count = cri->count; - cl_uint off = jid * count; - const double *x = cri->x + off; - const double *y = cri->y + off; - double *r = cri->r + off; - int *i = cri->i + off; - long double (*f)(long double, long double, int *) = cri->f_ffpI; - 
cl_uint j; - - if (off + count > lim) count = lim - off; - - Force64BitFPUPrecision(); - - for (j = 0; j < count; ++j) - r[j] = (double)f((long double)x[j], (long double)y[j], i + j); - - return CL_SUCCESS; -} - -int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode) -{ - uint64_t i; - uint32_t j, k; - int error; - - logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); - - cl_program programs[VECTOR_SIZE_COUNT]; - cl_kernel kernels[VECTOR_SIZE_COUNT]; - float maxError = 0.0f; - int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); - int64_t maxError2 = 0; - float maxErrorVal = 0.0f; - float maxErrorVal2 = 0.0f; - size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE; - uint64_t step = getTestStep(sizeof(float), bufferSize); - - cl_uint threadCount = GetThreadCount(); - - float float_ulps; - if (gIsEmbedded) - float_ulps = f->float_embedded_ulps; - else - float_ulps = f->float_ulps; - - int testingRemquo = !strcmp(f->name, "remquo"); - - // Init the kernels - { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode, relaxedMode }; - if ((error = ThreadPool_Do(BuildKernel_FloatFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - return error; - } - - for (i = 0; i < (1ULL << 32); i += step) - { - // Init input array - cl_uint *p = (cl_uint *)gIn; - cl_uint *p2 = (cl_uint *)gIn2; - for (j = 0; j < bufferSize / sizeof(float); j++) - { - p[j] = genrand_int32(d); - p2[j] = genrand_int32(d); - } - - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - bufferSize, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); - return error; - } - - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0, - bufferSize, gIn2, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); - return error; - } - - // write garbage into output arrays - for (j = gMinVectorSizeIndex; j < 
gMaxVectorSizeIndex; j++) - { - uint32_t pattern = 0xffffdead; - memset_pattern4(gOut[j], &pattern, bufferSize); - if ((error = - clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, - bufferSize, gOut[j], 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", - error, j); - goto exit; - } - - memset_pattern4(gOut2[j], &pattern, bufferSize); - if ((error = - clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, - bufferSize, gOut2[j], 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", - error, j); - goto exit; - } - } - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - size_t vectorSize = sizeof(cl_float) * sizeValues[j]; - size_t localCount = (bufferSize + vectorSize - 1) - / vectorSize; // bufferSize / vectorSize rounded up - if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), - &gOutBuffer[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), - &gOutBuffer2[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), - &gInBuffer))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2), - &gInBuffer2))) - { - LogBuildError(programs[j]); - goto exit; - } - - if ((error = - clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, - &localCount, NULL, 0, NULL, NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - } - - // Get that moving - if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); - - // Calculate the correctly rounded reference result - float *s = (float *)gIn; - float *s2 = (float *)gIn2; - - if (threadCount > 1) - { - ComputeReferenceInfoF cri; - cri.x = s; - cri.y = s2; - cri.r = (float *)gOut_Ref; - cri.i = (int *)gOut_Ref2; - cri.f_ffpI = f->func.f_ffpI; - cri.lim = bufferSize / sizeof(float); - 
cri.count = (cri.lim + threadCount - 1) / threadCount; - ThreadPool_Do(ReferenceF, threadCount, &cri); - } - else - { - float *r = (float *)gOut_Ref; - int *r2 = (int *)gOut_Ref2; - for (j = 0; j < bufferSize / sizeof(float); j++) - r[j] = (float)f->func.f_ffpI(s[j], s2[j], r2 + j); - } - - // Read the data back - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - if ((error = - clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, - bufferSize, gOut[j], 0, NULL, NULL))) - { - vlog_error("ReadArray failed %d\n", error); - goto exit; - } - if ((error = - clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, - bufferSize, gOut2[j], 0, NULL, NULL))) - { - vlog_error("ReadArray2 failed %d\n", error); - goto exit; - } - } - - if (gSkipCorrectnessTesting) break; - - // Verify data - uint32_t *t = (uint32_t *)gOut_Ref; - int32_t *t2 = (int32_t *)gOut_Ref2; - for (j = 0; j < bufferSize / sizeof(float); j++) - { - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - uint32_t *q = (uint32_t *)(gOut[k]); - int32_t *q2 = (int32_t *)gOut2[k]; - - // Check for exact match to correctly rounded result - if (t[j] == q[j] && t2[j] == q2[j]) continue; - - // Check for paired NaNs - if ((t[j] & 0x7fffffff) > 0x7f800000 - && (q[j] & 0x7fffffff) > 0x7f800000 && t2[j] == q2[j]) - continue; - - float test = ((float *)q)[j]; - int correct2 = INT_MIN; - double correct = f->func.f_ffpI(s[j], s2[j], &correct2); - float err = Ulp_Error(test, correct); - int64_t iErr; - - // in case of remquo, we only care about the sign and last - // seven bits of integer as per the spec. - if (testingRemquo) - iErr = (long long)(q2[j] & 0x0000007f) - - (long long)(correct2 & 0x0000007f); - else - iErr = (long long)q2[j] - (long long)correct2; - - // For remquo, if y = 0, x is infinite, or either is NaN - // then the standard either neglects to say what is returned - // in iptr or leaves it undefined or implementation defined. 
- int iptrUndefined = fabs(((float *)gIn)[j]) == INFINITY - || ((float *)gIn2)[j] == 0.0f || isnan(((float *)gIn2)[j]) - || isnan(((float *)gIn)[j]); - if (iptrUndefined) iErr = 0; - - int fail = !(fabsf(err) <= float_ulps && iErr == 0); - if (ftz && fail) - { - // retry per section 6.5.3.2 - if (IsFloatResultSubnormal(correct, float_ulps)) - { - fail = fail && !(test == 0.0f && iErr == 0); - if (!fail) err = 0.0f; - } - - // retry per section 6.5.3.3 - if (IsFloatSubnormal(s[j])) - { - int correct3i, correct4i; - double correct3 = - f->func.f_ffpI(0.0, s2[j], &correct3i); - double correct4 = - f->func.f_ffpI(-0.0, s2[j], &correct4i); - float err2 = Ulp_Error(test, correct3); - float err3 = Ulp_Error(test, correct4); - int64_t iErr3 = (long long)q2[j] - (long long)correct3i; - int64_t iErr4 = (long long)q2[j] - (long long)correct4i; - fail = fail - && ((!(fabsf(err2) <= float_ulps && iErr3 == 0)) - && (!(fabsf(err3) <= float_ulps - && iErr4 == 0))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; - if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; - - // retry per section 6.5.3.4 - if (IsFloatResultSubnormal(correct2, float_ulps) - || IsFloatResultSubnormal(correct3, float_ulps)) - { - fail = fail - && !(test == 0.0f - && (iErr3 == 0 || iErr4 == 0)); - if (!fail) err = 0.0f; - } - - // try with both args as zero - if (IsFloatSubnormal(s2[j])) - { - int correct7i, correct8i; - correct3 = f->func.f_ffpI(0.0, 0.0, &correct3i); - correct4 = f->func.f_ffpI(-0.0, 0.0, &correct4i); - double correct7 = - f->func.f_ffpI(0.0, -0.0, &correct7i); - double correct8 = - f->func.f_ffpI(-0.0, -0.0, &correct8i); - err2 = Ulp_Error(test, correct3); - err3 = Ulp_Error(test, correct4); - float err4 = Ulp_Error(test, correct7); - float err5 = Ulp_Error(test, correct8); - iErr3 = (long long)q2[j] - (long long)correct3i; - iErr4 = (long long)q2[j] - (long long)correct4i; - int64_t iErr7 = - (long 
long)q2[j] - (long long)correct7i; - int64_t iErr8 = - (long long)q2[j] - (long long)correct8i; - fail = fail - && ((!(fabsf(err2) <= float_ulps && iErr3 == 0)) - && (!(fabsf(err3) <= float_ulps - && iErr4 == 0)) - && (!(fabsf(err4) <= float_ulps - && iErr7 == 0)) - && (!(fabsf(err5) <= float_ulps - && iErr8 == 0))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - if (fabsf(err4) < fabsf(err)) err = err4; - if (fabsf(err5) < fabsf(err)) err = err5; - if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; - if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; - if (llabs(iErr7) < llabs(iErr)) iErr = iErr7; - if (llabs(iErr8) < llabs(iErr)) iErr = iErr8; - - // retry per section 6.5.3.4 - if (IsFloatResultSubnormal(correct3, float_ulps) - || IsFloatResultSubnormal(correct4, float_ulps) - || IsFloatResultSubnormal(correct7, float_ulps) - || IsFloatResultSubnormal(correct8, float_ulps)) - { - fail = fail - && !(test == 0.0f - && (iErr3 == 0 || iErr4 == 0 - || iErr7 == 0 || iErr8 == 0)); - if (!fail) err = 0.0f; - } - } - } - else if (IsFloatSubnormal(s2[j])) - { - int correct3i, correct4i; - double correct3 = f->func.f_ffpI(s[j], 0.0, &correct3i); - double correct4 = - f->func.f_ffpI(s[j], -0.0, &correct4i); - float err2 = Ulp_Error(test, correct3); - float err3 = Ulp_Error(test, correct4); - int64_t iErr3 = (long long)q2[j] - (long long)correct3i; - int64_t iErr4 = (long long)q2[j] - (long long)correct4i; - fail = fail - && ((!(fabsf(err2) <= float_ulps && iErr3 == 0)) - && (!(fabsf(err3) <= float_ulps - && iErr4 == 0))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; - if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; - - // retry per section 6.5.3.4 - if (IsFloatResultSubnormal(correct2, float_ulps) - || IsFloatResultSubnormal(correct3, float_ulps)) - { - fail = fail - && !(test == 0.0f - && (iErr3 == 0 || iErr4 == 0)); - if (!fail) err = 0.0f; - } - } - } 
- if (fabsf(err) > maxError) - { - maxError = fabsf(err); - maxErrorVal = s[j]; - } - if (llabs(iErr) > maxError2) - { - maxError2 = llabs(iErr); - maxErrorVal2 = s[j]; - } - - if (fail) - { - vlog_error( - "\nERROR: %s%s: {%f, %lld} ulp error at {%a, %a} " - "({0x%8.8x, 0x%8.8x}): *{%a, %d} ({0x%8.8x, " - "0x%8.8x}) vs. {%a, %d} ({0x%8.8x, 0x%8.8x})\n", - f->name, sizeNames[k], err, iErr, ((float *)gIn)[j], - ((float *)gIn2)[j], ((cl_uint *)gIn)[j], - ((cl_uint *)gIn2)[j], ((float *)gOut_Ref)[j], - ((int *)gOut_Ref2)[j], ((cl_uint *)gOut_Ref)[j], - ((cl_uint *)gOut_Ref2)[j], test, q2[j], - ((cl_uint *)&test)[0], ((cl_uint *)q2)[j]); - error = -1; - goto exit; - } - } - } - - if (0 == (i & 0x0fffffff)) - { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, - bufferSize); - } - else - { - vlog("."); - } - fflush(stdout); - } - } - - if (!gSkipCorrectnessTesting) - { - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - if (gMeasureTimes) - { - // Init input array - cl_uint *p = (cl_uint *)gIn; - for (j = 0; j < bufferSize / sizeof(float); j++) - { - p[j] = genrand_int32(d); - } - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - bufferSize, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); - return error; - } - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, - bufferSize, gIn2, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); - return error; - } - - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - size_t vectorSize = sizeof(cl_float) * sizeValues[j]; - size_t localCount = (bufferSize + vectorSize - 1) - / vectorSize; // bufferSize / vectorSize rounded up - if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), - &gOutBuffer[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 1, 
sizeof(gOutBuffer2[j]), - &gOutBuffer2[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), - &gInBuffer))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2), - &gInBuffer2))) - { - LogBuildError(programs[j]); - goto exit; - } - - double sum = 0.0; - double bestTime = INFINITY; - for (k = 0; k < PERF_LOOP_COUNT; k++) - { - uint64_t startTime = GetTime(); - if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, - &localCount, NULL, 0, NULL, - NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - - // Make sure OpenCL is done - if ((error = clFinish(gQueue))) - { - vlog_error("Error %d at clFinish\n", error); - goto exit; - } - - uint64_t endTime = GetTime(); - double time = SubtractTime(endTime, startTime); - sum += time; - if (time < bestTime) bestTime = time; - } - - if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double)gDeviceFrequency - * gComputeDevices * gSimdSize * 1e6 - / (bufferSize / sizeof(float)); - vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", - f->name, sizeNames[j]); - } - } - - if (!gSkipCorrectnessTesting) - vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal); - vlog("\n"); - -exit: - // Release - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); - } - - return error; -} - -int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode) -{ - uint64_t i; - uint32_t j, k; - int error; - cl_program programs[VECTOR_SIZE_COUNT]; - cl_kernel kernels[VECTOR_SIZE_COUNT]; - float maxError = 0.0f; - int64_t maxError2 = 0; - int ftz = f->ftz || gForceFTZ; - double maxErrorVal = 0.0f; - double maxErrorVal2 = 0.0f; - size_t bufferSize = (gWimpyMode) ? 
gWimpyBufferSize : BUFFER_SIZE; - uint64_t step = getTestStep(sizeof(double), bufferSize); - - logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); - - cl_uint threadCount = GetThreadCount(); - - Force64BitFPUPrecision(); - - int testingRemquo = !strcmp(f->name, "remquo"); - - // Init the kernels - { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode, relaxedMode }; - if ((error = ThreadPool_Do(BuildKernel_DoubleFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - return error; - } - - for (i = 0; i < (1ULL << 32); i += step) - { - // Init input array - double *p = (double *)gIn; - double *p2 = (double *)gIn2; - for (j = 0; j < bufferSize / sizeof(double); j++) - { - p[j] = DoubleFromUInt32(genrand_int32(d)); - p2[j] = DoubleFromUInt32(genrand_int32(d)); - } - - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0, - bufferSize, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); - return error; - } - - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0, - bufferSize, gIn2, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); - return error; - } - - // write garbage into output arrays - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - uint32_t pattern = 0xffffdead; - memset_pattern4(gOut[j], &pattern, bufferSize); - if ((error = - clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, - bufferSize, gOut[j], 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", - error, j); - goto exit; - } - - memset_pattern4(gOut2[j], &pattern, bufferSize); - if ((error = - clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, - bufferSize, gOut2[j], 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", - error, j); - goto exit; - } - } - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - size_t 
vectorSize = sizeof(cl_double) * sizeValues[j]; - size_t localCount = (bufferSize + vectorSize - 1) - / vectorSize; // bufferSize / vectorSize rounded up - if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), - &gOutBuffer[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), - &gOutBuffer2[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), - &gInBuffer))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2), - &gInBuffer2))) - { - LogBuildError(programs[j]); - goto exit; - } - - if ((error = - clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, - &localCount, NULL, 0, NULL, NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - } - - // Get that moving - if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); - - // Calculate the correctly rounded reference result - double *s = (double *)gIn; - double *s2 = (double *)gIn2; - - if (threadCount > 1) - { - ComputeReferenceInfoD cri; - cri.x = s; - cri.y = s2; - cri.r = (double *)gOut_Ref; - cri.i = (int *)gOut_Ref2; - cri.f_ffpI = f->dfunc.f_ffpI; - cri.lim = bufferSize / sizeof(double); - cri.count = (cri.lim + threadCount - 1) / threadCount; - ThreadPool_Do(ReferenceD, threadCount, &cri); - } - else - { - double *r = (double *)gOut_Ref; - int *r2 = (int *)gOut_Ref2; - for (j = 0; j < bufferSize / sizeof(double); j++) - r[j] = (double)f->dfunc.f_ffpI(s[j], s2[j], r2 + j); - } - - // Read the data back - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - if ((error = - clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, - bufferSize, gOut[j], 0, NULL, NULL))) - { - vlog_error("ReadArray failed %d\n", error); - goto exit; - } - if ((error = - clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, - bufferSize, gOut2[j], 0, NULL, NULL))) - { - 
vlog_error("ReadArray2 failed %d\n", error); - goto exit; - } - } - - if (gSkipCorrectnessTesting) break; - - // Verify data - uint64_t *t = (uint64_t *)gOut_Ref; - int32_t *t2 = (int32_t *)gOut_Ref2; - for (j = 0; j < bufferSize / sizeof(double); j++) - { - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - uint64_t *q = (uint64_t *)gOut[k]; - int32_t *q2 = (int32_t *)gOut2[k]; - - // Check for exact match to correctly rounded result - if (t[j] == q[j] && t2[j] == q2[j]) continue; - - // Check for paired NaNs - if ((t[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL - && (q[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL - && t2[j] == q2[j]) - continue; - - double test = ((double *)q)[j]; - int correct2 = INT_MIN; - long double correct = f->dfunc.f_ffpI(s[j], s2[j], &correct2); - float err = Bruteforce_Ulp_Error_Double(test, correct); - int64_t iErr; - - // in case of remquo, we only care about the sign and last - // seven bits of integer as per the spec. - if (testingRemquo) - iErr = (long long)(q2[j] & 0x0000007f) - - (long long)(correct2 & 0x0000007f); - else - iErr = (long long)q2[j] - (long long)correct2; - - // For remquo, if y = 0, x is infinite, or either is NaN - // then the standard either neglects to say what is returned - // in iptr or leaves it undefined or implementation defined. 
- int iptrUndefined = fabs(((double *)gIn)[j]) == INFINITY - || ((double *)gIn2)[j] == 0.0 || isnan(((double *)gIn2)[j]) - || isnan(((double *)gIn)[j]); - if (iptrUndefined) iErr = 0; - - int fail = !(fabsf(err) <= f->double_ulps && iErr == 0); - if (ftz && fail) - { - // retry per section 6.5.3.2 - if (IsDoubleResultSubnormal(correct, f->double_ulps)) - { - fail = fail && !(test == 0.0f && iErr == 0); - if (!fail) err = 0.0f; - } - - // retry per section 6.5.3.3 - if (IsDoubleSubnormal(s[j])) - { - int correct3i, correct4i; - long double correct3 = - f->dfunc.f_ffpI(0.0, s2[j], &correct3i); - long double correct4 = - f->dfunc.f_ffpI(-0.0, s2[j], &correct4i); - float err2 = - Bruteforce_Ulp_Error_Double(test, correct3); - float err3 = - Bruteforce_Ulp_Error_Double(test, correct4); - int64_t iErr3 = (long long)q2[j] - (long long)correct3i; - int64_t iErr4 = (long long)q2[j] - (long long)correct4i; - fail = fail - && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0)) - && (!(fabsf(err3) <= f->double_ulps - && iErr4 == 0))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; - if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; - - // retry per section 6.5.3.4 - if (IsDoubleResultSubnormal(correct2, f->double_ulps) - || IsDoubleResultSubnormal(correct3, - f->double_ulps)) - { - fail = fail - && !(test == 0.0f - && (iErr3 == 0 || iErr4 == 0)); - if (!fail) err = 0.0f; - } - - // try with both args as zero - if (IsDoubleSubnormal(s2[j])) - { - int correct7i, correct8i; - correct3 = f->dfunc.f_ffpI(0.0, 0.0, &correct3i); - correct4 = f->dfunc.f_ffpI(-0.0, 0.0, &correct4i); - long double correct7 = - f->dfunc.f_ffpI(0.0, -0.0, &correct7i); - long double correct8 = - f->dfunc.f_ffpI(-0.0, -0.0, &correct8i); - err2 = Bruteforce_Ulp_Error_Double(test, correct3); - err3 = Bruteforce_Ulp_Error_Double(test, correct4); - float err4 = - Bruteforce_Ulp_Error_Double(test, correct7); - float err5 = - 
Bruteforce_Ulp_Error_Double(test, correct8); - iErr3 = (long long)q2[j] - (long long)correct3i; - iErr4 = (long long)q2[j] - (long long)correct4i; - int64_t iErr7 = - (long long)q2[j] - (long long)correct7i; - int64_t iErr8 = - (long long)q2[j] - (long long)correct8i; - fail = fail - && ((!(fabsf(err2) <= f->double_ulps - && iErr3 == 0)) - && (!(fabsf(err3) <= f->double_ulps - && iErr4 == 0)) - && (!(fabsf(err4) <= f->double_ulps - && iErr7 == 0)) - && (!(fabsf(err5) <= f->double_ulps - && iErr8 == 0))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - if (fabsf(err4) < fabsf(err)) err = err4; - if (fabsf(err5) < fabsf(err)) err = err5; - if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; - if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; - if (llabs(iErr7) < llabs(iErr)) iErr = iErr7; - if (llabs(iErr8) < llabs(iErr)) iErr = iErr8; - - // retry per section 6.5.3.4 - if (IsDoubleResultSubnormal(correct3, - f->double_ulps) - || IsDoubleResultSubnormal(correct4, - f->double_ulps) - || IsDoubleResultSubnormal(correct7, - f->double_ulps) - || IsDoubleResultSubnormal(correct8, - f->double_ulps)) - { - fail = fail - && !(test == 0.0f - && (iErr3 == 0 || iErr4 == 0 - || iErr7 == 0 || iErr8 == 0)); - if (!fail) err = 0.0f; - } - } - } - else if (IsDoubleSubnormal(s2[j])) - { - int correct3i, correct4i; - long double correct3 = - f->dfunc.f_ffpI(s[j], 0.0, &correct3i); - long double correct4 = - f->dfunc.f_ffpI(s[j], -0.0, &correct4i); - float err2 = - Bruteforce_Ulp_Error_Double(test, correct3); - float err3 = - Bruteforce_Ulp_Error_Double(test, correct4); - int64_t iErr3 = (long long)q2[j] - (long long)correct3i; - int64_t iErr4 = (long long)q2[j] - (long long)correct4i; - fail = fail - && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0)) - && (!(fabsf(err3) <= f->double_ulps - && iErr4 == 0))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; - if 
(llabs(iErr4) < llabs(iErr)) iErr = iErr4; - - // retry per section 6.5.3.4 - if (IsDoubleResultSubnormal(correct2, f->double_ulps) - || IsDoubleResultSubnormal(correct3, - f->double_ulps)) - { - fail = fail - && !(test == 0.0f - && (iErr3 == 0 || iErr4 == 0)); - if (!fail) err = 0.0f; - } - } - } - if (fabsf(err) > maxError) - { - maxError = fabsf(err); - maxErrorVal = s[j]; - } - if (llabs(iErr) > maxError2) - { - maxError2 = llabs(iErr); - maxErrorVal2 = s[j]; - } - - if (fail) - { - vlog_error( - "\nERROR: %sD%s: {%f, %lld} ulp error at {%.13la, " - "%.13la} ({ 0x%16.16llx, 0x%16.16llx}): *{%.13la, " - "%d} ({ 0x%16.16llx, 0x%8.8x}) vs. {%.13la, %d} ({ " - "0x%16.16llx, 0x%8.8x})\n", - f->name, sizeNames[k], err, iErr, - ((double *)gIn)[j], ((double *)gIn2)[j], - ((cl_ulong *)gIn)[j], ((cl_ulong *)gIn2)[j], - ((double *)gOut_Ref)[j], ((int *)gOut_Ref2)[j], - ((cl_ulong *)gOut_Ref)[j], - ((cl_uint *)gOut_Ref2)[j], test, q2[j], - ((cl_ulong *)q)[j], ((cl_uint *)q2)[j]); - error = -1; - goto exit; - } - } - } - - if (0 == (i & 0x0fffffff)) - { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, - bufferSize); - } - else - { - vlog("."); - } - fflush(stdout); - } - } - - if (!gSkipCorrectnessTesting) - { - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - if (gMeasureTimes) - { - // Init input array - double *p = (double *)gIn; - for (j = 0; j < bufferSize / sizeof(cl_double); j++) - p[j] = DoubleFromUInt32(genrand_int32(d)); - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0, - bufferSize, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); - return error; - } - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0, - bufferSize, gIn2, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); - return error; - } - - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - 
size_t vectorSize = sizeof(cl_double) * sizeValues[j]; - size_t localCount = (bufferSize + vectorSize - 1) - / vectorSize; // bufferSize / vectorSize rounded up - if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), - &gOutBuffer[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), - &gOutBuffer2[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), - &gInBuffer))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2), - &gInBuffer2))) - { - LogBuildError(programs[j]); - goto exit; - } - - double sum = 0.0; - double bestTime = INFINITY; - for (k = 0; k < PERF_LOOP_COUNT; k++) - { - uint64_t startTime = GetTime(); - if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, - &localCount, NULL, 0, NULL, - NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - - // Make sure OpenCL is done - if ((error = clFinish(gQueue))) - { - vlog_error("Error %d at clFinish\n", error); - goto exit; - } - - uint64_t endTime = GetTime(); - double time = SubtractTime(endTime, startTime); - sum += time; - if (time < bestTime) bestTime = time; - } - - if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double)gDeviceFrequency - * gComputeDevices * gSimdSize * 1e6 - / (bufferSize / sizeof(double)); - vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", - f->name, sizeNames[j]); - } - for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); - } - - if (!gSkipCorrectnessTesting) - vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal); - vlog("\n"); - -exit: - // Release - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); - } - - return error; -} diff --git 
a/test_conformance/math_brute_force/binary_two_results_i_double.cpp b/test_conformance/math_brute_force/binary_two_results_i_double.cpp new file mode 100644 index 00000000..5f1ba3b2 --- /dev/null +++ b/test_conformance/math_brute_force/binary_two_results_i_double.cpp @@ -0,0 +1,671 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include +#include + +static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, + cl_program *p, bool relaxedMode) +{ + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double", + sizeNames[vectorSize], + "* out, __global int", + sizeNames[vectorSize], + "* out2, __global double", + sizeNames[vectorSize], + "* in1, __global double", + sizeNames[vectorSize], + "* in2 )\n" + "{\n" + " size_t i = get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i], out2 + i );\n" + "}\n" }; + + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double* out, __global int* out2, __global double* in, " + "__global double* in2)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " double3 d0 = vload3( 0, in + 3 * i );\n" + " double3 d1 = vload3( 0, in2 + 3 * i 
);\n" + " int3 i0 = 0xdeaddead;\n" + " d0 = ", + name, + "( d0, d1, &i0 );\n" + " vstore3( d0, 0, out + 3*i );\n" + " vstore3( i0, 0, out2 + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two " + "buffer size \n" + " double3 d0;\n" + " double3 d1;\n" + " int3 i0 = 0xdeaddead;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " d0 = (double3)( in[3*i], NAN, NAN ); \n" + " d1 = (double3)( in2[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" + " d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " d0 = ", + name, + "( d0, d1, &i0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = d0.y; \n" + " out2[3*i+1] = i0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = d0.x; \n" + " out2[3*i] = i0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); + + return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); +} + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_kernel *kernels; + cl_program *programs; + const char *nameInCode; + bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
+} BuildKernelInfo; + +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernelDouble(info->nameInCode, i, info->kernels + i, + info->programs + i, info->relaxedMode); +} + +typedef struct ComputeReferenceInfoD_ +{ + const double *x; + const double *y; + double *r; + int *i; + long double (*f_ffpI)(long double, long double, int *); + cl_uint lim; + cl_uint count; +} ComputeReferenceInfoD; + +static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo) +{ + ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo; + cl_uint lim = cri->lim; + cl_uint count = cri->count; + cl_uint off = jid * count; + const double *x = cri->x + off; + const double *y = cri->y + off; + double *r = cri->r + off; + int *i = cri->i + off; + long double (*f)(long double, long double, int *) = cri->f_ffpI; + cl_uint j; + + if (off + count > lim) count = lim - off; + + Force64BitFPUPrecision(); + + for (j = 0; j < count; ++j) + r[j] = (double)f((long double)x[j], (long double)y[j], i + j); + + return CL_SUCCESS; +} + +int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode) +{ + uint64_t i; + uint32_t j, k; + int error; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; + float maxError = 0.0f; + int64_t maxError2 = 0; + int ftz = f->ftz || gForceFTZ; + double maxErrorVal = 0.0f; + double maxErrorVal2 = 0.0f; + size_t bufferSize = (gWimpyMode) ? 
gWimpyBufferSize : BUFFER_SIZE; + uint64_t step = getTestStep(sizeof(double), bufferSize); + + logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); + + cl_uint threadCount = GetThreadCount(); + + Force64BitFPUPrecision(); + + int testingRemquo = !strcmp(f->name, "remquo"); + + // Init the kernels + { + BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, + f->nameInCode, relaxedMode }; + if ((error = ThreadPool_Do(BuildKernel_DoubleFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + } + + for (i = 0; i < (1ULL << 32); i += step) + { + // Init input array + double *p = (double *)gIn; + double *p2 = (double *)gIn2; + for (j = 0; j < bufferSize / sizeof(double); j++) + { + p[j] = DoubleFromUInt32(genrand_int32(d)); + p2[j] = DoubleFromUInt32(genrand_int32(d)); + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0, + bufferSize, gIn2, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); + return error; + } + + // write garbage into output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xffffdead; + memset_pattern4(gOut[j], &pattern, bufferSize); + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + goto exit; + } + + memset_pattern4(gOut2[j], &pattern, bufferSize); + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", + error, j); + goto exit; + } + } + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t 
vectorSize = sizeof(cl_double) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), + &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2), + &gInBuffer2))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); + + // Calculate the correctly rounded reference result + double *s = (double *)gIn; + double *s2 = (double *)gIn2; + + if (threadCount > 1) + { + ComputeReferenceInfoD cri; + cri.x = s; + cri.y = s2; + cri.r = (double *)gOut_Ref; + cri.i = (int *)gOut_Ref2; + cri.f_ffpI = f->dfunc.f_ffpI; + cri.lim = bufferSize / sizeof(double); + cri.count = (cri.lim + threadCount - 1) / threadCount; + ThreadPool_Do(ReferenceD, threadCount, &cri); + } + else + { + double *r = (double *)gOut_Ref; + int *r2 = (int *)gOut_Ref2; + for (j = 0; j < bufferSize / sizeof(double); j++) + r[j] = (double)f->dfunc.f_ffpI(s[j], s2[j], r2 + j); + } + + // Read the data back + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + goto exit; + } + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) + { + 
vlog_error("ReadArray2 failed %d\n", error); + goto exit; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data + uint64_t *t = (uint64_t *)gOut_Ref; + int32_t *t2 = (int32_t *)gOut_Ref2; + for (j = 0; j < bufferSize / sizeof(double); j++) + { + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + uint64_t *q = (uint64_t *)gOut[k]; + int32_t *q2 = (int32_t *)gOut2[k]; + + // Check for exact match to correctly rounded result + if (t[j] == q[j] && t2[j] == q2[j]) continue; + + // Check for paired NaNs + if ((t[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL + && (q[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL + && t2[j] == q2[j]) + continue; + + double test = ((double *)q)[j]; + int correct2 = INT_MIN; + long double correct = f->dfunc.f_ffpI(s[j], s2[j], &correct2); + float err = Bruteforce_Ulp_Error_Double(test, correct); + int64_t iErr; + + // in case of remquo, we only care about the sign and last + // seven bits of integer as per the spec. + if (testingRemquo) + iErr = (long long)(q2[j] & 0x0000007f) + - (long long)(correct2 & 0x0000007f); + else + iErr = (long long)q2[j] - (long long)correct2; + + // For remquo, if y = 0, x is infinite, or either is NaN + // then the standard either neglects to say what is returned + // in iptr or leaves it undefined or implementation defined. 
+ int iptrUndefined = fabs(((double *)gIn)[j]) == INFINITY + || ((double *)gIn2)[j] == 0.0 || isnan(((double *)gIn2)[j]) + || isnan(((double *)gIn)[j]); + if (iptrUndefined) iErr = 0; + + int fail = !(fabsf(err) <= f->double_ulps && iErr == 0); + if (ftz && fail) + { + // retry per section 6.5.3.2 + if (IsDoubleResultSubnormal(correct, f->double_ulps)) + { + fail = fail && !(test == 0.0f && iErr == 0); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsDoubleSubnormal(s[j])) + { + int correct3i, correct4i; + long double correct3 = + f->dfunc.f_ffpI(0.0, s2[j], &correct3i); + long double correct4 = + f->dfunc.f_ffpI(-0.0, s2[j], &correct4i); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct3); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct4); + int64_t iErr3 = (long long)q2[j] - (long long)correct3i; + int64_t iErr4 = (long long)q2[j] - (long long)correct4i; + fail = fail + && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0)) + && (!(fabsf(err3) <= f->double_ulps + && iErr4 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; + + // retry per section 6.5.3.4 + if (IsDoubleResultSubnormal(correct2, f->double_ulps) + || IsDoubleResultSubnormal(correct3, + f->double_ulps)) + { + fail = fail + && !(test == 0.0f + && (iErr3 == 0 || iErr4 == 0)); + if (!fail) err = 0.0f; + } + + // try with both args as zero + if (IsDoubleSubnormal(s2[j])) + { + int correct7i, correct8i; + correct3 = f->dfunc.f_ffpI(0.0, 0.0, &correct3i); + correct4 = f->dfunc.f_ffpI(-0.0, 0.0, &correct4i); + long double correct7 = + f->dfunc.f_ffpI(0.0, -0.0, &correct7i); + long double correct8 = + f->dfunc.f_ffpI(-0.0, -0.0, &correct8i); + err2 = Bruteforce_Ulp_Error_Double(test, correct3); + err3 = Bruteforce_Ulp_Error_Double(test, correct4); + float err4 = + Bruteforce_Ulp_Error_Double(test, correct7); + float err5 = + 
Bruteforce_Ulp_Error_Double(test, correct8); + iErr3 = (long long)q2[j] - (long long)correct3i; + iErr4 = (long long)q2[j] - (long long)correct4i; + int64_t iErr7 = + (long long)q2[j] - (long long)correct7i; + int64_t iErr8 = + (long long)q2[j] - (long long)correct8i; + fail = fail + && ((!(fabsf(err2) <= f->double_ulps + && iErr3 == 0)) + && (!(fabsf(err3) <= f->double_ulps + && iErr4 == 0)) + && (!(fabsf(err4) <= f->double_ulps + && iErr7 == 0)) + && (!(fabsf(err5) <= f->double_ulps + && iErr8 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; + if (llabs(iErr7) < llabs(iErr)) iErr = iErr7; + if (llabs(iErr8) < llabs(iErr)) iErr = iErr8; + + // retry per section 6.5.3.4 + if (IsDoubleResultSubnormal(correct3, + f->double_ulps) + || IsDoubleResultSubnormal(correct4, + f->double_ulps) + || IsDoubleResultSubnormal(correct7, + f->double_ulps) + || IsDoubleResultSubnormal(correct8, + f->double_ulps)) + { + fail = fail + && !(test == 0.0f + && (iErr3 == 0 || iErr4 == 0 + || iErr7 == 0 || iErr8 == 0)); + if (!fail) err = 0.0f; + } + } + } + else if (IsDoubleSubnormal(s2[j])) + { + int correct3i, correct4i; + long double correct3 = + f->dfunc.f_ffpI(s[j], 0.0, &correct3i); + long double correct4 = + f->dfunc.f_ffpI(s[j], -0.0, &correct4i); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct3); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct4); + int64_t iErr3 = (long long)q2[j] - (long long)correct3i; + int64_t iErr4 = (long long)q2[j] - (long long)correct4i; + fail = fail + && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0)) + && (!(fabsf(err3) <= f->double_ulps + && iErr4 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if 
(llabs(iErr4) < llabs(iErr)) iErr = iErr4; + + // retry per section 6.5.3.4 + if (IsDoubleResultSubnormal(correct2, f->double_ulps) + || IsDoubleResultSubnormal(correct3, + f->double_ulps)) + { + fail = fail + && !(test == 0.0f + && (iErr3 == 0 || iErr4 == 0)); + if (!fail) err = 0.0f; + } + } + } + if (fabsf(err) > maxError) + { + maxError = fabsf(err); + maxErrorVal = s[j]; + } + if (llabs(iErr) > maxError2) + { + maxError2 = llabs(iErr); + maxErrorVal2 = s[j]; + } + + if (fail) + { + vlog_error( + "\nERROR: %sD%s: {%f, %lld} ulp error at {%.13la, " + "%.13la} ({ 0x%16.16llx, 0x%16.16llx}): *{%.13la, " + "%d} ({ 0x%16.16llx, 0x%8.8x}) vs. {%.13la, %d} ({ " + "0x%16.16llx, 0x%8.8x})\n", + f->name, sizeNames[k], err, iErr, ((double *)gIn)[j], + ((double *)gIn2)[j], ((cl_ulong *)gIn)[j], + ((cl_ulong *)gIn2)[j], ((double *)gOut_Ref)[j], + ((int *)gOut_Ref2)[j], ((cl_ulong *)gOut_Ref)[j], + ((cl_uint *)gOut_Ref2)[j], test, q2[j], + ((cl_ulong *)q)[j], ((cl_uint *)q2)[j]); + error = -1; + goto exit; + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, + bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + if (gMeasureTimes) + { + // Init input array + double *p = (double *)gIn; + for (j = 0; j < bufferSize / sizeof(cl_double); j++) + p[j] = DoubleFromUInt32(genrand_int32(d)); + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0, + bufferSize, gIn2, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); + return error; + } + + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + 
size_t vectorSize = sizeof(cl_double) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), + &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2), + &gInBuffer2))) + { + LogBuildError(programs[j]); + goto exit; + } + + double sum = 0.0; + double bestTime = INFINITY; + for (k = 0; k < PERF_LOOP_COUNT; k++) + { + uint64_t startTime = GetTime(); + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + + // Make sure OpenCL is done + if ((error = clFinish(gQueue))) + { + vlog_error("Error %d at clFinish\n", error); + goto exit; + } + + uint64_t endTime = GetTime(); + double time = SubtractTime(endTime, startTime); + sum += time; + if (time < bestTime) bestTime = time; + } + + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(double)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", + f->name, sizeNames[j]); + } + for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); + } + + if (!gSkipCorrectnessTesting) + vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal); + vlog("\n"); + +exit: + // Release + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + clReleaseKernel(kernels[k]); + clReleaseProgram(programs[k]); + } + + return error; +} diff --git 
a/test_conformance/math_brute_force/binary_two_results_i_float.cpp b/test_conformance/math_brute_force/binary_two_results_i_float.cpp new file mode 100644 index 00000000..4ea7a85d --- /dev/null +++ b/test_conformance/math_brute_force/binary_two_results_i_float.cpp @@ -0,0 +1,657 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include +#include + +static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, + cl_program *p, bool relaxedMode) +{ + const char *c[] = { "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float", + sizeNames[vectorSize], + "* out, __global int", + sizeNames[vectorSize], + "* out2, __global float", + sizeNames[vectorSize], + "* in1, __global float", + sizeNames[vectorSize], + "* in2 )\n" + "{\n" + " size_t i = get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i], out2 + i );\n" + "}\n" }; + + const char *c3[] = { + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float* out, __global int* out2, __global float* in, " + "__global float* in2)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " float3 f0 = vload3( 0, in + 3 * i );\n" + " float3 f1 = vload3( 0, in2 + 3 * i );\n" + " int3 i0 = 0xdeaddead;\n" + " f0 = ", + name, + "( f0, f1, &i0 );\n" + " vstore3( f0, 0, out + 3*i );\n" + " 
vstore3( i0, 0, out2 + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two " + "buffer size \n" + " float3 f0;\n" + " float3 f1;\n" + " int3 i0 = 0xdeaddead;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (float3)( in[3*i], NAN, NAN ); \n" + " f1 = (float3)( in2[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" + " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " f0 = ", + name, + "( f0, f1, &i0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " out2[3*i+1] = i0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " out2[3*i] = i0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); + + return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); +} + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_kernel *kernels; + cl_program *programs; + const char *nameInCode; + bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
+} BuildKernelInfo; + +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernel(info->nameInCode, i, info->kernels + i, + info->programs + i, info->relaxedMode); +} + +typedef struct ComputeReferenceInfoF_ +{ + const float *x; + const float *y; + float *r; + int *i; + double (*f_ffpI)(double, double, int *); + cl_uint lim; + cl_uint count; +} ComputeReferenceInfoF; + +static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo) +{ + ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo; + cl_uint lim = cri->lim; + cl_uint count = cri->count; + cl_uint off = jid * count; + const float *x = cri->x + off; + const float *y = cri->y + off; + float *r = cri->r + off; + int *i = cri->i + off; + double (*f)(double, double, int *) = cri->f_ffpI; + cl_uint j; + + if (off + count > lim) count = lim - off; + + for (j = 0; j < count; ++j) + r[j] = (float)f((double)x[j], (double)y[j], i + j); + + return CL_SUCCESS; +} + +int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode) +{ + uint64_t i; + uint32_t j, k; + int error; + + logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); + + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; + float maxError = 0.0f; + int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); + int64_t maxError2 = 0; + float maxErrorVal = 0.0f; + float maxErrorVal2 = 0.0f; + size_t bufferSize = (gWimpyMode) ? 
gWimpyBufferSize : BUFFER_SIZE; + uint64_t step = getTestStep(sizeof(float), bufferSize); + + cl_uint threadCount = GetThreadCount(); + + float float_ulps; + if (gIsEmbedded) + float_ulps = f->float_embedded_ulps; + else + float_ulps = f->float_ulps; + + int testingRemquo = !strcmp(f->name, "remquo"); + + // Init the kernels + { + BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, + f->nameInCode, relaxedMode }; + if ((error = ThreadPool_Do(BuildKernel_FloatFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + } + + for (i = 0; i < (1ULL << 32); i += step) + { + // Init input array + cl_uint *p = (cl_uint *)gIn; + cl_uint *p2 = (cl_uint *)gIn2; + for (j = 0; j < bufferSize / sizeof(float); j++) + { + p[j] = genrand_int32(d); + p2[j] = genrand_int32(d); + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0, + bufferSize, gIn2, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); + return error; + } + + // write garbage into output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xffffdead; + memset_pattern4(gOut[j], &pattern, bufferSize); + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + goto exit; + } + + memset_pattern4(gOut2[j], &pattern, bufferSize); + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", + error, j); + goto exit; + } + } + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t 
vectorSize = sizeof(cl_float) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), + &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2), + &gInBuffer2))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); + + // Calculate the correctly rounded reference result + float *s = (float *)gIn; + float *s2 = (float *)gIn2; + + if (threadCount > 1) + { + ComputeReferenceInfoF cri; + cri.x = s; + cri.y = s2; + cri.r = (float *)gOut_Ref; + cri.i = (int *)gOut_Ref2; + cri.f_ffpI = f->func.f_ffpI; + cri.lim = bufferSize / sizeof(float); + cri.count = (cri.lim + threadCount - 1) / threadCount; + ThreadPool_Do(ReferenceF, threadCount, &cri); + } + else + { + float *r = (float *)gOut_Ref; + int *r2 = (int *)gOut_Ref2; + for (j = 0; j < bufferSize / sizeof(float); j++) + r[j] = (float)f->func.f_ffpI(s[j], s2[j], r2 + j); + } + + // Read the data back + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + goto exit; + } + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) + { + vlog_error("ReadArray2 
failed %d\n", error); + goto exit; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data + uint32_t *t = (uint32_t *)gOut_Ref; + int32_t *t2 = (int32_t *)gOut_Ref2; + for (j = 0; j < bufferSize / sizeof(float); j++) + { + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + uint32_t *q = (uint32_t *)(gOut[k]); + int32_t *q2 = (int32_t *)gOut2[k]; + + // Check for exact match to correctly rounded result + if (t[j] == q[j] && t2[j] == q2[j]) continue; + + // Check for paired NaNs + if ((t[j] & 0x7fffffff) > 0x7f800000 + && (q[j] & 0x7fffffff) > 0x7f800000 && t2[j] == q2[j]) + continue; + + float test = ((float *)q)[j]; + int correct2 = INT_MIN; + double correct = f->func.f_ffpI(s[j], s2[j], &correct2); + float err = Ulp_Error(test, correct); + int64_t iErr; + + // in case of remquo, we only care about the sign and last + // seven bits of integer as per the spec. + if (testingRemquo) + iErr = (long long)(q2[j] & 0x0000007f) + - (long long)(correct2 & 0x0000007f); + else + iErr = (long long)q2[j] - (long long)correct2; + + // For remquo, if y = 0, x is infinite, or either is NaN + // then the standard either neglects to say what is returned + // in iptr or leaves it undefined or implementation defined. 
+ int iptrUndefined = fabs(((float *)gIn)[j]) == INFINITY + || ((float *)gIn2)[j] == 0.0f || isnan(((float *)gIn2)[j]) + || isnan(((float *)gIn)[j]); + if (iptrUndefined) iErr = 0; + + int fail = !(fabsf(err) <= float_ulps && iErr == 0); + if (ftz && fail) + { + // retry per section 6.5.3.2 + if (IsFloatResultSubnormal(correct, float_ulps)) + { + fail = fail && !(test == 0.0f && iErr == 0); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsFloatSubnormal(s[j])) + { + int correct3i, correct4i; + double correct3 = + f->func.f_ffpI(0.0, s2[j], &correct3i); + double correct4 = + f->func.f_ffpI(-0.0, s2[j], &correct4i); + float err2 = Ulp_Error(test, correct3); + float err3 = Ulp_Error(test, correct4); + int64_t iErr3 = (long long)q2[j] - (long long)correct3i; + int64_t iErr4 = (long long)q2[j] - (long long)correct4i; + fail = fail + && ((!(fabsf(err2) <= float_ulps && iErr3 == 0)) + && (!(fabsf(err3) <= float_ulps + && iErr4 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; + + // retry per section 6.5.3.4 + if (IsFloatResultSubnormal(correct2, float_ulps) + || IsFloatResultSubnormal(correct3, float_ulps)) + { + fail = fail + && !(test == 0.0f + && (iErr3 == 0 || iErr4 == 0)); + if (!fail) err = 0.0f; + } + + // try with both args as zero + if (IsFloatSubnormal(s2[j])) + { + int correct7i, correct8i; + correct3 = f->func.f_ffpI(0.0, 0.0, &correct3i); + correct4 = f->func.f_ffpI(-0.0, 0.0, &correct4i); + double correct7 = + f->func.f_ffpI(0.0, -0.0, &correct7i); + double correct8 = + f->func.f_ffpI(-0.0, -0.0, &correct8i); + err2 = Ulp_Error(test, correct3); + err3 = Ulp_Error(test, correct4); + float err4 = Ulp_Error(test, correct7); + float err5 = Ulp_Error(test, correct8); + iErr3 = (long long)q2[j] - (long long)correct3i; + iErr4 = (long long)q2[j] - (long long)correct4i; + int64_t iErr7 = + (long 
long)q2[j] - (long long)correct7i; + int64_t iErr8 = + (long long)q2[j] - (long long)correct8i; + fail = fail + && ((!(fabsf(err2) <= float_ulps && iErr3 == 0)) + && (!(fabsf(err3) <= float_ulps + && iErr4 == 0)) + && (!(fabsf(err4) <= float_ulps + && iErr7 == 0)) + && (!(fabsf(err5) <= float_ulps + && iErr8 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; + if (llabs(iErr7) < llabs(iErr)) iErr = iErr7; + if (llabs(iErr8) < llabs(iErr)) iErr = iErr8; + + // retry per section 6.5.3.4 + if (IsFloatResultSubnormal(correct3, float_ulps) + || IsFloatResultSubnormal(correct4, float_ulps) + || IsFloatResultSubnormal(correct7, float_ulps) + || IsFloatResultSubnormal(correct8, float_ulps)) + { + fail = fail + && !(test == 0.0f + && (iErr3 == 0 || iErr4 == 0 + || iErr7 == 0 || iErr8 == 0)); + if (!fail) err = 0.0f; + } + } + } + else if (IsFloatSubnormal(s2[j])) + { + int correct3i, correct4i; + double correct3 = f->func.f_ffpI(s[j], 0.0, &correct3i); + double correct4 = + f->func.f_ffpI(s[j], -0.0, &correct4i); + float err2 = Ulp_Error(test, correct3); + float err3 = Ulp_Error(test, correct4); + int64_t iErr3 = (long long)q2[j] - (long long)correct3i; + int64_t iErr4 = (long long)q2[j] - (long long)correct4i; + fail = fail + && ((!(fabsf(err2) <= float_ulps && iErr3 == 0)) + && (!(fabsf(err3) <= float_ulps + && iErr4 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; + + // retry per section 6.5.3.4 + if (IsFloatResultSubnormal(correct2, float_ulps) + || IsFloatResultSubnormal(correct3, float_ulps)) + { + fail = fail + && !(test == 0.0f + && (iErr3 == 0 || iErr4 == 0)); + if (!fail) err = 0.0f; + } + } + } 
+ if (fabsf(err) > maxError) + { + maxError = fabsf(err); + maxErrorVal = s[j]; + } + if (llabs(iErr) > maxError2) + { + maxError2 = llabs(iErr); + maxErrorVal2 = s[j]; + } + + if (fail) + { + vlog_error( + "\nERROR: %s%s: {%f, %lld} ulp error at {%a, %a} " + "({0x%8.8x, 0x%8.8x}): *{%a, %d} ({0x%8.8x, " + "0x%8.8x}) vs. {%a, %d} ({0x%8.8x, 0x%8.8x})\n", + f->name, sizeNames[k], err, iErr, ((float *)gIn)[j], + ((float *)gIn2)[j], ((cl_uint *)gIn)[j], + ((cl_uint *)gIn2)[j], ((float *)gOut_Ref)[j], + ((int *)gOut_Ref2)[j], ((cl_uint *)gOut_Ref)[j], + ((cl_uint *)gOut_Ref2)[j], test, q2[j], + ((cl_uint *)&test)[0], ((cl_uint *)q2)[j]); + error = -1; + goto exit; + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, + bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + if (gMeasureTimes) + { + // Init input array + cl_uint *p = (cl_uint *)gIn; + for (j = 0; j < bufferSize / sizeof(float); j++) + { + p[j] = genrand_int32(d); + } + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + bufferSize, gIn2, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); + return error; + } + + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeof(cl_float) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, 
sizeof(gOutBuffer2[j]), + &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2), + &gInBuffer2))) + { + LogBuildError(programs[j]); + goto exit; + } + + double sum = 0.0; + double bestTime = INFINITY; + for (k = 0; k < PERF_LOOP_COUNT; k++) + { + uint64_t startTime = GetTime(); + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + + // Make sure OpenCL is done + if ((error = clFinish(gQueue))) + { + vlog_error("Error %d at clFinish\n", error); + goto exit; + } + + uint64_t endTime = GetTime(); + double time = SubtractTime(endTime, startTime); + sum += time; + if (time < bestTime) bestTime = time; + } + + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(float)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", + f->name, sizeNames[j]); + } + } + + if (!gSkipCorrectnessTesting) + vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal); + vlog("\n"); + +exit: + // Release + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + clReleaseKernel(kernels[k]); + clReleaseProgram(programs[k]); + } + + return error; +} diff --git a/test_conformance/math_brute_force/i_unary.cpp b/test_conformance/math_brute_force/i_unary_double.cpp similarity index 52% rename from test_conformance/math_brute_force/i_unary.cpp rename to test_conformance/math_brute_force/i_unary_double.cpp index 9418d44d..8cb863b3 100644 --- a/test_conformance/math_brute_force/i_unary.cpp +++ b/test_conformance/math_brute_force/i_unary_double.cpp @@ -20,84 +20,6 @@ #include -static int BuildKernel(const 
char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) -{ - const char *c[] = { "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global int", - sizeNames[vectorSize], - "* out, __global float", - sizeNames[vectorSize], - "* in)\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", - name, - "( in[i] );\n" - "}\n" }; - - const char *c3[] = { - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global int* out, __global float* in)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " float3 f0 = vload3( 0, in + 3 * i );\n" - " int3 i0 = ", - name, - "( f0 );\n" - " vstore3( i0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two " - "buffer size \n" - " float3 f0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (float3)( in[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " int3 i0 = ", - name, - "( f0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = i0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = i0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); -} - static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, cl_program *p, bool relaxedMode) { @@ -187,15 +109,6 @@ typedef struct BuildKernelInfo bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
} BuildKernelInfo; -static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); -} - static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { @@ -205,259 +118,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, info->programs + i, info->relaxedMode); } -int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode) -{ - uint64_t i; - uint32_t j, k; - int error; - cl_program programs[VECTOR_SIZE_COUNT]; - cl_kernel kernels[VECTOR_SIZE_COUNT]; - int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); - size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE; - uint64_t step = getTestStep(sizeof(float), bufferSize); - int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1); - - logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); - - // This test is not using ThreadPool so we need to disable FTZ here - // for reference computations - FPU_mode_type oldMode; - DisableFTZ(&oldMode); - - Force64BitFPUPrecision(); - - // Init the kernels - { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode, relaxedMode }; - if ((error = ThreadPool_Do(BuildKernel_FloatFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - return error; - } - - for (i = 0; i < (1ULL << 32); i += step) - { - // Init input array - cl_uint *p = (cl_uint *)gIn; - if (gWimpyMode) - { - for (j = 0; j < bufferSize / sizeof(float); j++) - p[j] = (cl_uint)i + j * scale; - } - else - { - for (j = 0; j < bufferSize / sizeof(float); j++) - p[j] = (uint32_t)i + j; - } - - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - bufferSize, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); 
- return error; - } - - // write garbage into output arrays - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - uint32_t pattern = 0xffffdead; - memset_pattern4(gOut[j], &pattern, bufferSize); - if ((error = - clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, - bufferSize, gOut[j], 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", - error, j); - goto exit; - } - } - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - size_t vectorSize = sizeValues[j] * sizeof(cl_float); - size_t localCount = (bufferSize + vectorSize - 1) - / vectorSize; // bufferSize / vectorSize rounded up - if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), - &gOutBuffer[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), - &gInBuffer))) - { - LogBuildError(programs[j]); - goto exit; - } - - if ((error = - clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, - &localCount, NULL, 0, NULL, NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - } - - // Get that moving - if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); - - // Calculate the correctly rounded reference result - int *r = (int *)gOut_Ref; - float *s = (float *)gIn; - for (j = 0; j < bufferSize / sizeof(float); j++) - r[j] = f->func.i_f(s[j]); - - // Read the data back - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - if ((error = - clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, - bufferSize, gOut[j], 0, NULL, NULL))) - { - vlog_error("ReadArray failed %d\n", error); - goto exit; - } - } - - if (gSkipCorrectnessTesting) break; - - // Verify data - uint32_t *t = (uint32_t *)gOut_Ref; - for (j = 0; j < bufferSize / sizeof(float); j++) - { - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - uint32_t *q = (uint32_t *)(gOut[k]); - // If we aren't getting the correctly rounded result - if 
(t[j] != q[j]) - { - if (ftz && IsFloatSubnormal(s[j])) - { - unsigned int correct0 = f->func.i_f(0.0); - unsigned int correct1 = f->func.i_f(-0.0); - if (q[j] == correct0 || q[j] == correct1) continue; - } - - uint32_t err = t[j] - q[j]; - if (q[j] > t[j]) err = q[j] - t[j]; - vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%8.8x): " - "*%d vs. %d\n", - f->name, sizeNames[k], err, ((float *)gIn)[j], - ((cl_uint *)gIn)[j], t[j], q[j]); - error = -1; - goto exit; - } - } - } - - if (0 == (i & 0x0fffffff)) - { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, - bufferSize); - } - else - { - vlog("."); - } - fflush(stdout); - } - } - - if (!gSkipCorrectnessTesting) - { - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - if (gMeasureTimes) - { - // Init input array - uint32_t *p = (uint32_t *)gIn; - for (j = 0; j < bufferSize / sizeof(float); j++) - p[j] = genrand_int32(d); - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - bufferSize, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); - return error; - } - - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - size_t vectorSize = sizeof(cl_float) * sizeValues[j]; - size_t localCount = (bufferSize + vectorSize - 1) - / vectorSize; // bufferSize / vectorSize rounded up - if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), - &gOutBuffer[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), - &gInBuffer))) - { - LogBuildError(programs[j]); - goto exit; - } - - double sum = 0.0; - double bestTime = INFINITY; - for (k = 0; k < PERF_LOOP_COUNT; k++) - { - uint64_t startTime = GetTime(); - if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, - &localCount, NULL, 0, NULL, - NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - - // Make sure OpenCL 
is done - if ((error = clFinish(gQueue))) - { - vlog_error("Error %d at clFinish\n", error); - goto exit; - } - - uint64_t endTime = GetTime(); - double time = SubtractTime(endTime, startTime); - sum += time; - if (time < bestTime) bestTime = time; - } - - if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double)gDeviceFrequency - * gComputeDevices * gSimdSize * 1e6 - / (bufferSize / sizeof(float)); - vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", - f->name, sizeNames[j]); - } - } - - vlog("\n"); - -exit: - RestoreFPState(&oldMode); - // Release - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); - } - - return error; -} - int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode) { uint64_t i; diff --git a/test_conformance/math_brute_force/i_unary_float.cpp b/test_conformance/math_brute_force/i_unary_float.cpp new file mode 100644 index 00000000..feecb54c --- /dev/null +++ b/test_conformance/math_brute_force/i_unary_float.cpp @@ -0,0 +1,370 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + +static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, + cl_program *p, bool relaxedMode) +{ + const char *c[] = { "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global int", + sizeNames[vectorSize], + "* out, __global float", + sizeNames[vectorSize], + "* in)\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i] );\n" + "}\n" }; + + const char *c3[] = { + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global int* out, __global float* in)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " float3 f0 = vload3( 0, in + 3 * i );\n" + " int3 i0 = ", + name, + "( f0 );\n" + " vstore3( i0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two " + "buffer size \n" + " float3 f0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (float3)( in[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " int3 i0 = ", + name, + "( f0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = i0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = i0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); + + return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); +} + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_kernel *kernels; + cl_program *programs; + const char *nameInCode; + bool relaxedMode; // 
Whether to build with -cl-fast-relaxed-math. +} BuildKernelInfo; + +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernel(info->nameInCode, i, info->kernels + i, + info->programs + i, info->relaxedMode); +} + +int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode) +{ + uint64_t i; + uint32_t j, k; + int error; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; + int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); + size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE; + uint64_t step = getTestStep(sizeof(float), bufferSize); + int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1); + + logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); + + // This test is not using ThreadPool so we need to disable FTZ here + // for reference computations + FPU_mode_type oldMode; + DisableFTZ(&oldMode); + + Force64BitFPUPrecision(); + + // Init the kernels + { + BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, + f->nameInCode, relaxedMode }; + if ((error = ThreadPool_Do(BuildKernel_FloatFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + } + + for (i = 0; i < (1ULL << 32); i += step) + { + // Init input array + cl_uint *p = (cl_uint *)gIn; + if (gWimpyMode) + { + for (j = 0; j < bufferSize / sizeof(float); j++) + p[j] = (cl_uint)i + j * scale; + } + else + { + for (j = 0; j < bufferSize / sizeof(float); j++) + p[j] = (uint32_t)i + j; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + // write garbage into output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xffffdead; + 
memset_pattern4(gOut[j], &pattern, bufferSize); + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + goto exit; + } + } + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeValues[j] * sizeof(cl_float); + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); + + // Calculate the correctly rounded reference result + int *r = (int *)gOut_Ref; + float *s = (float *)gIn; + for (j = 0; j < bufferSize / sizeof(float); j++) + r[j] = f->func.i_f(s[j]); + + // Read the data back + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + goto exit; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data + uint32_t *t = (uint32_t *)gOut_Ref; + for (j = 0; j < bufferSize / sizeof(float); j++) + { + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + uint32_t *q = (uint32_t *)(gOut[k]); + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + if (ftz && IsFloatSubnormal(s[j])) + { + unsigned int correct0 = f->func.i_f(0.0); + unsigned int correct1 = f->func.i_f(-0.0); + if (q[j] 
== correct0 || q[j] == correct1) continue; + } + + uint32_t err = t[j] - q[j]; + if (q[j] > t[j]) err = q[j] - t[j]; + vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%8.8x): " + "*%d vs. %d\n", + f->name, sizeNames[k], err, ((float *)gIn)[j], + ((cl_uint *)gIn)[j], t[j], q[j]); + error = -1; + goto exit; + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, + bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + if (gMeasureTimes) + { + // Init input array + uint32_t *p = (uint32_t *)gIn; + for (j = 0; j < bufferSize / sizeof(float); j++) + p[j] = genrand_int32(d); + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeof(cl_float) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + + double sum = 0.0; + double bestTime = INFINITY; + for (k = 0; k < PERF_LOOP_COUNT; k++) + { + uint64_t startTime = GetTime(); + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + + // Make sure OpenCL is done + if ((error = clFinish(gQueue))) + { + vlog_error("Error %d at clFinish\n", error); + goto exit; + } + + uint64_t endTime = GetTime(); + double time = 
SubtractTime(endTime, startTime); + sum += time; + if (time < bestTime) bestTime = time; + } + + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(float)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", + f->name, sizeNames[j]); + } + } + + vlog("\n"); + +exit: + RestoreFPState(&oldMode); + // Release + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + clReleaseKernel(kernels[k]); + clReleaseProgram(programs[k]); + } + + return error; +} diff --git a/test_conformance/math_brute_force/macro_binary.cpp b/test_conformance/math_brute_force/macro_binary_double.cpp similarity index 53% rename from test_conformance/math_brute_force/macro_binary.cpp rename to test_conformance/math_brute_force/macro_binary_double.cpp index fb88e607..9b5d8f24 100644 --- a/test_conformance/math_brute_force/macro_binary.cpp +++ b/test_conformance/math_brute_force/macro_binary_double.cpp @@ -20,91 +20,6 @@ #include -static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, - cl_kernel *k, cl_program *p, bool relaxedMode) -{ - const char *c[] = { "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global int", - sizeNames[vectorSize], - "* out, __global float", - sizeNames[vectorSize], - "* in1, __global float", - sizeNames[vectorSize], - "* in2 )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " out[i] = ", - name, - "( in1[i], in2[i] );\n" - "}\n" }; - - const char *c3[] = { - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global int* out, __global float* in, __global float* in2)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " float3 f0 = vload3( 0, in + 3 * i );\n" - " float3 f1 = vload3( 0, in2 + 3 * i );\n" - " int3 i0 = ", - name, - "( f0, f1 );\n" - " vstore3( i0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t 
parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two " - "buffer size \n" - " float3 f0;\n" - " float3 f1;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (float3)( in[3*i], NAN, NAN ); \n" - " f1 = (float3)( in2[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" - " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " int3 i0 = ", - name, - "( f0, f1 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = i0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = i0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, - relaxedMode); -} - static int BuildKernelDouble(const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p, bool relaxedMode) @@ -203,15 +118,6 @@ typedef struct BuildKernelInfo bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
} BuildKernelInfo; -static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernel_count, - info->kernels[i], info->programs + i, info->relaxedMode); -} - static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { @@ -222,112 +128,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, info->relaxedMode); } -// A table of more difficult cases to get right -static const float specialValuesFloat[] = { - -NAN, - -INFINITY, - -FLT_MAX, - MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), - MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), - MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39), - MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), - MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), - MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38), - MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), - MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), - MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), - MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), - MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), - MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), - -1000.f, - -100.f, - -4.0f, - -3.5f, - -3.0f, - MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), - -2.5f, - MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), - -2.0f, - MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), - -1.5f, - MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24), - MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), - -1.0f, - MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25), - MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), - -0.5f, - MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26), - MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), - -0.25f, - MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27), - MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), - -FLT_MIN, - MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), - 
MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), - MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), - MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), - MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), - MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150), - MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), - MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), - MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), - MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), - -0.0f, - - +NAN, - +INFINITY, - +FLT_MAX, - MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), - MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), - MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), - MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), - MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), - MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38), - MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), - MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), - MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), - MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), - MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), - MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), - +1000.f, - +100.f, - +4.0f, - +3.5f, - +3.0f, - MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), - 2.5f, - MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23), - +2.0f, - MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), - 1.5f, - MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), - MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), - +1.0f, - MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25), - MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), - +0.5f, - MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), - MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), - +0.25f, - MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27), - MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), - +FLT_MIN, - MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), - MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), - MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), - 
MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), - MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), - MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150), - MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), - MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), - MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), - MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), - +0.0f -}; - -static const size_t specialValuesFloatCount = - sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]); - // Thread specific data for a worker thread typedef struct ThreadInfo { @@ -356,579 +156,6 @@ typedef struct TestInfo } TestInfo; -static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p); - -int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode) -{ - TestInfo test_info; - cl_int error; - size_t i, j; - - logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); - - // Init test_info - memset(&test_info, 0, sizeof(test_info)); - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_float)); - - if (gWimpyMode) - { - test_info.subBufferSize = gWimpyBufferSize - / (sizeof(cl_float) - * RoundUpToNextPowerOfTwo(test_info.threadCount)); - } - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ftz = - f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); - - // cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if 
(NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: Unable to allocate storage for thread specific data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); - for (i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { - i * test_info.subBufferSize * sizeof(cl_float), - test_info.subBufferSize * sizeof(cl_float) - }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - test_info.tinfo[i].inBuf2 = - clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if (error || NULL == test_info.tinfo[i].inBuf2) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - &region, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of " - "gInBuffer for region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); - goto exit; - } - - test_info.tinfo[i].d = init_genrand(genrand_int32(d)); - } - - // Init the kernels - { - BuildKernelInfo build_info = { - gMinVectorSizeIndex, test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode, relaxedMode - }; - if ((error = ThreadPool_Do(BuildKernel_FloatFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - goto exit; - } - - // Run the kernels - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info); - - if (error) goto exit; - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - if (gMeasureTimes) - { - // Init input arrays - cl_uint *p = (cl_uint *)gIn; - cl_uint *p2 = (cl_uint *)gIn2; - for (j = 0; j < BUFFER_SIZE / sizeof(float); j++) - { - p[j] = genrand_int32(d); - p2[j] = genrand_int32(d); - } - - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - BUFFER_SIZE, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); - return error; - } - - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, - BUFFER_SIZE, gIn2, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); - return error; - } - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - size_t vectorSize = sizeof(cl_float) * sizeValues[j]; - size_t localCount = (BUFFER_SIZE + vectorSize - 1) - / vectorSize; // BUFFER_SIZE / vectorSize rounded up - if ((error = clSetKernelArg(test_info.k[j][0], 0, - sizeof(gOutBuffer[j]), &gOutBuffer[j]))) - { - LogBuildError(test_info.programs[j]); - goto exit; - } - if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), - &gInBuffer))) - { - LogBuildError(test_info.programs[j]); - goto exit; - } - if ((error = clSetKernelArg(test_info.k[j][0], 2, - sizeof(gInBuffer2), &gInBuffer2))) - { - LogBuildError(test_info.programs[j]); - goto exit; - } - - double sum = 0.0; - double 
bestTime = INFINITY; - for (i = 0; i < PERF_LOOP_COUNT; i++) - { - uint64_t startTime = GetTime(); - if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], - 1, NULL, &localCount, NULL, - 0, NULL, NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - - // Make sure OpenCL is done - if ((error = clFinish(gQueue))) - { - vlog_error("Error %d at clFinish\n", error); - goto exit; - } - - uint64_t endTime = GetTime(); - double time = SubtractTime(endTime, startTime); - sum += time; - if (time < bestTime) bestTime = time; - } - - if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double)gDeviceFrequency - * gComputeDevices * gSimdSize * 1e6 - / (BUFFER_SIZE / sizeof(float)); - vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", - f->name, sizeNames[j]); - } - } - - vlog("\n"); - -exit: - // Release - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (i = 0; i < test_info.threadCount; i++) - { - free_mtdata(test_info.tinfo[i].d); - clReleaseMemObject(test_info.tinfo[i].inBuf); - clReleaseMemObject(test_info.tinfo[i].inBuf2); - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } - - return error; -} - -static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data) -{ - const TestInfo *job = (const TestInfo *)data; - size_t buffer_elements = job->subBufferSize; - size_t buffer_size = buffer_elements * sizeof(cl_float); - cl_uint base = job_id * (cl_uint)job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; - fptr func = job->f->func; - int ftz = job->ftz; - MTdata d = tinfo->d; - cl_uint j, 
k; - cl_int error; - const char *name = job->f->name; - cl_int *t = 0; - cl_int *r = 0; - cl_float *s = 0; - cl_float *s2 = 0; - - // start the map of the output arrays - cl_event e[VECTOR_SIZE_COUNT]; - cl_int *out[VECTOR_SIZE_COUNT]; - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - out[j] = (cl_int *)clEnqueueMapBuffer( - tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, - buffer_size, 0, NULL, e + j, &error); - if (error || NULL == out[j]) - { - vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, - error); - return error; - } - } - - // Get that moving - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); - - // Init input array - cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements; - cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements; - j = 0; - - int totalSpecialValueCount = - specialValuesFloatCount * specialValuesFloatCount; - int indx = (totalSpecialValueCount - 1) / buffer_elements; - - if (job_id <= (cl_uint)indx) - { // test edge cases - float *fp = (float *)p; - float *fp2 = (float *)p2; - uint32_t x, y; - - x = (job_id * buffer_elements) % specialValuesFloatCount; - y = (job_id * buffer_elements) / specialValuesFloatCount; - - for (; j < buffer_elements; j++) - { - fp[j] = specialValuesFloat[x]; - fp2[j] = specialValuesFloat[y]; - ++x; - if (x >= specialValuesFloatCount) - { - x = 0; - y++; - if (y >= specialValuesFloatCount) break; - } - } - } - - // Init any remaining values. - for (; j < buffer_elements; j++) - { - p[j] = genrand_int32(d); - p2[j] = genrand_int32(d); - } - - if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, - buffer_size, p, 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); - goto exit; - } - - if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, - buffer_size, p2, 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueWriteBuffer failed! 
err: %d\n", error); - goto exit; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - // Wait for the map to finish - if ((error = clWaitForEvents(1, e + j))) - { - vlog_error("Error: clWaitForEvents failed! err: %d\n", error); - goto exit; - } - if ((error = clReleaseEvent(e[j]))) - { - vlog_error("Error: clReleaseEvent failed! err: %d\n", error); - goto exit; - } - - // Fill the result buffer with garbage, so that old results don't carry - // over - uint32_t pattern = 0xffffdead; - memset_pattern4(out[j], &pattern, buffer_size); - if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], - out[j], 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); - goto exit; - } - - // run the kernel - size_t vectorCount = - (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; - cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its - // own copy of the cl_kernel - cl_program program = job->programs[j]; - - if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), - &tinfo->outBuf[j]))) - { - LogBuildError(program); - return error; - } - if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), - &tinfo->inBuf))) - { - LogBuildError(program); - return error; - } - if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), - &tinfo->inBuf2))) - { - LogBuildError(program); - return error; - } - - if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, - &vectorCount, NULL, 0, NULL, NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - } - - // Get that moving - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); - - if (gSkipCorrectnessTesting) return CL_SUCCESS; - - // Calculate the correctly rounded reference result - r = (cl_int *)gOut_Ref + thread_id * buffer_elements; - s = (float *)gIn + thread_id * buffer_elements; - s2 = (float *)gIn2 + thread_id * buffer_elements; - for (j = 0; j < buffer_elements; j++) r[j] = 
func.i_ff(s[j], s2[j]); - - - // Read the data back -- no need to wait for the first N-1 buffers. This is - // an in order queue. - for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) - { - out[j] = (cl_int *)clEnqueueMapBuffer( - tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, - buffer_size, 0, NULL, NULL, &error); - if (error || NULL == out[j]) - { - vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, - error); - goto exit; - } - } - - // Wait for the last buffer - out[j] = (cl_int *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], - CL_TRUE, CL_MAP_READ, 0, buffer_size, - 0, NULL, NULL, &error); - if (error || NULL == out[j]) - { - vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); - goto exit; - } - - // Verify data - t = (cl_int *)r; - for (j = 0; j < buffer_elements; j++) - { - cl_int *q = out[0]; - - if (gMinVectorSizeIndex == 0 && t[j] != q[j]) - { - if (ftz) - { - if (IsFloatSubnormal(s[j])) - { - if (IsFloatSubnormal(s2[j])) - { - int correct = func.i_ff(0.0f, 0.0f); - int correct2 = func.i_ff(0.0f, -0.0f); - int correct3 = func.i_ff(-0.0f, 0.0f); - int correct4 = func.i_ff(-0.0f, -0.0f); - - if (correct == q[j] || correct2 == q[j] - || correct3 == q[j] || correct4 == q[j]) - continue; - } - else - { - int correct = func.i_ff(0.0f, s2[j]); - int correct2 = func.i_ff(-0.0f, s2[j]); - if (correct == q[j] || correct2 == q[j]) continue; - } - } - else if (IsFloatSubnormal(s2[j])) - { - int correct = func.i_ff(s[j], 0.0f); - int correct2 = func.i_ff(s[j], -0.0f); - if (correct == q[j] || correct2 == q[j]) continue; - } - } - - uint32_t err = t[j] - q[j]; - if (q[j] > t[j]) err = q[j] - t[j]; - vlog_error("\nERROR: %s: %d ulp error at {%a, %a}: *0x%8.8x vs. 
" - "0x%8.8x (index: %d)\n", - name, err, ((float *)s)[j], ((float *)s2)[j], t[j], q[j], - j); - error = -1; - goto exit; - } - - for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++) - { - q = out[k]; - // If we aren't getting the correctly rounded result - if (-t[j] != q[j]) - { - if (ftz) - { - if (IsFloatSubnormal(s[j])) - { - if (IsFloatSubnormal(s2[j])) - { - int correct = -func.i_ff(0.0f, 0.0f); - int correct2 = -func.i_ff(0.0f, -0.0f); - int correct3 = -func.i_ff(-0.0f, 0.0f); - int correct4 = -func.i_ff(-0.0f, -0.0f); - - if (correct == q[j] || correct2 == q[j] - || correct3 == q[j] || correct4 == q[j]) - continue; - } - else - { - int correct = -func.i_ff(0.0f, s2[j]); - int correct2 = -func.i_ff(-0.0f, s2[j]); - if (correct == q[j] || correct2 == q[j]) continue; - } - } - else if (IsFloatSubnormal(s2[j])) - { - int correct = -func.i_ff(s[j], 0.0f); - int correct2 = -func.i_ff(s[j], -0.0f); - if (correct == q[j] || correct2 == q[j]) continue; - } - } - cl_uint err = -t[j] - q[j]; - if (q[j] > -t[j]) err = q[j] + t[j]; - vlog_error("\nERROR: %s%s: %d ulp error at {%a, %a}: *0x%8.8x " - "vs. 0x%8.8x (index: %d)\n", - name, sizeNames[k], err, ((float *)s)[j], - ((float *)s2)[j], -t[j], q[j], j); - error = -1; - goto exit; - } - } - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], - out[j], 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! 
err: %d\n", - j, error); - return error; - } - } - - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); - - - if (0 == (base & 0x0fffffff)) - { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd " - "ThreadCount:%2u\n", - base, job->step, job->scale, buffer_elements, - job->threadCount); - } - else - { - vlog("."); - } - fflush(stdout); - } - -exit: - return error; -} - // A table of more difficult cases to get right static const double specialValuesDouble[] = { -NAN, diff --git a/test_conformance/math_brute_force/macro_binary_float.cpp b/test_conformance/math_brute_force/macro_binary_float.cpp new file mode 100644 index 00000000..ece96037 --- /dev/null +++ b/test_conformance/math_brute_force/macro_binary_float.cpp @@ -0,0 +1,832 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + +static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, + cl_kernel *k, cl_program *p, bool relaxedMode) +{ + const char *c[] = { "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global int", + sizeNames[vectorSize], + "* out, __global float", + sizeNames[vectorSize], + "* in1, __global float", + sizeNames[vectorSize], + "* in2 )\n" + "{\n" + " size_t i = get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i] );\n" + "}\n" }; + + const char *c3[] = { + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global int* out, __global float* in, __global float* in2)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " float3 f0 = vload3( 0, in + 3 * i );\n" + " float3 f1 = vload3( 0, in2 + 3 * i );\n" + " int3 i0 = ", + name, + "( f0, f1 );\n" + " vstore3( i0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " float3 f0;\n" + " float3 f1;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (float3)( in[3*i], NAN, NAN ); \n" + " f1 = (float3)( in2[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" + " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " int3 i0 = ", + name, + "( f0, f1 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = i0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = i0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); + + return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, + relaxedMode); +} + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_uint kernel_count; + cl_kernel **kernels; + cl_program *programs; + const char *nameInCode; + bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
+} BuildKernelInfo; + +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernel(info->nameInCode, i, info->kernel_count, + info->kernels[i], info->programs + i, info->relaxedMode); +} + +// A table of more difficult cases to get right +static const float specialValuesFloat[] = { + -NAN, + -INFINITY, + -FLT_MAX, + MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), + MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), + MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39), + MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), + MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), + MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38), + MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), + MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), + MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), + MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), + MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), + MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), + -1000.f, + -100.f, + -4.0f, + -3.5f, + -3.0f, + MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), + -2.5f, + MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), + -2.0f, + MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), + -1.5f, + MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24), + MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), + -1.0f, + MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25), + MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), + -0.5f, + MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26), + MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), + -0.25f, + MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27), + MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), + -FLT_MIN, + MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), + MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), + MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), + MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), + MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, 
-150), + MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150), + MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), + MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), + MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), + MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), + -0.0f, + + +NAN, + +INFINITY, + +FLT_MAX, + MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), + MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), + MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), + MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), + MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), + MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38), + MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), + MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), + MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), + MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), + MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), + MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), + +1000.f, + +100.f, + +4.0f, + +3.5f, + +3.0f, + MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), + 2.5f, + MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23), + +2.0f, + MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), + 1.5f, + MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), + MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), + +1.0f, + MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25), + MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), + +0.5f, + MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), + MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), + +0.25f, + MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27), + MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), + +FLT_MIN, + MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), + MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), + MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), + MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), + MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), + MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150), + MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), + 
MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), + MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), + MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), + +0.0f +}; + +static const size_t specialValuesFloatCount = + sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]); + +// Thread specific data for a worker thread +typedef struct ThreadInfo +{ + cl_mem inBuf; // input buffer for the thread + cl_mem inBuf2; // input buffer for the thread + cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + MTdata d; + cl_command_queue tQueue; // per thread command queue to improve performance +} ThreadInfo; + +typedef struct TestInfo +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes + cl_kernel + *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each + // worker thread: k[vector_size][thread_id] + ThreadInfo * + tinfo; // An array of thread specific information for each worker thread + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. 
+ cl_uint scale; // stride between individual test values + int ftz; // non-zero if running in flush to zero mode + +} TestInfo; + +static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p); + +int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info; + cl_int error; + size_t i, j; + + logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); + + // Init test_info + memset(&test_info, 0, sizeof(test_info)); + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_float)); + + if (gWimpyMode) + { + test_info.subBufferSize = gWimpyBufferSize + / (sizeof(cl_float) + * RoundUpToNextPowerOfTwo(test_info.threadCount)); + } + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); + + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel *)malloc(array_size); + if (NULL == test_info.k[i]) + { + vlog_error("Error: Unable to allocate storage for kernels!\n"); + error = CL_OUT_OF_HOST_MEMORY; + goto exit; + } + memset(test_info.k[i], 0, array_size); + } + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) + { + vlog_error( + "Error: Unable to allocate storage for thread specific data.\n"); + error = CL_OUT_OF_HOST_MEMORY; + goto exit; + } + memset(test_info.tinfo, 0, + test_info.threadCount * 
sizeof(*test_info.tinfo)); + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_float), + test_info.subBufferSize * sizeof(cl_float) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of " + "gInBuffer for region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); + goto exit; + } + + test_info.tinfo[i].d = init_genrand(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info = { + gMinVectorSizeIndex, test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode, relaxedMode + }; + if ((error = ThreadPool_Do(BuildKernel_FloatFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + + // Run the kernels + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info); + + if (error) goto exit; + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + if (gMeasureTimes) + { + // Init input arrays + cl_uint *p = (cl_uint *)gIn; + cl_uint *p2 = (cl_uint *)gIn2; + for (j = 0; j < BUFFER_SIZE / sizeof(float); j++) + { + p[j] = genrand_int32(d); + p2[j] = genrand_int32(d); + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + BUFFER_SIZE, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + BUFFER_SIZE, gIn2, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); + return error; + } + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeof(cl_float) * sizeValues[j]; + size_t localCount = (BUFFER_SIZE + vectorSize - 1) + / vectorSize; // BUFFER_SIZE / vectorSize rounded up + if ((error = clSetKernelArg(test_info.k[j][0], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 2, + sizeof(gInBuffer2), &gInBuffer2))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + + double sum = 0.0; + double 
bestTime = INFINITY; + for (i = 0; i < PERF_LOOP_COUNT; i++) + { + uint64_t startTime = GetTime(); + if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], + 1, NULL, &localCount, NULL, + 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + + // Make sure OpenCL is done + if ((error = clFinish(gQueue))) + { + vlog_error("Error %d at clFinish\n", error); + goto exit; + } + + uint64_t endTime = GetTime(); + double time = SubtractTime(endTime, startTime); + sum += time; + if (time < bestTime) bestTime = time; + } + + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (BUFFER_SIZE / sizeof(float)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", + f->name, sizeNames[j]); + } + } + + vlog("\n"); + +exit: + // Release + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + clReleaseProgram(test_info.programs[i]); + if (test_info.k[i]) + { + for (j = 0; j < test_info.threadCount; j++) + clReleaseKernel(test_info.k[i][j]); + + free(test_info.k[i]); + } + } + if (test_info.tinfo) + { + for (i = 0; i < test_info.threadCount; i++) + { + free_mtdata(test_info.tinfo[i].d); + clReleaseMemObject(test_info.tinfo[i].inBuf); + clReleaseMemObject(test_info.tinfo[i].inBuf2); + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + clReleaseMemObject(test_info.tinfo[i].outBuf[j]); + clReleaseCommandQueue(test_info.tinfo[i].tQueue); + } + + free(test_info.tinfo); + } + + return error; +} + +static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data) +{ + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_float); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = job->tinfo + thread_id; + fptr func = job->f->func; + int ftz = job->ftz; + MTdata d = tinfo->d; + cl_uint j, 
k; + cl_int error; + const char *name = job->f->name; + cl_int *t = 0; + cl_int *r = 0; + cl_float *s = 0; + cl_float *s2 = 0; + + // start the map of the output arrays + cl_event e[VECTOR_SIZE_COUNT]; + cl_int *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_int *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + + // Init input array + cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements; + cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements; + j = 0; + + int totalSpecialValueCount = + specialValuesFloatCount * specialValuesFloatCount; + int indx = (totalSpecialValueCount - 1) / buffer_elements; + + if (job_id <= (cl_uint)indx) + { // test edge cases + float *fp = (float *)p; + float *fp2 = (float *)p2; + uint32_t x, y; + + x = (job_id * buffer_elements) % specialValuesFloatCount; + y = (job_id * buffer_elements) / specialValuesFloatCount; + + for (; j < buffer_elements; j++) + { + fp[j] = specialValuesFloat[x]; + fp2[j] = specialValuesFloat[y]; + ++x; + if (x >= specialValuesFloatCount) + { + x = 0; + y++; + if (y >= specialValuesFloatCount) break; + } + } + } + + // Init any remaining values. + for (; j < buffer_elements; j++) + { + p[j] = genrand_int32(d); + p2[j] = genrand_int32(d); + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + goto exit; + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size, p2, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! 
err: %d\n", error); + goto exit; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + goto exit; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + goto exit; + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint32_t pattern = 0xffffdead; + memset_pattern4(out[j], &pattern, buffer_size); + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + goto exit; + } + + // run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) return CL_SUCCESS; + + // Calculate the correctly rounded reference result + r = (cl_int *)gOut_Ref + thread_id * buffer_elements; + s = (float *)gIn + thread_id * buffer_elements; + s2 = (float *)gIn2 + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) r[j] = 
func.i_ff(s[j], s2[j]); + + + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_int *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + goto exit; + } + } + + // Wait for the last buffer + out[j] = (cl_int *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, buffer_size, + 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); + goto exit; + } + + // Verify data + t = (cl_int *)r; + for (j = 0; j < buffer_elements; j++) + { + cl_int *q = out[0]; + + if (gMinVectorSizeIndex == 0 && t[j] != q[j]) + { + if (ftz) + { + if (IsFloatSubnormal(s[j])) + { + if (IsFloatSubnormal(s2[j])) + { + int correct = func.i_ff(0.0f, 0.0f); + int correct2 = func.i_ff(0.0f, -0.0f); + int correct3 = func.i_ff(-0.0f, 0.0f); + int correct4 = func.i_ff(-0.0f, -0.0f); + + if (correct == q[j] || correct2 == q[j] + || correct3 == q[j] || correct4 == q[j]) + continue; + } + else + { + int correct = func.i_ff(0.0f, s2[j]); + int correct2 = func.i_ff(-0.0f, s2[j]); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + else if (IsFloatSubnormal(s2[j])) + { + int correct = func.i_ff(s[j], 0.0f); + int correct2 = func.i_ff(s[j], -0.0f); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + + uint32_t err = t[j] - q[j]; + if (q[j] > t[j]) err = q[j] - t[j]; + vlog_error("\nERROR: %s: %d ulp error at {%a, %a}: *0x%8.8x vs. 
" + "0x%8.8x (index: %d)\n", + name, err, ((float *)s)[j], ((float *)s2)[j], t[j], q[j], + j); + error = -1; + goto exit; + } + + for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++) + { + q = out[k]; + // If we aren't getting the correctly rounded result + if (-t[j] != q[j]) + { + if (ftz) + { + if (IsFloatSubnormal(s[j])) + { + if (IsFloatSubnormal(s2[j])) + { + int correct = -func.i_ff(0.0f, 0.0f); + int correct2 = -func.i_ff(0.0f, -0.0f); + int correct3 = -func.i_ff(-0.0f, 0.0f); + int correct4 = -func.i_ff(-0.0f, -0.0f); + + if (correct == q[j] || correct2 == q[j] + || correct3 == q[j] || correct4 == q[j]) + continue; + } + else + { + int correct = -func.i_ff(0.0f, s2[j]); + int correct2 = -func.i_ff(-0.0f, s2[j]); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + else if (IsFloatSubnormal(s2[j])) + { + int correct = -func.i_ff(s[j], 0.0f); + int correct2 = -func.i_ff(s[j], -0.0f); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + cl_uint err = -t[j] - q[j]; + if (q[j] > -t[j]) err = q[j] + t[j]; + vlog_error("\nERROR: %s%s: %d ulp error at {%a, %a}: *0x%8.8x " + "vs. 0x%8.8x (index: %d)\n", + name, sizeNames[k], err, ((float *)s)[j], + ((float *)s2)[j], -t[j], q[j], j); + error = -1; + goto exit; + } + } + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! 
err: %d\n", + j, error); + return error; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } + +exit: + return error; +} diff --git a/test_conformance/math_brute_force/macro_unary_double.cpp b/test_conformance/math_brute_force/macro_unary_double.cpp new file mode 100644 index 00000000..8d80abb4 --- /dev/null +++ b/test_conformance/math_brute_force/macro_unary_double.cpp @@ -0,0 +1,598 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + +static int BuildKernelDouble(const char *name, int vectorSize, + cl_uint kernel_count, cl_kernel *k, cl_program *p, + bool relaxedMode) +{ + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global long", + sizeNames[vectorSize], + "* out, __global double", + sizeNames[vectorSize], + "* in )\n" + "{\n" + " size_t i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i] );\n" + "}\n" }; + + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global long* out, __global double* in)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " double3 d0 = vload3( 0, in + 3 * i );\n" + " long3 l0 = ", + name, + "( d0 );\n" + " vstore3( l0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " double3 d0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " d0 = (double3)( in[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " long3 l0 = ", + name, + "( d0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = l0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = l0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); + + return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, + relaxedMode); +} + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_uint kernel_count; + cl_kernel **kernels; + cl_program *programs; + const char *nameInCode; + bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
+} BuildKernelInfo; + +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernelDouble(info->nameInCode, i, info->kernel_count, + info->kernels[i], info->programs + i, + info->relaxedMode); +} + +// Thread specific data for a worker thread +typedef struct ThreadInfo +{ + cl_mem inBuf; // input buffer for the thread + cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + cl_command_queue tQueue; // per thread command queue to improve performance +} ThreadInfo; + +typedef struct TestInfo +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes + cl_kernel + *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each + // worker thread: k[vector_size][thread_id] + ThreadInfo * + tinfo; // An array of thread specific information for each worker thread + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. 
+ cl_uint scale; // stride between individual test values + int ftz; // non-zero if running in flush to zero mode + +} TestInfo; + +static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data); + +int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info; + cl_int error; + size_t i, j; + + logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); + + // Init test_info + memset(&test_info, 0, sizeof(test_info)); + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_double)); + + if (gWimpyMode) + { + test_info.subBufferSize = gWimpyBufferSize + / (sizeof(cl_double) + * RoundUpToNextPowerOfTwo(test_info.threadCount)); + } + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ftz = f->ftz || gForceFTZ; + + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel *)malloc(array_size); + if (NULL == test_info.k[i]) + { + vlog_error("Error: Unable to allocate storage for kernels!\n"); + error = CL_OUT_OF_HOST_MEMORY; + goto exit; + } + memset(test_info.k[i], 0, array_size); + } + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) + { + vlog_error( + "Error: Unable to allocate storage for thread specific data.\n"); + error = CL_OUT_OF_HOST_MEMORY; + goto exit; + } + memset(test_info.tinfo, 0, + test_info.threadCount * sizeof(*test_info.tinfo)); + for (i = 0; i < 
test_info.threadCount; i++) + { + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_double), + test_info.subBufferSize * sizeof(cl_double) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); + goto exit; + } + } + + // Init the kernels + { + BuildKernelInfo build_info = { + gMinVectorSizeIndex, test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode, relaxedMode + }; + if ((error = ThreadPool_Do(BuildKernel_DoubleFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + + // Run the kernels + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info); + + if (error) goto exit; + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + if (gMeasureTimes) + { + // Init input array + cl_ulong *p = (cl_ulong *)gIn; + for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++) + p[j] = DoubleFromUInt32(genrand_int32(d)); + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + BUFFER_SIZE, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeof(cl_double) * sizeValues[j]; + size_t localCount = (BUFFER_SIZE + vectorSize - 1) + / vectorSize; // BUFFER_SIZE / vectorSize rounded up + if ((error = clSetKernelArg(test_info.k[j][0], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + + double sum = 0.0; + double bestTime = INFINITY; + for (i = 0; i < PERF_LOOP_COUNT; i++) + { + uint64_t startTime = GetTime(); + if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], + 1, NULL, &localCount, NULL, + 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + + // Make sure OpenCL is done + if ((error = clFinish(gQueue))) + { + vlog_error("Error %d at clFinish\n", error); + goto exit; + } + + uint64_t endTime = GetTime(); + 
double time = SubtractTime(endTime, startTime); + sum += time; + if (time < bestTime) bestTime = time; + } + + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (BUFFER_SIZE / sizeof(double)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", + f->name, sizeNames[j]); + } + for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); + } + + vlog("\n"); + +exit: + // Release + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + clReleaseProgram(test_info.programs[i]); + if (test_info.k[i]) + { + for (j = 0; j < test_info.threadCount; j++) + clReleaseKernel(test_info.k[i][j]); + + free(test_info.k[i]); + } + } + if (test_info.tinfo) + { + for (i = 0; i < test_info.threadCount; i++) + { + clReleaseMemObject(test_info.tinfo[i].inBuf); + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + clReleaseMemObject(test_info.tinfo[i].outBuf[j]); + clReleaseCommandQueue(test_info.tinfo[i].tQueue); + } + + free(test_info.tinfo); + } + + return error; +} + +static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data) +{ + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_double); + cl_uint scale = job->scale; + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = job->tinfo + thread_id; + dptr dfunc = job->f->dfunc; + int ftz = job->ftz; + cl_uint j, k; + cl_int error; + const char *name = job->f->name; + + Force64BitFPUPrecision(); + + // start the map of the output arrays + cl_event e[VECTOR_SIZE_COUNT]; + cl_long *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_long *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer 
%d failed! err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + + // Write the new values to the input array + cl_double *p = (cl_double *)gIn + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) + p[j] = DoubleFromUInt32(base + j * scale); + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint32_t pattern = 0xffffdead; + memset_pattern4(out[j], &pattern, buffer_size); + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueMapBuffer failed! 
err: %d\n", error); + return error; + } + + // run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) return CL_SUCCESS; + + // Calculate the correctly rounded reference result + cl_long *r = (cl_long *)gOut_Ref + thread_id * buffer_elements; + cl_double *s = (cl_double *)p; + for (j = 0; j < buffer_elements; j++) r[j] = dfunc.i_f(s[j]); + + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_long *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Wait for the last buffer + out[j] = (cl_long *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, buffer_size, + 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, error); + return error; + } + + // Verify data + cl_long *t = (cl_long *)r; + for (j = 0; j < buffer_elements; j++) + { + cl_long *q = out[0]; + + // If we aren't getting the correctly rounded result + if (gMinVectorSizeIndex == 0 && t[j] != q[j]) + { + // If we aren't getting the correctly rounded result + if (ftz) + { + if (IsDoubleSubnormal(s[j])) + { + cl_long correct = dfunc.i_f(+0.0f); + cl_long correct2 = dfunc.i_f(-0.0f); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + + cl_ulong err = t[j] - q[j]; + if (q[j] > t[j]) err = q[j] - t[j]; + vlog_error("\nERROR: %sD: %zd ulp error at %.13la: *%zd vs. %zd\n", + name, err, ((double *)gIn)[j], t[j], q[j]); + return -1; + } + + + for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++) + { + q = out[k]; + // If we aren't getting the correctly rounded result + if (-t[j] != q[j]) + { + if (ftz) + { + if (IsDoubleSubnormal(s[j])) + { + int64_t correct = -dfunc.i_f(+0.0f); + int64_t correct2 = -dfunc.i_f(-0.0f); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + + cl_ulong err = -t[j] - q[j]; + if (q[j] > -t[j]) err = q[j] + t[j]; + vlog_error( + "\nERROR: %sD%s: %zd ulp error at %.13la: *%zd vs. %zd\n", + name, sizeNames[k], err, ((double *)gIn)[j], -t[j], q[j]); + return -1; + } + } + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! 
err: %d\n", + j, error); + return error; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } + + return CL_SUCCESS; +} diff --git a/test_conformance/math_brute_force/macro_unary.cpp b/test_conformance/math_brute_force/macro_unary_float.cpp similarity index 52% rename from test_conformance/math_brute_force/macro_unary.cpp rename to test_conformance/math_brute_force/macro_unary_float.cpp index e5aa9e70..2a37c95b 100644 --- a/test_conformance/math_brute_force/macro_unary.cpp +++ b/test_conformance/math_brute_force/macro_unary_float.cpp @@ -100,88 +100,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, relaxedMode); } -static int BuildKernelDouble(const char *name, int vectorSize, - cl_uint kernel_count, cl_kernel *k, cl_program *p, - bool relaxedMode) -{ - const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global long", - sizeNames[vectorSize], - "* out, __global double", - sizeNames[vectorSize], - "* in )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " out[i] = ", - name, - "( in[i] );\n" - "}\n" }; - - const char *c3[] = { - "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global long* out, __global double* in)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " double3 d0 = vload3( 0, in + 3 * i );\n" - " long3 l0 = ", - name, - "( d0 );\n" - " vstore3( l0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " - "buffer size \n" - " double3 d0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " d0 = (double3)( in[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " long3 l0 = ", - name, - "( d0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = l0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = l0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, - relaxedMode); -} - typedef struct BuildKernelInfo { cl_uint offset; // the first vector size to build @@ -201,16 +119,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, info->kernels[i], info->programs + i, info->relaxedMode); } -static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernelDouble(info->nameInCode, i, info->kernel_count, - info->kernels[i], info->programs + i, - info->relaxedMode); -} - // Thread specific data for a worker thread typedef struct ThreadInfo { @@ -699,452 +607,3 @@ exit: return ret; } - -static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data); - -int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode) -{ - TestInfo test_info; - cl_int error; - size_t i, j; - - logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); - - // Init test_info - memset(&test_info, 0, sizeof(test_info)); - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_double) * 
RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_double)); - - if (gWimpyMode) - { - test_info.subBufferSize = gWimpyBufferSize - / (sizeof(cl_double) - * RoundUpToNextPowerOfTwo(test_info.threadCount)); - } - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ftz = f->ftz || gForceFTZ; - - // cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: Unable to allocate storage for thread specific data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); - for (i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { - i * test_info.subBufferSize * sizeof(cl_double), - test_info.subBufferSize * sizeof(cl_double) - }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - 
test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - ®ion, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer " - "for region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. (%d)\n", error); - goto exit; - } - } - - // Init the kernels - { - BuildKernelInfo build_info = { - gMinVectorSizeIndex, test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode, relaxedMode - }; - if ((error = ThreadPool_Do(BuildKernel_DoubleFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - goto exit; - } - - // Run the kernels - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info); - - if (error) goto exit; - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - if (gMeasureTimes) - { - // Init input array - cl_ulong *p = (cl_ulong *)gIn; - for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++) - p[j] = DoubleFromUInt32(genrand_int32(d)); - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - BUFFER_SIZE, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); - return error; - } - - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - size_t vectorSize = sizeof(cl_double) * sizeValues[j]; - size_t localCount = (BUFFER_SIZE + vectorSize - 1) - / vectorSize; // BUFFER_SIZE / vectorSize rounded up - if ((error = clSetKernelArg(test_info.k[j][0], 0, - sizeof(gOutBuffer[j]), &gOutBuffer[j]))) - { - LogBuildError(test_info.programs[j]); - goto exit; - } - if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), - &gInBuffer))) - { - 
LogBuildError(test_info.programs[j]); - goto exit; - } - - double sum = 0.0; - double bestTime = INFINITY; - for (i = 0; i < PERF_LOOP_COUNT; i++) - { - uint64_t startTime = GetTime(); - if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], - 1, NULL, &localCount, NULL, - 0, NULL, NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - - // Make sure OpenCL is done - if ((error = clFinish(gQueue))) - { - vlog_error("Error %d at clFinish\n", error); - goto exit; - } - - uint64_t endTime = GetTime(); - double time = SubtractTime(endTime, startTime); - sum += time; - if (time < bestTime) bestTime = time; - } - - if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double)gDeviceFrequency - * gComputeDevices * gSimdSize * 1e6 - / (BUFFER_SIZE / sizeof(double)); - vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", - f->name, sizeNames[j]); - } - for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); - } - - vlog("\n"); - -exit: - // Release - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (i = 0; i < test_info.threadCount; i++) - { - clReleaseMemObject(test_info.tinfo[i].inBuf); - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } - - return error; -} - -static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data) -{ - const TestInfo *job = (const TestInfo *)data; - size_t buffer_elements = job->subBufferSize; - size_t buffer_size = buffer_elements * sizeof(cl_double); - cl_uint scale = job->scale; - cl_uint base = job_id * (cl_uint)job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; 
- dptr dfunc = job->f->dfunc; - int ftz = job->ftz; - cl_uint j, k; - cl_int error; - const char *name = job->f->name; - - Force64BitFPUPrecision(); - - // start the map of the output arrays - cl_event e[VECTOR_SIZE_COUNT]; - cl_long *out[VECTOR_SIZE_COUNT]; - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - out[j] = (cl_long *)clEnqueueMapBuffer( - tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, - buffer_size, 0, NULL, e + j, &error); - if (error || NULL == out[j]) - { - vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, - error); - return error; - } - } - - // Get that moving - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); - - // Write the new values to the input array - cl_double *p = (cl_double *)gIn + thread_id * buffer_elements; - for (j = 0; j < buffer_elements; j++) - p[j] = DoubleFromUInt32(base + j * scale); - - if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, - buffer_size, p, 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); - return error; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - // Wait for the map to finish - if ((error = clWaitForEvents(1, e + j))) - { - vlog_error("Error: clWaitForEvents failed! err: %d\n", error); - return error; - } - if ((error = clReleaseEvent(e[j]))) - { - vlog_error("Error: clReleaseEvent failed! err: %d\n", error); - return error; - } - - // Fill the result buffer with garbage, so that old results don't carry - // over - uint32_t pattern = 0xffffdead; - memset_pattern4(out[j], &pattern, buffer_size); - if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], - out[j], 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueMapBuffer failed! 
err: %d\n", error); - return error; - } - - // run the kernel - size_t vectorCount = - (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; - cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its - // own copy of the cl_kernel - cl_program program = job->programs[j]; - - if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), - &tinfo->outBuf[j]))) - { - LogBuildError(program); - return error; - } - if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), - &tinfo->inBuf))) - { - LogBuildError(program); - return error; - } - - if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, - &vectorCount, NULL, 0, NULL, NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - return error; - } - } - - // Get that moving - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); - - if (gSkipCorrectnessTesting) return CL_SUCCESS; - - // Calculate the correctly rounded reference result - cl_long *r = (cl_long *)gOut_Ref + thread_id * buffer_elements; - cl_double *s = (cl_double *)p; - for (j = 0; j < buffer_elements; j++) r[j] = dfunc.i_f(s[j]); - - // Read the data back -- no need to wait for the first N-1 buffers. This is - // an in order queue. - for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) - { - out[j] = (cl_long *)clEnqueueMapBuffer( - tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, - buffer_size, 0, NULL, NULL, &error); - if (error || NULL == out[j]) - { - vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, - error); - return error; - } - } - - // Wait for the last buffer - out[j] = (cl_long *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], - CL_TRUE, CL_MAP_READ, 0, buffer_size, - 0, NULL, NULL, &error); - if (error || NULL == out[j]) - { - vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, error); - return error; - } - - // Verify data - cl_long *t = (cl_long *)r; - for (j = 0; j < buffer_elements; j++) - { - cl_long *q = out[0]; - - // If we aren't getting the correctly rounded result - if (gMinVectorSizeIndex == 0 && t[j] != q[j]) - { - // If we aren't getting the correctly rounded result - if (ftz) - { - if (IsDoubleSubnormal(s[j])) - { - cl_long correct = dfunc.i_f(+0.0f); - cl_long correct2 = dfunc.i_f(-0.0f); - if (correct == q[j] || correct2 == q[j]) continue; - } - } - - cl_ulong err = t[j] - q[j]; - if (q[j] > t[j]) err = q[j] - t[j]; - vlog_error("\nERROR: %sD: %zd ulp error at %.13la: *%zd vs. %zd\n", - name, err, ((double *)gIn)[j], t[j], q[j]); - return -1; - } - - - for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++) - { - q = out[k]; - // If we aren't getting the correctly rounded result - if (-t[j] != q[j]) - { - if (ftz) - { - if (IsDoubleSubnormal(s[j])) - { - int64_t correct = -dfunc.i_f(+0.0f); - int64_t correct2 = -dfunc.i_f(-0.0f); - if (correct == q[j] || correct2 == q[j]) continue; - } - } - - cl_ulong err = -t[j] - q[j]; - if (q[j] > -t[j]) err = q[j] + t[j]; - vlog_error( - "\nERROR: %sD%s: %zd ulp error at %.13la: *%zd vs. %zd\n", - name, sizeNames[k], err, ((double *)gIn)[j], -t[j], q[j]); - return -1; - } - } - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], - out[j], 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! 
err: %d\n", - j, error); - return error; - } - } - - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); - - - if (0 == (base & 0x0fffffff)) - { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd " - "ThreadCount:%2u\n", - base, job->step, job->scale, buffer_elements, - job->threadCount); - } - else - { - vlog("."); - } - fflush(stdout); - } - - return CL_SUCCESS; -} diff --git a/test_conformance/math_brute_force/mad.cpp b/test_conformance/math_brute_force/mad_double.cpp similarity index 52% rename from test_conformance/math_brute_force/mad.cpp rename to test_conformance/math_brute_force/mad_double.cpp index 0d8c6d44..cbbc1951 100644 --- a/test_conformance/math_brute_force/mad.cpp +++ b/test_conformance/math_brute_force/mad_double.cpp @@ -20,97 +20,6 @@ #include -static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) -{ - const char *c[] = { "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global float", - sizeNames[vectorSize], - "* out, __global float", - sizeNames[vectorSize], - "* in1, __global float", - sizeNames[vectorSize], - "* in2, __global float", - sizeNames[vectorSize], - "* in3 )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " out[i] = ", - name, - "( in1[i], in2[i], in3[i] );\n" - "}\n" }; - - const char *c3[] = { - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global float* out, __global float* in, __global float* in2, " - "__global float* in3)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " float3 f0 = vload3( 0, in + 3 * i );\n" - " float3 f1 = vload3( 0, in2 + 3 * i );\n" - " float3 f2 = vload3( 0, in3 + 3 * i );\n" - " f0 = ", - name, - "( f0, f1, f2 );\n" - " vstore3( f0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " - "buffer size \n" - " float3 f0;\n" - " float3 f1;\n" - " float3 f2;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (float3)( in[3*i], NAN, NAN ); \n" - " f1 = (float3)( in2[3*i], NAN, NAN ); \n" - " f2 = (float3)( in3[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" - " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n" - " f2 = (float3)( in3[3*i], in3[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " f0 = ", - name, - "( f0, f1, f2 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); -} - static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, cl_program *p, bool relaxedMode) { @@ -213,15 +122,6 @@ typedef struct BuildKernelInfo bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
} BuildKernelInfo; -static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); -} - static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { @@ -231,278 +131,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, info->programs + i, info->relaxedMode); } -int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode) -{ - uint64_t i; - uint32_t j, k; - int error; - - logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); - - cl_program programs[VECTOR_SIZE_COUNT]; - cl_kernel kernels[VECTOR_SIZE_COUNT]; - float maxError = 0.0f; - float maxErrorVal = 0.0f; - float maxErrorVal2 = 0.0f; - float maxErrorVal3 = 0.0f; - size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE; - uint64_t step = getTestStep(sizeof(float), bufferSize); - - // Init the kernels - { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode, relaxedMode }; - if ((error = ThreadPool_Do(BuildKernel_FloatFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - return error; - } - - for (i = 0; i < (1ULL << 32); i += step) - { - // Init input array - cl_uint *p = (cl_uint *)gIn; - cl_uint *p2 = (cl_uint *)gIn2; - cl_uint *p3 = (cl_uint *)gIn3; - for (j = 0; j < bufferSize / sizeof(float); j++) - { - p[j] = genrand_int32(d); - p2[j] = genrand_int32(d); - p3[j] = genrand_int32(d); - } - - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - bufferSize, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); - return error; - } - - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, - bufferSize, gIn2, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); - return 
error; - } - - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, - bufferSize, gIn3, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); - return error; - } - - // write garbage into output arrays - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - uint32_t pattern = 0xffffdead; - memset_pattern4(gOut[j], &pattern, bufferSize); - if ((error = - clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, - bufferSize, gOut[j], 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", - error, j); - goto exit; - } - } - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - size_t vectorSize = sizeof(cl_float) * sizeValues[j]; - size_t localCount = (bufferSize + vectorSize - 1) - / vectorSize; // bufferSize / vectorSize rounded up - if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), - &gOutBuffer[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), - &gInBuffer))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2), - &gInBuffer2))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3), - &gInBuffer3))) - { - LogBuildError(programs[j]); - goto exit; - } - - if ((error = - clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, - &localCount, NULL, 0, NULL, NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - } - - // Get that moving - if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); - - // Calculate the correctly rounded reference result - float *r = (float *)gOut_Ref; - float *s = (float *)gIn; - float *s2 = (float *)gIn2; - float *s3 = (float *)gIn3; - for (j = 0; j < bufferSize / sizeof(float); j++) - r[j] = (float)f->func.f_fff(s[j], s2[j], s3[j]); - - // Read the data back - for (j = 
gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - if ((error = - clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, - bufferSize, gOut[j], 0, NULL, NULL))) - { - vlog_error("ReadArray failed %d\n", error); - goto exit; - } - } - - if (gSkipCorrectnessTesting) break; - - // Verify data -- No verification possible. - // MAD is a random number generator. - if (0 == (i & 0x0fffffff)) - { - vlog("."); - fflush(stdout); - } - } - - if (!gSkipCorrectnessTesting) - { - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - if (gMeasureTimes) - { - // Init input array - cl_uint *p = (cl_uint *)gIn; - cl_uint *p2 = (cl_uint *)gIn2; - cl_uint *p3 = (cl_uint *)gIn3; - for (j = 0; j < bufferSize / sizeof(float); j++) - { - p[j] = genrand_int32(d); - p2[j] = genrand_int32(d); - p3[j] = genrand_int32(d); - } - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - bufferSize, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); - return error; - } - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, - bufferSize, gIn2, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); - return error; - } - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, - bufferSize, gIn3, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); - return error; - } - - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - size_t vectorSize = sizeof(cl_float) * sizeValues[j]; - size_t localCount = (bufferSize + vectorSize - 1) - / vectorSize; // bufferSize / vectorSize rounded up - if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), - &gOutBuffer[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), - &gInBuffer))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 2, 
sizeof(gInBuffer2), - &gInBuffer2))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3), - &gInBuffer3))) - { - LogBuildError(programs[j]); - goto exit; - } - - double sum = 0.0; - double bestTime = INFINITY; - for (k = 0; k < PERF_LOOP_COUNT; k++) - { - uint64_t startTime = GetTime(); - if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, - &localCount, NULL, 0, NULL, - NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - - // Make sure OpenCL is done - if ((error = clFinish(gQueue))) - { - vlog_error("Error %d at clFinish\n", error); - goto exit; - } - - uint64_t endTime = GetTime(); - double time = SubtractTime(endTime, startTime); - sum += time; - if (time < bestTime) bestTime = time; - } - - if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double)gDeviceFrequency - * gComputeDevices * gSimdSize * 1e6 - / (bufferSize / sizeof(float)); - vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", - f->name, sizeNames[j]); - } - } - - if (!gSkipCorrectnessTesting) - vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, - maxErrorVal3); - vlog("\n"); - -exit: - // Release - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); - } - - return error; -} - int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode) { uint64_t i; diff --git a/test_conformance/math_brute_force/mad_float.cpp b/test_conformance/math_brute_force/mad_float.cpp new file mode 100644 index 00000000..2124b268 --- /dev/null +++ b/test_conformance/math_brute_force/mad_float.cpp @@ -0,0 +1,402 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + +static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, + cl_program *p, bool relaxedMode) +{ + const char *c[] = { "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float", + sizeNames[vectorSize], + "* out, __global float", + sizeNames[vectorSize], + "* in1, __global float", + sizeNames[vectorSize], + "* in2, __global float", + sizeNames[vectorSize], + "* in3 )\n" + "{\n" + " size_t i = get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i], in3[i] );\n" + "}\n" }; + + const char *c3[] = { + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float* out, __global float* in, __global float* in2, " + "__global float* in3)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " float3 f0 = vload3( 0, in + 3 * i );\n" + " float3 f1 = vload3( 0, in2 + 3 * i );\n" + " float3 f2 = vload3( 0, in3 + 3 * i );\n" + " f0 = ", + name, + "( f0, f1, f2 );\n" + " vstore3( f0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " float3 f0;\n" + " float3 f1;\n" + " float3 f2;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (float3)( in[3*i], NAN, NAN ); \n" + " f1 = (float3)( in2[3*i], NAN, NAN ); \n" + " f2 = (float3)( in3[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" + " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n" + " f2 = (float3)( in3[3*i], in3[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " f0 = ", + name, + "( f0, f1, f2 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); + + return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); +} + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_kernel *kernels; + cl_program *programs; + const char *nameInCode; + bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
+} BuildKernelInfo; + +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernel(info->nameInCode, i, info->kernels + i, + info->programs + i, info->relaxedMode); +} + +int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode) +{ + uint64_t i; + uint32_t j, k; + int error; + + logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); + + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; + float maxError = 0.0f; + float maxErrorVal = 0.0f; + float maxErrorVal2 = 0.0f; + float maxErrorVal3 = 0.0f; + size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE; + uint64_t step = getTestStep(sizeof(float), bufferSize); + + // Init the kernels + { + BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, + f->nameInCode, relaxedMode }; + if ((error = ThreadPool_Do(BuildKernel_FloatFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + } + + for (i = 0; i < (1ULL << 32); i += step) + { + // Init input array + cl_uint *p = (cl_uint *)gIn; + cl_uint *p2 = (cl_uint *)gIn2; + cl_uint *p3 = (cl_uint *)gIn3; + for (j = 0; j < bufferSize / sizeof(float); j++) + { + p[j] = genrand_int32(d); + p2[j] = genrand_int32(d); + p3[j] = genrand_int32(d); + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + bufferSize, gIn2, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, + bufferSize, gIn3, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); + return error; + } + + // write 
garbage into output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xffffdead; + memset_pattern4(gOut[j], &pattern, bufferSize); + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + goto exit; + } + } + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeof(cl_float) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2), + &gInBuffer2))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3), + &gInBuffer3))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); + + // Calculate the correctly rounded reference result + float *r = (float *)gOut_Ref; + float *s = (float *)gIn; + float *s2 = (float *)gIn2; + float *s3 = (float *)gIn3; + for (j = 0; j < bufferSize / sizeof(float); j++) + r[j] = (float)f->func.f_fff(s[j], s2[j], s3[j]); + + // Read the data back + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + goto exit; + } + 
} + + if (gSkipCorrectnessTesting) break; + + // Verify data -- No verification possible. + // MAD is a random number generator. + if (0 == (i & 0x0fffffff)) + { + vlog("."); + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + if (gMeasureTimes) + { + // Init input array + cl_uint *p = (cl_uint *)gIn; + cl_uint *p2 = (cl_uint *)gIn2; + cl_uint *p3 = (cl_uint *)gIn3; + for (j = 0; j < bufferSize / sizeof(float); j++) + { + p[j] = genrand_int32(d); + p2[j] = genrand_int32(d); + p3[j] = genrand_int32(d); + } + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + bufferSize, gIn2, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); + return error; + } + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, + bufferSize, gIn3, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); + return error; + } + + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeof(cl_float) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2), + &gInBuffer2))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3), + &gInBuffer3))) + { + LogBuildError(programs[j]); + goto exit; + } + + double sum = 
0.0; + double bestTime = INFINITY; + for (k = 0; k < PERF_LOOP_COUNT; k++) + { + uint64_t startTime = GetTime(); + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + + // Make sure OpenCL is done + if ((error = clFinish(gQueue))) + { + vlog_error("Error %d at clFinish\n", error); + goto exit; + } + + uint64_t endTime = GetTime(); + double time = SubtractTime(endTime, startTime); + sum += time; + if (time < bestTime) bestTime = time; + } + + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(float)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", + f->name, sizeNames[j]); + } + } + + if (!gSkipCorrectnessTesting) + vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, + maxErrorVal3); + vlog("\n"); + +exit: + // Release + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + clReleaseKernel(kernels[k]); + clReleaseProgram(programs[k]); + } + + return error; +} diff --git a/test_conformance/math_brute_force/ternary_double.cpp b/test_conformance/math_brute_force/ternary_double.cpp new file mode 100644 index 00000000..427f4efd --- /dev/null +++ b/test_conformance/math_brute_force/ternary_double.cpp @@ -0,0 +1,842 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + +#define CORRECTLY_ROUNDED 0 +#define FLUSHED 1 + +static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, + cl_program *p, bool relaxedMode) +{ + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double", + sizeNames[vectorSize], + "* out, __global double", + sizeNames[vectorSize], + "* in1, __global double", + sizeNames[vectorSize], + "* in2, __global double", + sizeNames[vectorSize], + "* in3 )\n" + "{\n" + " size_t i = get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i], in3[i] );\n" + "}\n" }; + + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double* out, __global double* in, __global double* in2, " + "__global double* in3)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " double3 d0 = vload3( 0, in + 3 * i );\n" + " double3 d1 = vload3( 0, in2 + 3 * i );\n" + " double3 d2 = vload3( 0, in3 + 3 * i );\n" + " d0 = ", + name, + "( d0, d1, d2 );\n" + " vstore3( d0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " double3 d0;\n" + " double3 d1;\n" + " double3 d2;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " d0 = (double3)( in[3*i], NAN, NAN ); \n" + " d1 = (double3)( in2[3*i], NAN, NAN ); \n" + " d2 = (double3)( in3[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" + " d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n" + " d2 = (double3)( in3[3*i], in3[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " d0 = ", + name, + "( d0, d1, d2 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = d0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = d0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); + + return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); +} + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_kernel *kernels; + cl_program *programs; + const char *nameInCode; + bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
+} BuildKernelInfo; + +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernelDouble(info->nameInCode, i, info->kernels + i, + info->programs + i, info->relaxedMode); +} + +// A table of more difficult cases to get right +static const double specialValuesDouble[] = { + -NAN, + -INFINITY, + -DBL_MAX, + MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), + MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11), + MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), + MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10), + -3.0, + MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), + -2.5, + MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), + -2.0, + MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), + -1.5, + MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52), + MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), + -1.0, + MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53), + MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), + -DBL_MIN, + MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), + MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), + MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), + MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), + MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), + MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), + -0.0, + + +NAN, + 
+INFINITY, + +DBL_MAX, + MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), + MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11), + MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), + MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10), + +3.0, + MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), + +2.5, + MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), + +2.0, + MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), + +1.5, + MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52), + MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), + +1.0, + MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53), + MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), + +DBL_MIN, + MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), + MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), + MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), + MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), + +0.0, +}; + +static const size_t specialValuesDoubleCount = + sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]); + +int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d, + bool relaxedMode) +{ + uint64_t i; + uint32_t j, k; + int error; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; + float maxError = 0.0f; + int ftz = f->ftz || gForceFTZ; + double maxErrorVal = 0.0f; + double 
maxErrorVal2 = 0.0f; + double maxErrorVal3 = 0.0f; + size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE; + uint64_t step = getTestStep(sizeof(double), bufferSize); + + logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); + + Force64BitFPUPrecision(); + + // Init the kernels + { + BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, + f->nameInCode, relaxedMode }; + if ((error = ThreadPool_Do(BuildKernel_DoubleFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + } + + for (i = 0; i < (1ULL << 32); i += step) + { + // Init input array + double *p = (double *)gIn; + double *p2 = (double *)gIn2; + double *p3 = (double *)gIn3; + j = 0; + if (i == 0) + { // test edge cases + uint32_t x, y, z; + x = y = z = 0; + for (; j < bufferSize / sizeof(double); j++) + { + p[j] = specialValuesDouble[x]; + p2[j] = specialValuesDouble[y]; + p3[j] = specialValuesDouble[z]; + if (++x >= specialValuesDoubleCount) + { + x = 0; + if (++y >= specialValuesDoubleCount) + { + y = 0; + if (++z >= specialValuesDoubleCount) break; + } + } + } + if (j == bufferSize / sizeof(double)) + vlog_error("Test Error: not all special cases tested!\n"); + } + + for (; j < bufferSize / sizeof(double); j++) + { + p[j] = DoubleFromUInt32(genrand_int32(d)); + p2[j] = DoubleFromUInt32(genrand_int32(d)); + p3[j] = DoubleFromUInt32(genrand_int32(d)); + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + bufferSize, gIn2, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, + bufferSize, gIn3, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); + return error; + } + 
+ // write garbage into output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xffffdead; + memset_pattern4(gOut[j], &pattern, bufferSize); + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + goto exit; + } + } + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeof(cl_double) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2), + &gInBuffer2))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3), + &gInBuffer3))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); + + // Calculate the correctly rounded reference result + double *r = (double *)gOut_Ref; + double *s = (double *)gIn; + double *s2 = (double *)gIn2; + double *s3 = (double *)gIn3; + for (j = 0; j < bufferSize / sizeof(double); j++) + r[j] = (double)f->dfunc.f_fff(s[j], s2[j], s3[j]); + + // Read the data back + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", 
error); + goto exit; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data + uint64_t *t = (uint64_t *)gOut_Ref; + for (j = 0; j < bufferSize / sizeof(double); j++) + { + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + uint64_t *q = (uint64_t *)(gOut[k]); + + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + double test = ((double *)q)[j]; + long double correct = f->dfunc.f_fff(s[j], s2[j], s3[j]); + float err = Bruteforce_Ulp_Error_Double(test, correct); + int fail = !(fabsf(err) <= f->double_ulps); + + if (fail && ftz) + { + // retry per section 6.5.3.2 + if (IsDoubleSubnormal(correct)) + { // look at me, + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (fail && IsDoubleSubnormal(s[j])) + { // look at me, + long double correct2 = + f->dfunc.f_fff(0.0, s2[j], s3[j]); + long double correct3 = + f->dfunc.f_fff(-0.0, s2[j], s3[j]); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + fail = fail + && ((!(fabsf(err2) <= f->double_ulps)) + && (!(fabsf(err3) <= f->double_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsDoubleResultSubnormal(correct2, + f->double_ulps) + || IsDoubleResultSubnormal(correct3, + f->double_ulps)) + { // look at me now, + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // try with first two args as zero + if (IsDoubleSubnormal(s2[j])) + { // its fun to have fun, + correct2 = f->dfunc.f_fff(0.0, 0.0, s3[j]); + correct3 = f->dfunc.f_fff(-0.0, 0.0, s3[j]); + long double correct4 = + f->dfunc.f_fff(0.0, -0.0, s3[j]); + long double correct5 = + f->dfunc.f_fff(-0.0, -0.0, s3[j]); + err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + float err4 = + Bruteforce_Ulp_Error_Double(test, correct4); + float err5 = 
+ Bruteforce_Ulp_Error_Double(test, correct5); + fail = fail + && ((!(fabsf(err2) <= f->double_ulps)) + && (!(fabsf(err3) <= f->double_ulps)) + && (!(fabsf(err4) <= f->double_ulps)) + && (!(fabsf(err5) <= f->double_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + + // retry per section 6.5.3.4 + if (IsDoubleResultSubnormal(correct2, + f->double_ulps) + || IsDoubleResultSubnormal(correct3, + f->double_ulps) + || IsDoubleResultSubnormal(correct4, + f->double_ulps) + || IsDoubleResultSubnormal(correct5, + f->double_ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + if (IsDoubleSubnormal(s3[j])) + { // but you have to know how! + correct2 = f->dfunc.f_fff(0.0, 0.0, 0.0f); + correct3 = f->dfunc.f_fff(-0.0, 0.0, 0.0f); + correct4 = f->dfunc.f_fff(0.0, -0.0, 0.0f); + correct5 = f->dfunc.f_fff(-0.0, -0.0, 0.0f); + long double correct6 = + f->dfunc.f_fff(0.0, 0.0, -0.0f); + long double correct7 = + f->dfunc.f_fff(-0.0, 0.0, -0.0f); + long double correct8 = + f->dfunc.f_fff(0.0, -0.0, -0.0f); + long double correct9 = + f->dfunc.f_fff(-0.0, -0.0, -0.0f); + err2 = Bruteforce_Ulp_Error_Double( + test, correct2); + err3 = Bruteforce_Ulp_Error_Double( + test, correct3); + err4 = Bruteforce_Ulp_Error_Double( + test, correct4); + err5 = Bruteforce_Ulp_Error_Double( + test, correct5); + float err6 = Bruteforce_Ulp_Error_Double( + test, correct6); + float err7 = Bruteforce_Ulp_Error_Double( + test, correct7); + float err8 = Bruteforce_Ulp_Error_Double( + test, correct8); + float err9 = Bruteforce_Ulp_Error_Double( + test, correct9); + fail = fail + && ((!(fabsf(err2) <= f->double_ulps)) + && (!(fabsf(err3) + <= f->double_ulps)) + && (!(fabsf(err4) + <= f->double_ulps)) + && (!(fabsf(err5) + <= f->double_ulps)) + && (!(fabsf(err5) + <= f->double_ulps)) + && (!(fabsf(err6) + <= f->double_ulps)) + && (!(fabsf(err7) + <= 
f->double_ulps)) + && (!(fabsf(err8) + <= f->double_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + if (fabsf(err6) < fabsf(err)) err = err6; + if (fabsf(err7) < fabsf(err)) err = err7; + if (fabsf(err8) < fabsf(err)) err = err8; + if (fabsf(err9) < fabsf(err)) err = err9; + + // retry per section 6.5.3.4 + if (IsDoubleResultSubnormal(correct2, + f->double_ulps) + || IsDoubleResultSubnormal( + correct3, f->double_ulps) + || IsDoubleResultSubnormal( + correct4, f->double_ulps) + || IsDoubleResultSubnormal( + correct5, f->double_ulps) + || IsDoubleResultSubnormal( + correct6, f->double_ulps) + || IsDoubleResultSubnormal( + correct7, f->double_ulps) + || IsDoubleResultSubnormal( + correct8, f->double_ulps) + || IsDoubleResultSubnormal( + correct9, f->double_ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + else if (IsDoubleSubnormal(s3[j])) + { + correct2 = f->dfunc.f_fff(0.0, s2[j], 0.0); + correct3 = f->dfunc.f_fff(-0.0, s2[j], 0.0); + long double correct4 = + f->dfunc.f_fff(0.0, s2[j], -0.0); + long double correct5 = + f->dfunc.f_fff(-0.0, s2[j], -0.0); + err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + float err4 = + Bruteforce_Ulp_Error_Double(test, correct4); + float err5 = + Bruteforce_Ulp_Error_Double(test, correct5); + fail = fail + && ((!(fabsf(err2) <= f->double_ulps)) + && (!(fabsf(err3) <= f->double_ulps)) + && (!(fabsf(err4) <= f->double_ulps)) + && (!(fabsf(err5) <= f->double_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + + // retry per section 6.5.3.4 + if (IsDoubleResultSubnormal(correct2, + f->double_ulps) + || IsDoubleResultSubnormal(correct3, + f->double_ulps) + || 
IsDoubleResultSubnormal(correct4, + f->double_ulps) + || IsDoubleResultSubnormal(correct5, + f->double_ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + else if (fail && IsDoubleSubnormal(s2[j])) + { + long double correct2 = + f->dfunc.f_fff(s[j], 0.0, s3[j]); + long double correct3 = + f->dfunc.f_fff(s[j], -0.0, s3[j]); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + fail = fail + && ((!(fabsf(err2) <= f->double_ulps)) + && (!(fabsf(err3) <= f->double_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsDoubleResultSubnormal(correct2, + f->double_ulps) + || IsDoubleResultSubnormal(correct3, + f->double_ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // try with second two args as zero + if (IsDoubleSubnormal(s3[j])) + { + correct2 = f->dfunc.f_fff(s[j], 0.0, 0.0); + correct3 = f->dfunc.f_fff(s[j], -0.0, 0.0); + long double correct4 = + f->dfunc.f_fff(s[j], 0.0, -0.0); + long double correct5 = + f->dfunc.f_fff(s[j], -0.0, -0.0); + err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + float err4 = + Bruteforce_Ulp_Error_Double(test, correct4); + float err5 = + Bruteforce_Ulp_Error_Double(test, correct5); + fail = fail + && ((!(fabsf(err2) <= f->double_ulps)) + && (!(fabsf(err3) <= f->double_ulps)) + && (!(fabsf(err4) <= f->double_ulps)) + && (!(fabsf(err5) <= f->double_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + + // retry per section 6.5.3.4 + if (IsDoubleResultSubnormal(correct2, + f->double_ulps) + || IsDoubleResultSubnormal(correct3, + f->double_ulps) + || IsDoubleResultSubnormal(correct4, + f->double_ulps) + || 
IsDoubleResultSubnormal(correct5, + f->double_ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + else if (fail && IsDoubleSubnormal(s3[j])) + { + long double correct2 = + f->dfunc.f_fff(s[j], s2[j], 0.0); + long double correct3 = + f->dfunc.f_fff(s[j], s2[j], -0.0); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + fail = fail + && ((!(fabsf(err2) <= f->double_ulps)) + && (!(fabsf(err3) <= f->double_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsDoubleResultSubnormal(correct2, + f->double_ulps) + || IsDoubleResultSubnormal(correct3, + f->double_ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + + if (fabsf(err) > maxError) + { + maxError = fabsf(err); + maxErrorVal = s[j]; + maxErrorVal2 = s2[j]; + maxErrorVal3 = s3[j]; + } + + if (fail) + { + vlog_error("\nERROR: %sD%s: %f ulp error at {%.13la, " + "%.13la, %.13la}: *%.13la vs. 
%.13la\n", + f->name, sizeNames[k], err, s[j], s2[j], + s3[j], ((double *)gOut_Ref)[j], test); + error = -1; + goto exit; + } + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, + bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + if (gMeasureTimes) + { + // Init input array + double *p = (double *)gIn; + double *p2 = (double *)gIn2; + double *p3 = (double *)gIn3; + for (j = 0; j < bufferSize / sizeof(double); j++) + { + p[j] = DoubleFromUInt32(genrand_int32(d)); + p2[j] = DoubleFromUInt32(genrand_int32(d)); + p3[j] = DoubleFromUInt32(genrand_int32(d)); + } + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + bufferSize, gIn2, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); + return error; + } + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, + bufferSize, gIn3, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); + return error; + } + + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeof(cl_double) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2), + &gInBuffer2))) + { + 
LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3), + &gInBuffer3))) + { + LogBuildError(programs[j]); + goto exit; + } + + double sum = 0.0; + double bestTime = INFINITY; + for (k = 0; k < PERF_LOOP_COUNT; k++) + { + uint64_t startTime = GetTime(); + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + + // Make sure OpenCL is done + if ((error = clFinish(gQueue))) + { + vlog_error("Error %d at clFinish\n", error); + goto exit; + } + + uint64_t endTime = GetTime(); + double time = SubtractTime(endTime, startTime); + sum += time; + if (time < bestTime) bestTime = time; + } + + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(double)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", + f->name, sizeNames[j]); + } + for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); + } + + if (!gSkipCorrectnessTesting) + vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, + maxErrorVal3); + vlog("\n"); + +exit: + // Release + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + clReleaseKernel(kernels[k]); + clReleaseProgram(programs[k]); + } + + return error; +} diff --git a/test_conformance/math_brute_force/ternary.cpp b/test_conformance/math_brute_force/ternary_float.cpp similarity index 52% rename from test_conformance/math_brute_force/ternary.cpp rename to test_conformance/math_brute_force/ternary_float.cpp index f8908909..3b3bde7c 100644 --- a/test_conformance/math_brute_force/ternary.cpp +++ b/test_conformance/math_brute_force/ternary_float.cpp @@ -114,99 +114,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } 
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) -{ - const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global double", - sizeNames[vectorSize], - "* out, __global double", - sizeNames[vectorSize], - "* in1, __global double", - sizeNames[vectorSize], - "* in2, __global double", - sizeNames[vectorSize], - "* in3 )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " out[i] = ", - name, - "( in1[i], in2[i], in3[i] );\n" - "}\n" }; - - const char *c3[] = { - "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global double* out, __global double* in, __global double* in2, " - "__global double* in3)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " double3 d0 = vload3( 0, in + 3 * i );\n" - " double3 d1 = vload3( 0, in2 + 3 * i );\n" - " double3 d2 = vload3( 0, in3 + 3 * i );\n" - " d0 = ", - name, - "( d0, d1, d2 );\n" - " vstore3( d0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " - "buffer size \n" - " double3 d0;\n" - " double3 d1;\n" - " double3 d2;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " d0 = (double3)( in[3*i], NAN, NAN ); \n" - " d1 = (double3)( in2[3*i], NAN, NAN ); \n" - " d2 = (double3)( in3[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" - " d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n" - " d2 = (double3)( in3[3*i], in3[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " d0 = ", - name, - "( d0, d1, d2 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = d0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = d0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); -} - typedef struct BuildKernelInfo { cl_uint offset; // the first vector size to build @@ -225,15 +132,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, info->programs + i, info->relaxedMode); } -static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernelDouble(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); -} - // A table of more difficult cases to get right static const float specialValuesFloat[] = { -NAN, @@ -316,7 +214,6 @@ static const float specialValuesFloat[] = { static const size_t specialValuesFloatCount = sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]); - int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode) { uint64_t i; @@ -1077,711 +974,3 @@ exit: return error; 
} - -// A table of more difficult cases to get right -static const double specialValuesDouble[] = { - -NAN, - -INFINITY, - -DBL_MAX, - MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), - MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), - MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11), - MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), - MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), - MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10), - -3.0, - MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), - -2.5, - MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), - -2.0, - MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), - -1.5, - MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52), - MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), - -1.0, - MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53), - MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), - -DBL_MIN, - MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), - MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), - MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), - MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), - MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), - MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), - -0.0, - - +NAN, - +INFINITY, - +DBL_MAX, - MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12), - MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64), - MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11), - MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), - MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), - 
MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10), - +3.0, - MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), - +2.5, - MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), - +2.0, - MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), - +1.5, - MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52), - MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), - +1.0, - MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53), - MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), - +DBL_MIN, - MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), - MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), - MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), - MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), - MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), - MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), - +0.0, -}; - -static const size_t specialValuesDoubleCount = - sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]); - - -int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d, - bool relaxedMode) -{ - uint64_t i; - uint32_t j, k; - int error; - cl_program programs[VECTOR_SIZE_COUNT]; - cl_kernel kernels[VECTOR_SIZE_COUNT]; - float maxError = 0.0f; - int ftz = f->ftz || gForceFTZ; - double maxErrorVal = 0.0f; - double maxErrorVal2 = 0.0f; - double maxErrorVal3 = 0.0f; - size_t bufferSize = (gWimpyMode) ? 
gWimpyBufferSize : BUFFER_SIZE; - uint64_t step = getTestStep(sizeof(double), bufferSize); - - logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); - - Force64BitFPUPrecision(); - - // Init the kernels - { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode, relaxedMode }; - if ((error = ThreadPool_Do(BuildKernel_DoubleFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - return error; - } - - for (i = 0; i < (1ULL << 32); i += step) - { - // Init input array - double *p = (double *)gIn; - double *p2 = (double *)gIn2; - double *p3 = (double *)gIn3; - j = 0; - if (i == 0) - { // test edge cases - uint32_t x, y, z; - x = y = z = 0; - for (; j < bufferSize / sizeof(double); j++) - { - p[j] = specialValuesDouble[x]; - p2[j] = specialValuesDouble[y]; - p3[j] = specialValuesDouble[z]; - if (++x >= specialValuesDoubleCount) - { - x = 0; - if (++y >= specialValuesDoubleCount) - { - y = 0; - if (++z >= specialValuesDoubleCount) break; - } - } - } - if (j == bufferSize / sizeof(double)) - vlog_error("Test Error: not all special cases tested!\n"); - } - - for (; j < bufferSize / sizeof(double); j++) - { - p[j] = DoubleFromUInt32(genrand_int32(d)); - p2[j] = DoubleFromUInt32(genrand_int32(d)); - p3[j] = DoubleFromUInt32(genrand_int32(d)); - } - - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - bufferSize, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); - return error; - } - - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, - bufferSize, gIn2, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); - return error; - } - - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, - bufferSize, gIn3, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); - return error; - } - - // write garbage into output arrays - for (j = gMinVectorSizeIndex; j < 
gMaxVectorSizeIndex; j++) - { - uint32_t pattern = 0xffffdead; - memset_pattern4(gOut[j], &pattern, bufferSize); - if ((error = - clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, - bufferSize, gOut[j], 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", - error, j); - goto exit; - } - } - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - size_t vectorSize = sizeof(cl_double) * sizeValues[j]; - size_t localCount = (bufferSize + vectorSize - 1) - / vectorSize; // bufferSize / vectorSize rounded up - if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), - &gOutBuffer[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), - &gInBuffer))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2), - &gInBuffer2))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3), - &gInBuffer3))) - { - LogBuildError(programs[j]); - goto exit; - } - - if ((error = - clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, - &localCount, NULL, 0, NULL, NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - } - - // Get that moving - if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); - - // Calculate the correctly rounded reference result - double *r = (double *)gOut_Ref; - double *s = (double *)gIn; - double *s2 = (double *)gIn2; - double *s3 = (double *)gIn3; - for (j = 0; j < bufferSize / sizeof(double); j++) - r[j] = (double)f->dfunc.f_fff(s[j], s2[j], s3[j]); - - // Read the data back - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - if ((error = - clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, - bufferSize, gOut[j], 0, NULL, NULL))) - { - vlog_error("ReadArray failed %d\n", error); - goto exit; - } - } - - if (gSkipCorrectnessTesting) break; - - // 
Verify data - uint64_t *t = (uint64_t *)gOut_Ref; - for (j = 0; j < bufferSize / sizeof(double); j++) - { - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - uint64_t *q = (uint64_t *)(gOut[k]); - - // If we aren't getting the correctly rounded result - if (t[j] != q[j]) - { - double test = ((double *)q)[j]; - long double correct = f->dfunc.f_fff(s[j], s2[j], s3[j]); - float err = Bruteforce_Ulp_Error_Double(test, correct); - int fail = !(fabsf(err) <= f->double_ulps); - - if (fail && ftz) - { - // retry per section 6.5.3.2 - if (IsDoubleSubnormal(correct)) - { // look at me, - fail = fail && (test != 0.0f); - if (!fail) err = 0.0f; - } - - // retry per section 6.5.3.3 - if (fail && IsDoubleSubnormal(s[j])) - { // look at me, - long double correct2 = - f->dfunc.f_fff(0.0, s2[j], s3[j]); - long double correct3 = - f->dfunc.f_fff(-0.0, s2[j], s3[j]); - float err2 = - Bruteforce_Ulp_Error_Double(test, correct2); - float err3 = - Bruteforce_Ulp_Error_Double(test, correct3); - fail = fail - && ((!(fabsf(err2) <= f->double_ulps)) - && (!(fabsf(err3) <= f->double_ulps))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - - // retry per section 6.5.3.4 - if (IsDoubleResultSubnormal(correct2, - f->double_ulps) - || IsDoubleResultSubnormal(correct3, - f->double_ulps)) - { // look at me now, - fail = fail && (test != 0.0f); - if (!fail) err = 0.0f; - } - - // try with first two args as zero - if (IsDoubleSubnormal(s2[j])) - { // its fun to have fun, - correct2 = f->dfunc.f_fff(0.0, 0.0, s3[j]); - correct3 = f->dfunc.f_fff(-0.0, 0.0, s3[j]); - long double correct4 = - f->dfunc.f_fff(0.0, -0.0, s3[j]); - long double correct5 = - f->dfunc.f_fff(-0.0, -0.0, s3[j]); - err2 = - Bruteforce_Ulp_Error_Double(test, correct2); - err3 = - Bruteforce_Ulp_Error_Double(test, correct3); - float err4 = - Bruteforce_Ulp_Error_Double(test, correct4); - float err5 = - Bruteforce_Ulp_Error_Double(test, correct5); - fail = fail - && 
((!(fabsf(err2) <= f->double_ulps)) - && (!(fabsf(err3) <= f->double_ulps)) - && (!(fabsf(err4) <= f->double_ulps)) - && (!(fabsf(err5) <= f->double_ulps))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - if (fabsf(err4) < fabsf(err)) err = err4; - if (fabsf(err5) < fabsf(err)) err = err5; - - // retry per section 6.5.3.4 - if (IsDoubleResultSubnormal(correct2, - f->double_ulps) - || IsDoubleResultSubnormal(correct3, - f->double_ulps) - || IsDoubleResultSubnormal(correct4, - f->double_ulps) - || IsDoubleResultSubnormal(correct5, - f->double_ulps)) - { - fail = fail && (test != 0.0f); - if (!fail) err = 0.0f; - } - - if (IsDoubleSubnormal(s3[j])) - { // but you have to know how! - correct2 = f->dfunc.f_fff(0.0, 0.0, 0.0f); - correct3 = f->dfunc.f_fff(-0.0, 0.0, 0.0f); - correct4 = f->dfunc.f_fff(0.0, -0.0, 0.0f); - correct5 = f->dfunc.f_fff(-0.0, -0.0, 0.0f); - long double correct6 = - f->dfunc.f_fff(0.0, 0.0, -0.0f); - long double correct7 = - f->dfunc.f_fff(-0.0, 0.0, -0.0f); - long double correct8 = - f->dfunc.f_fff(0.0, -0.0, -0.0f); - long double correct9 = - f->dfunc.f_fff(-0.0, -0.0, -0.0f); - err2 = Bruteforce_Ulp_Error_Double( - test, correct2); - err3 = Bruteforce_Ulp_Error_Double( - test, correct3); - err4 = Bruteforce_Ulp_Error_Double( - test, correct4); - err5 = Bruteforce_Ulp_Error_Double( - test, correct5); - float err6 = Bruteforce_Ulp_Error_Double( - test, correct6); - float err7 = Bruteforce_Ulp_Error_Double( - test, correct7); - float err8 = Bruteforce_Ulp_Error_Double( - test, correct8); - float err9 = Bruteforce_Ulp_Error_Double( - test, correct9); - fail = fail - && ((!(fabsf(err2) <= f->double_ulps)) - && (!(fabsf(err3) - <= f->double_ulps)) - && (!(fabsf(err4) - <= f->double_ulps)) - && (!(fabsf(err5) - <= f->double_ulps)) - && (!(fabsf(err5) - <= f->double_ulps)) - && (!(fabsf(err6) - <= f->double_ulps)) - && (!(fabsf(err7) - <= f->double_ulps)) - && (!(fabsf(err8) - <= f->double_ulps))); - if 
(fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - if (fabsf(err4) < fabsf(err)) err = err4; - if (fabsf(err5) < fabsf(err)) err = err5; - if (fabsf(err6) < fabsf(err)) err = err6; - if (fabsf(err7) < fabsf(err)) err = err7; - if (fabsf(err8) < fabsf(err)) err = err8; - if (fabsf(err9) < fabsf(err)) err = err9; - - // retry per section 6.5.3.4 - if (IsDoubleResultSubnormal(correct2, - f->double_ulps) - || IsDoubleResultSubnormal( - correct3, f->double_ulps) - || IsDoubleResultSubnormal( - correct4, f->double_ulps) - || IsDoubleResultSubnormal( - correct5, f->double_ulps) - || IsDoubleResultSubnormal( - correct6, f->double_ulps) - || IsDoubleResultSubnormal( - correct7, f->double_ulps) - || IsDoubleResultSubnormal( - correct8, f->double_ulps) - || IsDoubleResultSubnormal( - correct9, f->double_ulps)) - { - fail = fail && (test != 0.0f); - if (!fail) err = 0.0f; - } - } - } - else if (IsDoubleSubnormal(s3[j])) - { - correct2 = f->dfunc.f_fff(0.0, s2[j], 0.0); - correct3 = f->dfunc.f_fff(-0.0, s2[j], 0.0); - long double correct4 = - f->dfunc.f_fff(0.0, s2[j], -0.0); - long double correct5 = - f->dfunc.f_fff(-0.0, s2[j], -0.0); - err2 = - Bruteforce_Ulp_Error_Double(test, correct2); - err3 = - Bruteforce_Ulp_Error_Double(test, correct3); - float err4 = - Bruteforce_Ulp_Error_Double(test, correct4); - float err5 = - Bruteforce_Ulp_Error_Double(test, correct5); - fail = fail - && ((!(fabsf(err2) <= f->double_ulps)) - && (!(fabsf(err3) <= f->double_ulps)) - && (!(fabsf(err4) <= f->double_ulps)) - && (!(fabsf(err5) <= f->double_ulps))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - if (fabsf(err4) < fabsf(err)) err = err4; - if (fabsf(err5) < fabsf(err)) err = err5; - - // retry per section 6.5.3.4 - if (IsDoubleResultSubnormal(correct2, - f->double_ulps) - || IsDoubleResultSubnormal(correct3, - f->double_ulps) - || IsDoubleResultSubnormal(correct4, - f->double_ulps) - || 
IsDoubleResultSubnormal(correct5, - f->double_ulps)) - { - fail = fail && (test != 0.0f); - if (!fail) err = 0.0f; - } - } - } - else if (fail && IsDoubleSubnormal(s2[j])) - { - long double correct2 = - f->dfunc.f_fff(s[j], 0.0, s3[j]); - long double correct3 = - f->dfunc.f_fff(s[j], -0.0, s3[j]); - float err2 = - Bruteforce_Ulp_Error_Double(test, correct2); - float err3 = - Bruteforce_Ulp_Error_Double(test, correct3); - fail = fail - && ((!(fabsf(err2) <= f->double_ulps)) - && (!(fabsf(err3) <= f->double_ulps))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - - // retry per section 6.5.3.4 - if (IsDoubleResultSubnormal(correct2, - f->double_ulps) - || IsDoubleResultSubnormal(correct3, - f->double_ulps)) - { - fail = fail && (test != 0.0f); - if (!fail) err = 0.0f; - } - - // try with second two args as zero - if (IsDoubleSubnormal(s3[j])) - { - correct2 = f->dfunc.f_fff(s[j], 0.0, 0.0); - correct3 = f->dfunc.f_fff(s[j], -0.0, 0.0); - long double correct4 = - f->dfunc.f_fff(s[j], 0.0, -0.0); - long double correct5 = - f->dfunc.f_fff(s[j], -0.0, -0.0); - err2 = - Bruteforce_Ulp_Error_Double(test, correct2); - err3 = - Bruteforce_Ulp_Error_Double(test, correct3); - float err4 = - Bruteforce_Ulp_Error_Double(test, correct4); - float err5 = - Bruteforce_Ulp_Error_Double(test, correct5); - fail = fail - && ((!(fabsf(err2) <= f->double_ulps)) - && (!(fabsf(err3) <= f->double_ulps)) - && (!(fabsf(err4) <= f->double_ulps)) - && (!(fabsf(err5) <= f->double_ulps))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - if (fabsf(err4) < fabsf(err)) err = err4; - if (fabsf(err5) < fabsf(err)) err = err5; - - // retry per section 6.5.3.4 - if (IsDoubleResultSubnormal(correct2, - f->double_ulps) - || IsDoubleResultSubnormal(correct3, - f->double_ulps) - || IsDoubleResultSubnormal(correct4, - f->double_ulps) - || IsDoubleResultSubnormal(correct5, - f->double_ulps)) - { - fail = fail && (test != 0.0f); 
- if (!fail) err = 0.0f; - } - } - } - else if (fail && IsDoubleSubnormal(s3[j])) - { - long double correct2 = - f->dfunc.f_fff(s[j], s2[j], 0.0); - long double correct3 = - f->dfunc.f_fff(s[j], s2[j], -0.0); - float err2 = - Bruteforce_Ulp_Error_Double(test, correct2); - float err3 = - Bruteforce_Ulp_Error_Double(test, correct3); - fail = fail - && ((!(fabsf(err2) <= f->double_ulps)) - && (!(fabsf(err3) <= f->double_ulps))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - - // retry per section 6.5.3.4 - if (IsDoubleResultSubnormal(correct2, - f->double_ulps) - || IsDoubleResultSubnormal(correct3, - f->double_ulps)) - { - fail = fail && (test != 0.0f); - if (!fail) err = 0.0f; - } - } - } - - if (fabsf(err) > maxError) - { - maxError = fabsf(err); - maxErrorVal = s[j]; - maxErrorVal2 = s2[j]; - maxErrorVal3 = s3[j]; - } - - if (fail) - { - vlog_error("\nERROR: %sD%s: %f ulp error at {%.13la, " - "%.13la, %.13la}: *%.13la vs. %.13la\n", - f->name, sizeNames[k], err, s[j], s2[j], - s3[j], ((double *)gOut_Ref)[j], test); - error = -1; - goto exit; - } - } - } - } - - if (0 == (i & 0x0fffffff)) - { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, - bufferSize); - } - else - { - vlog("."); - } - fflush(stdout); - } - } - - if (!gSkipCorrectnessTesting) - { - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - if (gMeasureTimes) - { - // Init input array - double *p = (double *)gIn; - double *p2 = (double *)gIn2; - double *p3 = (double *)gIn3; - for (j = 0; j < bufferSize / sizeof(double); j++) - { - p[j] = DoubleFromUInt32(genrand_int32(d)); - p2[j] = DoubleFromUInt32(genrand_int32(d)); - p3[j] = DoubleFromUInt32(genrand_int32(d)); - } - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - bufferSize, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); - return error; - } - if ((error = 
clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, - bufferSize, gIn2, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); - return error; - } - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, - bufferSize, gIn3, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); - return error; - } - - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - size_t vectorSize = sizeof(cl_double) * sizeValues[j]; - size_t localCount = (bufferSize + vectorSize - 1) - / vectorSize; // bufferSize / vectorSize rounded up - if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), - &gOutBuffer[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), - &gInBuffer))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2), - &gInBuffer2))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3), - &gInBuffer3))) - { - LogBuildError(programs[j]); - goto exit; - } - - double sum = 0.0; - double bestTime = INFINITY; - for (k = 0; k < PERF_LOOP_COUNT; k++) - { - uint64_t startTime = GetTime(); - if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, - &localCount, NULL, 0, NULL, - NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - - // Make sure OpenCL is done - if ((error = clFinish(gQueue))) - { - vlog_error("Error %d at clFinish\n", error); - goto exit; - } - - uint64_t endTime = GetTime(); - double time = SubtractTime(endTime, startTime); - sum += time; - if (time < bestTime) bestTime = time; - } - - if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double)gDeviceFrequency - * gComputeDevices * gSimdSize * 1e6 - / (bufferSize / sizeof(double)); - vlog_perf(clocksPerOp, 
LOWER_IS_BETTER, "clocks / element", "%sD%s", - f->name, sizeNames[j]); - } - for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); - } - - if (!gSkipCorrectnessTesting) - vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, - maxErrorVal3); - vlog("\n"); - -exit: - // Release - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); - } - - return error; -} diff --git a/test_conformance/math_brute_force/unary_double.cpp b/test_conformance/math_brute_force/unary_double.cpp new file mode 100644 index 00000000..b97b1943 --- /dev/null +++ b/test_conformance/math_brute_force/unary_double.cpp @@ -0,0 +1,662 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include <string.h> + +#if defined(__APPLE__) +#include <sys/time.h> +#endif + +static int BuildKernelDouble(const char *name, int vectorSize, + cl_uint kernel_count, cl_kernel *k, cl_program *p, + bool relaxedMode) +{ + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double", + sizeNames[vectorSize], + "* out, __global double", + sizeNames[vectorSize], + "* in )\n" + "{\n" + " size_t i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i] );\n" + "}\n" }; + + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double* out, __global double* in)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " double3 f0 = vload3( 0, in + 3 * i );\n" + " f0 = ", + name, + "( f0 );\n" + " vstore3( f0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)).
Assume power of two " + "buffer size \n" + " double3 f0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (double3)( in[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " f0 = ", + name, + "( f0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); + + return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, + relaxedMode); +} + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_uint kernel_count; + cl_kernel **kernels; + cl_program *programs; + const char *nameInCode; + bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. +} BuildKernelInfo; + +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernelDouble(info->nameInCode, i, info->kernel_count, + info->kernels[i], info->programs + i, + info->relaxedMode); +} + +// Thread specific data for a worker thread +typedef struct ThreadInfo +{ + cl_mem inBuf; // input buffer for the thread + cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + float maxError; // max error value. Init to 0. + double maxErrorValue; // position of the max error value. Init to 0. 
+ cl_command_queue tQueue; // per thread command queue to improve performance +} ThreadInfo; + +typedef struct TestInfo +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes + cl_kernel + *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each + // worker thread: k[vector_size][thread_id] + ThreadInfo * + tinfo; // An array of thread specific information for each worker thread + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + float ulps; // max_allowed ulps + int ftz; // non-zero if running in flush to zero mode + + int isRangeLimited; // 1 if the function is only to be evaluated over a + // range + float half_sin_cos_tan_limit; + bool relaxedMode; // True if test is running in relaxed mode, false + // otherwise. +} TestInfo; + +static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data) +{ + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_double); + cl_uint scale = job->scale; + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = job->tinfo + thread_id; + float ulps = job->ulps; + dptr func = job->f->dfunc; + cl_uint j, k; + cl_int error; + int ftz = job->ftz; + + Force64BitFPUPrecision(); + + // start the map of the output arrays + cl_event e[VECTOR_SIZE_COUNT]; + cl_ulong *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_ulong *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + + // Write the new values to the input array + cl_double *p = (cl_double *)gIn + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) + p[j] = DoubleFromUInt32(base + j * scale); + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint32_t pattern = 0xffffdead; + memset_pattern4(out[j], &pattern, buffer_size); + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueMapBuffer failed! 
err: %d\n", error); + return error; + } + + // run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) return CL_SUCCESS; + + // Calculate the correctly rounded reference result + cl_double *r = (cl_double *)gOut_Ref + thread_id * buffer_elements; + cl_double *s = (cl_double *)p; + for (j = 0; j < buffer_elements; j++) r[j] = (cl_double)func.f_f(s[j]); + + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_ulong *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + // Wait for the last buffer + out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, error); + return error; + } + + + // Verify data + cl_ulong *t = (cl_ulong *)r; + for (j = 0; j < buffer_elements; j++) + { + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_ulong *q = out[k]; + + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + cl_double test = ((cl_double *)q)[j]; + long double correct = func.f_f(s[j]); + float err = Bruteforce_Ulp_Error_Double(test, correct); + int fail = !(fabsf(err) <= ulps); + + if (fail) + { + if (ftz) + { + // retry per section 6.5.3.2 + if (IsDoubleResultSubnormal(correct, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsDoubleSubnormal(s[j])) + { + long double correct2 = func.f_f(0.0L); + long double correct3 = func.f_f(-0.0L); + float err2 = + Bruteforce_Ulp_Error_Double(test, correct2); + float err3 = + Bruteforce_Ulp_Error_Double(test, correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsDoubleResultSubnormal(correct2, ulps) + || IsDoubleResultSubnormal(correct3, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + } + if (fabsf(err) > tinfo->maxError) + { + tinfo->maxError = fabsf(err); + tinfo->maxErrorValue = s[j]; + } + if (fail) + { + vlog_error("\nERROR: %s%s: %f ulp error at %.13la " + "(0x%16.16llx): *%.13la vs. %.13la\n", + job->f->name, sizeNames[k], err, + s[j], ((cl_ulong *)s)[j], // this thread's slice of gIn + r[j], test); // this thread's slice of gOut_Ref + return -1; + } + } + } + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2!
err: %d\n", + j, error); + return error; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, // same order as the labels + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } + + return CL_SUCCESS; +} + +int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; +#if defined(__APPLE__) + struct timeval time_val; + gettimeofday(&time_val, NULL); + double start_time = time_val.tv_sec + 1e-6 * time_val.tv_usec; + double end_time; +#endif + + logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); + // Init test_info + memset(&test_info, 0, sizeof(test_info)); + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_double)); + if (gWimpyMode) + { + test_info.subBufferSize = gWimpyBufferSize + / (sizeof(cl_double) + * RoundUpToNextPowerOfTwo(test_info.threadCount)); + } + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = f->double_ulps; + test_info.ftz = f->ftz || gForceFTZ; + test_info.relaxedMode = relaxedMode; + + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel *)malloc(array_size); + if (NULL ==
test_info.k[i]) + { + vlog_error("Error: Unable to allocate storage for kernels!\n"); + error = CL_OUT_OF_HOST_MEMORY; + goto exit; + } + memset(test_info.k[i], 0, array_size); + } + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) + { + vlog_error( + "Error: Unable to allocate storage for thread specific data.\n"); + error = CL_OUT_OF_HOST_MEMORY; + goto exit; + } + memset(test_info.tinfo, 0, + test_info.threadCount * sizeof(*test_info.tinfo)); + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_double), + test_info.subBufferSize * sizeof(cl_double) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + &region, &error); // same region as the thread's inBuf + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed.
(%d)\n", error); + goto exit; + } + } + + // Init the kernels + { + BuildKernelInfo build_info = { + gMinVectorSizeIndex, test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode, relaxedMode + }; + if ((error = ThreadPool_Do(BuildKernel_DoubleFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + + // Run the kernels + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + } + } + + if (error) goto exit; + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + +#if defined(__APPLE__) + gettimeofday(&time_val, NULL); + end_time = time_val.tv_sec + 1e-6 * time_val.tv_usec; +#endif + + if (gMeasureTimes) + { + // Init input array + double *p = (double *)gIn; + + if (strstr(f->name, "exp")) + for (j = 0; j < BUFFER_SIZE / sizeof(double); j++) + p[j] = (double)genrand_real1(d); + else if (strstr(f->name, "log")) + for (j = 0; j < BUFFER_SIZE / sizeof(double); j++) + p[j] = fabs(DoubleFromUInt32(genrand_int32(d))); + else + for (j = 0; j < BUFFER_SIZE / sizeof(double); j++) + p[j] = DoubleFromUInt32(genrand_int32(d)); + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + BUFFER_SIZE, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + goto exit; // run the release code instead of leaking it all + } + + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeof(cl_double) * sizeValues[j]; + size_t localCount = (BUFFER_SIZE + vectorSize - 1) + / vectorSize; // BUFFER_SIZE / vectorSize rounded up + if ((error = clSetKernelArg(test_info.k[j][0], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + if ((error =
clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(test_info.programs[j]); + goto exit; + } + + double sum = 0.0; + double bestTime = INFINITY; + for (i = 0; i < PERF_LOOP_COUNT; i++) + { + uint64_t startTime = GetTime(); + if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], + 1, NULL, &localCount, NULL, + 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + + // Make sure OpenCL is done + if ((error = clFinish(gQueue))) + { + vlog_error("Error %d at clFinish\n", error); + goto exit; + } + + uint64_t endTime = GetTime(); + double time = SubtractTime(endTime, startTime); + sum += time; + if (time < bestTime) bestTime = time; + } + + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (BUFFER_SIZE / sizeof(double)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", + f->name, sizeNames[j]); + } + for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); + } + + if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal); + +#if defined(__APPLE__) + vlog("\t(%2.2f seconds)", end_time - start_time); +#endif + vlog("\n"); + +exit: + // Release + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + clReleaseProgram(test_info.programs[i]); + if (test_info.k[i]) + { + for (j = 0; j < test_info.threadCount; j++) + clReleaseKernel(test_info.k[i][j]); + + free(test_info.k[i]); + } + } + if (test_info.tinfo) + { + for (i = 0; i < test_info.threadCount; i++) + { + clReleaseMemObject(test_info.tinfo[i].inBuf); + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + clReleaseMemObject(test_info.tinfo[i].outBuf[j]); + clReleaseCommandQueue(test_info.tinfo[i].tQueue); + } + + free(test_info.tinfo); + } + + return error; +} diff --git a/test_conformance/math_brute_force/unary.cpp b/test_conformance/math_brute_force/unary_float.cpp 
similarity index 58% rename from test_conformance/math_brute_force/unary.cpp rename to test_conformance/math_brute_force/unary_float.cpp index dc6d56c1..4c1bd7ab 100644 --- a/test_conformance/math_brute_force/unary.cpp +++ b/test_conformance/math_brute_force/unary_float.cpp @@ -103,88 +103,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, relaxedMode); } -static int BuildKernelDouble(const char *name, int vectorSize, - cl_uint kernel_count, cl_kernel *k, cl_program *p, - bool relaxedMode) -{ - const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global double", - sizeNames[vectorSize], - "* out, __global double", - sizeNames[vectorSize], - "* in )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " out[i] = ", - name, - "( in[i] );\n" - "}\n" }; - - const char *c3[] = { - "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global double* out, __global double* in)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " double3 f0 = vload3( 0, in + 3 * i );\n" - " f0 = ", - name, - "( f0 );\n" - " vstore3( f0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " - "buffer size \n" - " double3 f0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (double3)( in[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " f0 = ", - name, - "( f0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, - relaxedMode); -} - typedef struct BuildKernelInfo { cl_uint offset; // the first vector size to build @@ -204,16 +122,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, info->kernels[i], info->programs + i, info->relaxedMode); } -static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernelDouble(info->nameInCode, i, info->kernel_count, - info->kernels[i], info->programs + i, - info->relaxedMode); -} - // Thread specific data for a worker thread typedef struct ThreadInfo { @@ -915,505 +823,3 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data) return CL_SUCCESS; } - - -static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data) -{ - const TestInfo *job = (const TestInfo *)data; - size_t buffer_elements = job->subBufferSize; - size_t buffer_size = buffer_elements * sizeof(cl_double); - cl_uint scale = job->scale; - cl_uint base = job_id * (cl_uint)job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; - float ulps = job->ulps; - dptr func = 
job->f->dfunc; - cl_uint j, k; - cl_int error; - int ftz = job->ftz; - - Force64BitFPUPrecision(); - - // start the map of the output arrays - cl_event e[VECTOR_SIZE_COUNT]; - cl_ulong *out[VECTOR_SIZE_COUNT]; - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - out[j] = (cl_ulong *)clEnqueueMapBuffer( - tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, - buffer_size, 0, NULL, e + j, &error); - if (error || NULL == out[j]) - { - vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, - error); - return error; - } - } - - // Get that moving - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); - - // Write the new values to the input array - cl_double *p = (cl_double *)gIn + thread_id * buffer_elements; - for (j = 0; j < buffer_elements; j++) - p[j] = DoubleFromUInt32(base + j * scale); - - if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, - buffer_size, p, 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); - return error; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - // Wait for the map to finish - if ((error = clWaitForEvents(1, e + j))) - { - vlog_error("Error: clWaitForEvents failed! err: %d\n", error); - return error; - } - if ((error = clReleaseEvent(e[j]))) - { - vlog_error("Error: clReleaseEvent failed! err: %d\n", error); - return error; - } - - // Fill the result buffer with garbage, so that old results don't carry - // over - uint32_t pattern = 0xffffdead; - memset_pattern4(out[j], &pattern, buffer_size); - if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], - out[j], 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueMapBuffer failed! 
err: %d\n", error); - return error; - } - - // run the kernel - size_t vectorCount = - (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; - cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its - // own copy of the cl_kernel - cl_program program = job->programs[j]; - - if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), - &tinfo->outBuf[j]))) - { - LogBuildError(program); - return error; - } - if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), - &tinfo->inBuf))) - { - LogBuildError(program); - return error; - } - - if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, - &vectorCount, NULL, 0, NULL, NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - return error; - } - } - - - // Get that moving - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); - - if (gSkipCorrectnessTesting) return CL_SUCCESS; - - // Calculate the correctly rounded reference result - cl_double *r = (cl_double *)gOut_Ref + thread_id * buffer_elements; - cl_double *s = (cl_double *)p; - for (j = 0; j < buffer_elements; j++) r[j] = (cl_double)func.f_f(s[j]); - - // Read the data back -- no need to wait for the first N-1 buffers. This is - // an in order queue. - for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) - { - out[j] = (cl_ulong *)clEnqueueMapBuffer( - tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, - buffer_size, 0, NULL, NULL, &error); - if (error || NULL == out[j]) - { - vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, - error); - return error; - } - } - // Wait for the last buffer - out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], - CL_TRUE, CL_MAP_READ, 0, - buffer_size, 0, NULL, NULL, &error); - if (error || NULL == out[j]) - { - vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, error); - return error; - } - - - // Verify data - cl_ulong *t = (cl_ulong *)r; - for (j = 0; j < buffer_elements; j++) - { - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - cl_ulong *q = out[k]; - - // If we aren't getting the correctly rounded result - if (t[j] != q[j]) - { - cl_double test = ((cl_double *)q)[j]; - long double correct = func.f_f(s[j]); - float err = Bruteforce_Ulp_Error_Double(test, correct); - int fail = !(fabsf(err) <= ulps); - - if (fail) - { - if (ftz) - { - // retry per section 6.5.3.2 - if (IsDoubleResultSubnormal(correct, ulps)) - { - fail = fail && (test != 0.0f); - if (!fail) err = 0.0f; - } - - // retry per section 6.5.3.3 - if (IsDoubleSubnormal(s[j])) - { - long double correct2 = func.f_f(0.0L); - long double correct3 = func.f_f(-0.0L); - float err2 = - Bruteforce_Ulp_Error_Double(test, correct2); - float err3 = - Bruteforce_Ulp_Error_Double(test, correct3); - fail = fail - && ((!(fabsf(err2) <= ulps)) - && (!(fabsf(err3) <= ulps))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - - // retry per section 6.5.3.4 - if (IsDoubleResultSubnormal(correct2, ulps) - || IsDoubleResultSubnormal(correct3, ulps)) - { - fail = fail && (test != 0.0f); - if (!fail) err = 0.0f; - } - } - } - } - if (fabsf(err) > tinfo->maxError) - { - tinfo->maxError = fabsf(err); - tinfo->maxErrorValue = s[j]; - } - if (fail) - { - vlog_error("\nERROR: %s%s: %f ulp error at %.13la " - "(0x%16.16llx): *%.13la vs. %.13la\n", - job->f->name, sizeNames[k], err, - ((cl_double *)gIn)[j], ((cl_ulong *)gIn)[j], - ((cl_double *)gOut_Ref)[j], test); - return -1; - } - } - } - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], - out[j], 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! 
err: %d\n", - j, error); - return error; - } - } - - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); - - - if (0 == (base & 0x0fffffff)) - { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10u scale:%10zd buf_elements:%10u ulps:%5.3f " - "ThreadCount:%2u\n", - base, job->step, buffer_elements, job->scale, job->ulps, - job->threadCount); - } - else - { - vlog("."); - } - fflush(stdout); - } - - return CL_SUCCESS; -} - -int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode) -{ - TestInfo test_info; - cl_int error; - size_t i, j; - float maxError = 0.0f; - double maxErrorVal = 0.0; -#if defined(__APPLE__) - struct timeval time_val; - gettimeofday(&time_val, NULL); - double start_time = time_val.tv_sec + 1e-6 * time_val.tv_usec; - double end_time; -#endif - - logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); - // Init test_info - memset(&test_info, 0, sizeof(test_info)); - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_double)); - if (gWimpyMode) - { - test_info.subBufferSize = gWimpyBufferSize - / (sizeof(cl_double) - * RoundUpToNextPowerOfTwo(test_info.threadCount)); - } - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ulps = f->double_ulps; - test_info.ftz = f->ftz || gForceFTZ; - test_info.relaxedMode = relaxedMode; - - // cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == 
test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: Unable to allocate storage for thread specific data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); - for (i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { - i * test_info.subBufferSize * sizeof(cl_double), - test_info.subBufferSize * sizeof(cl_double) - }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - &region, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer " - "for region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed.
(%d)\n", error); - goto exit; - } - } - - // Init the kernels - { - BuildKernelInfo build_info = { - gMinVectorSizeIndex, test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode, relaxedMode - }; - if ((error = ThreadPool_Do(BuildKernel_DoubleFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - goto exit; - } - - // Run the kernels - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info); - - // Accumulate the arithmetic errors - for (i = 0; i < test_info.threadCount; i++) - { - if (test_info.tinfo[i].maxError > maxError) - { - maxError = test_info.tinfo[i].maxError; - maxErrorVal = test_info.tinfo[i].maxErrorValue; - } - } - - if (error) goto exit; - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - -#if defined(__APPLE__) - gettimeofday(&time_val, NULL); - end_time = time_val.tv_sec + 1e-6 * time_val.tv_usec; -#endif - - if (gMeasureTimes) - { - // Init input array - double *p = (double *)gIn; - - if (strstr(f->name, "exp")) - for (j = 0; j < BUFFER_SIZE / sizeof(double); j++) - p[j] = (double)genrand_real1(d); - else if (strstr(f->name, "log")) - for (j = 0; j < BUFFER_SIZE / sizeof(double); j++) - p[j] = fabs(DoubleFromUInt32(genrand_int32(d))); - else - for (j = 0; j < BUFFER_SIZE / sizeof(double); j++) - p[j] = DoubleFromUInt32(genrand_int32(d)); - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - BUFFER_SIZE, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); - return error; - } - - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - size_t vectorSize = sizeof(cl_double) * sizeValues[j]; - size_t localCount = (BUFFER_SIZE + vectorSize - 1) - / vectorSize; // BUFFER_SIZE / vectorSize rounded up - if ((error = clSetKernelArg(test_info.k[j][0], 0, - sizeof(gOutBuffer[j]), &gOutBuffer[j]))) - { - LogBuildError(test_info.programs[j]); - goto exit; - } - if ((error = 
clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer), - &gInBuffer))) - { - LogBuildError(test_info.programs[j]); - goto exit; - } - - double sum = 0.0; - double bestTime = INFINITY; - for (i = 0; i < PERF_LOOP_COUNT; i++) - { - uint64_t startTime = GetTime(); - if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], - 1, NULL, &localCount, NULL, - 0, NULL, NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - - // Make sure OpenCL is done - if ((error = clFinish(gQueue))) - { - vlog_error("Error %d at clFinish\n", error); - goto exit; - } - - uint64_t endTime = GetTime(); - double time = SubtractTime(endTime, startTime); - sum += time; - if (time < bestTime) bestTime = time; - } - - if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double)gDeviceFrequency - * gComputeDevices * gSimdSize * 1e6 - / (BUFFER_SIZE / sizeof(double)); - vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", - f->name, sizeNames[j]); - } - for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); - } - - if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal); - -#if defined(__APPLE__) - vlog("\t(%2.2f seconds)", end_time - start_time); -#endif - vlog("\n"); - -exit: - // Release - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (i = 0; i < test_info.threadCount; i++) - { - clReleaseMemObject(test_info.tinfo[i].inBuf); - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } - - return error; -} diff --git a/test_conformance/math_brute_force/unary_two_results_double.cpp 
b/test_conformance/math_brute_force/unary_two_results_double.cpp new file mode 100644 index 00000000..779c653a --- /dev/null +++ b/test_conformance/math_brute_force/unary_two_results_double.cpp @@ -0,0 +1,523 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + +static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, + cl_program *p, bool relaxedMode) +{ + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double", + sizeNames[vectorSize], + "* out, __global double", + sizeNames[vectorSize], + "* out2, __global double", + sizeNames[vectorSize], + "* in )\n" + "{\n" + " size_t i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i], out2 + i );\n" + "}\n" }; + + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double* out, __global double* out2, __global double* in)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " double3 f0 = vload3( 0, in + 3 * i );\n" + " double3 iout = NAN;\n" + " f0 = ", + name, + "( f0, &iout );\n" + " vstore3( f0, 0, out + 3*i );\n" + " vstore3( iout, 0, out2 + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // 
Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two " + "buffer size \n" + " double3 iout = NAN;\n" + " double3 f0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (double3)( in[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " f0 = ", + name, + "( f0, &iout );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " out2[3*i+1] = iout.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " out2[3*i] = iout.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); + + return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); +} + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_kernel *kernels; + cl_program *programs; + const char *nameInCode; + bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. +} BuildKernelInfo; + +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernelDouble(info->nameInCode, i, info->kernels + i, + info->programs + i, info->relaxedMode); +} + +int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode) +{ + uint64_t i; + uint32_t j, k; + int error; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; + float maxError0 = 0.0f; + float maxError1 = 0.0f; + int ftz = f->ftz || gForceFTZ; + double maxErrorVal0 = 0.0f; + double maxErrorVal1 = 0.0f; + size_t bufferSize = (gWimpyMode) ? 
gWimpyBufferSize : BUFFER_SIZE; + uint64_t step = getTestStep(sizeof(cl_double), bufferSize); + int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(cl_double)) + 1); + + logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); + + Force64BitFPUPrecision(); + + // Init the kernels + { + BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, + f->nameInCode, relaxedMode }; + if ((error = ThreadPool_Do(BuildKernel_DoubleFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + } + + for (i = 0; i < (1ULL << 32); i += step) + { + // Init input array + double *p = (double *)gIn; + if (gWimpyMode) + { + for (j = 0; j < bufferSize / sizeof(cl_double); j++) + p[j] = DoubleFromUInt32((uint32_t)i + j * scale); + } + else + { + for (j = 0; j < bufferSize / sizeof(cl_double); j++) + p[j] = DoubleFromUInt32((uint32_t)i + j); + } + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + // write garbage into output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xffffdead; + memset_pattern4(gOut[j], &pattern, bufferSize); + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + goto exit; + } + + memset_pattern4(gOut2[j], &pattern, bufferSize); + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", + error, j); + goto exit; + } + } + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeValues[j] * sizeof(cl_double); + size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; + if ((error = 
clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), + &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); + + // Calculate the correctly rounded reference result + double *r = (double *)gOut_Ref; + double *r2 = (double *)gOut_Ref2; + double *s = (double *)gIn; + for (j = 0; j < bufferSize / sizeof(cl_double); j++) + { + long double dd; + r[j] = (double)f->dfunc.f_fpf(s[j], &dd); + r2[j] = (double)dd; + } + + // Read the data back + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + goto exit; + } + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) + { + vlog_error("ReadArray2 failed %d\n", error); + goto exit; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data + uint64_t *t = (uint64_t *)gOut_Ref; + uint64_t *t2 = (uint64_t *)gOut_Ref2; + for (j = 0; j < bufferSize / sizeof(double); j++) + { + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + uint64_t *q = (uint64_t *)(gOut[k]); + uint64_t *q2 = (uint64_t *)(gOut2[k]); + + // If we aren't getting the correctly rounded result + if (t[j] != q[j] || t2[j] != q2[j]) + { + double test = ((double *)q)[j]; + double test2 = ((double *)q2)[j]; + long double correct2; + long double correct = 
f->dfunc.f_fpf(s[j], &correct2); + float err = Bruteforce_Ulp_Error_Double(test, correct); + float err2 = Bruteforce_Ulp_Error_Double(test2, correct2); + int fail = !(fabsf(err) <= f->double_ulps + && fabsf(err2) <= f->double_ulps); + if (ftz) + { + // retry per section 6.5.3.2 + if (IsDoubleResultSubnormal(correct, f->double_ulps)) + { + if (IsDoubleResultSubnormal(correct2, + f->double_ulps)) + { + fail = fail && !(test == 0.0f && test2 == 0.0f); + if (!fail) + { + err = 0.0f; + err2 = 0.0f; + } + } + else + { + fail = fail + && !(test == 0.0f + && fabsf(err2) <= f->double_ulps); + if (!fail) err = 0.0f; + } + } + else if (IsDoubleResultSubnormal(correct2, + f->double_ulps)) + { + fail = fail + && !(test2 == 0.0f + && fabsf(err) <= f->double_ulps); + if (!fail) err2 = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsDoubleSubnormal(s[j])) + { + long double correct2p, correct2n; + long double correctp = + f->dfunc.f_fpf(0.0, &correct2p); + long double correctn = + f->dfunc.f_fpf(-0.0, &correct2n); + float errp = + Bruteforce_Ulp_Error_Double(test, correctp); + float err2p = + Bruteforce_Ulp_Error_Double(test, correct2p); + float errn = + Bruteforce_Ulp_Error_Double(test, correctn); + float err2n = + Bruteforce_Ulp_Error_Double(test, correct2n); + fail = fail + && ((!(fabsf(errp) <= f->double_ulps)) + && (!(fabsf(err2p) <= f->double_ulps)) + && ((!(fabsf(errn) <= f->double_ulps)) + && (!(fabsf(err2n) + <= f->double_ulps)))); + if (fabsf(errp) < fabsf(err)) err = errp; + if (fabsf(errn) < fabsf(err)) err = errn; + if (fabsf(err2p) < fabsf(err2)) err2 = err2p; + if (fabsf(err2n) < fabsf(err2)) err2 = err2n; + + // retry per section 6.5.3.4 + if (IsDoubleResultSubnormal(correctp, + f->double_ulps) + || IsDoubleResultSubnormal(correctn, + f->double_ulps)) + { + if (IsDoubleResultSubnormal(correct2p, + f->double_ulps) + || IsDoubleResultSubnormal(correct2n, + f->double_ulps)) + { + fail = fail + && !(test == 0.0f && test2 == 0.0f); + if (!fail) err = err2 = 0.0f; + } 
+ else + { + fail = fail + && !(test == 0.0f + && fabsf(err2) <= f->double_ulps); + if (!fail) err = 0.0f; + } + } + else if (IsDoubleResultSubnormal(correct2p, + f->double_ulps) + || IsDoubleResultSubnormal(correct2n, + f->double_ulps)) + { + fail = fail + && !(test2 == 0.0f + && (fabsf(err) <= f->double_ulps)); + if (!fail) err2 = 0.0f; + } + } + } + if (fabsf(err) > maxError0) + { + maxError0 = fabsf(err); + maxErrorVal0 = s[j]; + } + if (fabsf(err2) > maxError1) + { + maxError1 = fabsf(err2); + maxErrorVal1 = s[j]; + } + if (fail) + { + vlog_error( + "\nERROR: %sD%s: {%f, %f} ulp error at %.13la: " + "*{%.13la, %.13la} vs. {%.13la, %.13la}\n", + f->name, sizeNames[k], err, err2, + ((double *)gIn)[j], ((double *)gOut_Ref)[j], + ((double *)gOut_Ref2)[j], test, test2); + error = -1; + goto exit; + } + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, + bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + if (gMeasureTimes) + { + // Init input array + double *p = (double *)gIn; + for (j = 0; j < bufferSize / sizeof(double); j++) + p[j] = DoubleFromUInt32(genrand_int32(d)); + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeValues[j] * sizeof(cl_double); + size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), + &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if 
((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + + double sum = 0.0; + double bestTime = INFINITY; + for (k = 0; k < PERF_LOOP_COUNT; k++) + { + uint64_t startTime = GetTime(); + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + + // Make sure OpenCL is done + if ((error = clFinish(gQueue))) + { + vlog_error("Error %d at clFinish\n", error); + goto exit; + } + + uint64_t endTime = GetTime(); + double time = SubtractTime(endTime, startTime); + sum += time; + if (time < bestTime) bestTime = time; + } + + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(double)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", + f->name, sizeNames[j]); + } + for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); + } + + if (!gSkipCorrectnessTesting) + vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0, + maxErrorVal1); + vlog("\n"); + +exit: + // Release + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + clReleaseKernel(kernels[k]); + clReleaseProgram(programs[k]); + } + + return error; +} diff --git a/test_conformance/math_brute_force/unary_two_results.cpp b/test_conformance/math_brute_force/unary_two_results_float.cpp similarity index 56% rename from test_conformance/math_brute_force/unary_two_results.cpp rename to test_conformance/math_brute_force/unary_two_results_float.cpp index accebd3a..cda80b47 100644 --- a/test_conformance/math_brute_force/unary_two_results.cpp +++ b/test_conformance/math_brute_force/unary_two_results_float.cpp @@ -105,93 +105,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, 
relaxedMode); } -static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) -{ - const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global double", - sizeNames[vectorSize], - "* out, __global double", - sizeNames[vectorSize], - "* out2, __global double", - sizeNames[vectorSize], - "* in )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " out[i] = ", - name, - "( in[i], out2 + i );\n" - "}\n" }; - - const char *c3[] = { - "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global double* out, __global double* out2, __global double* in)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " double3 f0 = vload3( 0, in + 3 * i );\n" - " double3 iout = NAN;\n" - " f0 = ", - name, - "( f0, &iout );\n" - " vstore3( f0, 0, out + 3*i );\n" - " vstore3( iout, 0, out2 + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " - "buffer size \n" - " double3 iout = NAN;\n" - " double3 f0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (double3)( in[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " f0 = ", - name, - "( f0, &iout );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " out2[3*i+1] = iout.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " out2[3*i] = iout.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); -} - typedef struct BuildKernelInfo { cl_uint offset; // the first vector size to build @@ -210,15 +123,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, info->programs + i, info->relaxedMode); } -static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernelDouble(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); -} - int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode) { uint64_t i; @@ -752,400 +656,3 @@ exit: return error; } - -int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode) -{ - uint64_t i; - uint32_t j, k; - int error; - cl_program programs[VECTOR_SIZE_COUNT]; - cl_kernel kernels[VECTOR_SIZE_COUNT]; - float maxError0 = 0.0f; - float maxError1 = 0.0f; - int ftz = f->ftz || gForceFTZ; - double maxErrorVal0 = 0.0f; - double maxErrorVal1 = 0.0f; - size_t bufferSize = (gWimpyMode) ? 
gWimpyBufferSize : BUFFER_SIZE; - uint64_t step = getTestStep(sizeof(cl_double), bufferSize); - int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(cl_double)) + 1); - - logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); - - Force64BitFPUPrecision(); - - // Init the kernels - { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode, relaxedMode }; - if ((error = ThreadPool_Do(BuildKernel_DoubleFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - return error; - } - - for (i = 0; i < (1ULL << 32); i += step) - { - // Init input array - double *p = (double *)gIn; - if (gWimpyMode) - { - for (j = 0; j < bufferSize / sizeof(cl_double); j++) - p[j] = DoubleFromUInt32((uint32_t)i + j * scale); - } - else - { - for (j = 0; j < bufferSize / sizeof(cl_double); j++) - p[j] = DoubleFromUInt32((uint32_t)i + j); - } - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - bufferSize, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); - return error; - } - - // write garbage into output arrays - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - uint32_t pattern = 0xffffdead; - memset_pattern4(gOut[j], &pattern, bufferSize); - if ((error = - clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, - bufferSize, gOut[j], 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", - error, j); - goto exit; - } - - memset_pattern4(gOut2[j], &pattern, bufferSize); - if ((error = - clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, - bufferSize, gOut2[j], 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", - error, j); - goto exit; - } - } - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - size_t vectorSize = sizeValues[j] * sizeof(cl_double); - size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if ((error = 
clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), - &gOutBuffer[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), - &gOutBuffer2[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), - &gInBuffer))) - { - LogBuildError(programs[j]); - goto exit; - } - - if ((error = - clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, - &localCount, NULL, 0, NULL, NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - } - - // Get that moving - if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); - - // Calculate the correctly rounded reference result - double *r = (double *)gOut_Ref; - double *r2 = (double *)gOut_Ref2; - double *s = (double *)gIn; - for (j = 0; j < bufferSize / sizeof(cl_double); j++) - { - long double dd; - r[j] = (double)f->dfunc.f_fpf(s[j], &dd); - r2[j] = (double)dd; - } - - // Read the data back - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - if ((error = - clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, - bufferSize, gOut[j], 0, NULL, NULL))) - { - vlog_error("ReadArray failed %d\n", error); - goto exit; - } - if ((error = - clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, - bufferSize, gOut2[j], 0, NULL, NULL))) - { - vlog_error("ReadArray2 failed %d\n", error); - goto exit; - } - } - - if (gSkipCorrectnessTesting) break; - - // Verify data - uint64_t *t = (uint64_t *)gOut_Ref; - uint64_t *t2 = (uint64_t *)gOut_Ref2; - for (j = 0; j < bufferSize / sizeof(double); j++) - { - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - uint64_t *q = (uint64_t *)(gOut[k]); - uint64_t *q2 = (uint64_t *)(gOut2[k]); - - // If we aren't getting the correctly rounded result - if (t[j] != q[j] || t2[j] != q2[j]) - { - double test = ((double *)q)[j]; - double test2 = ((double *)q2)[j]; - long double correct2; - long double correct = 
f->dfunc.f_fpf(s[j], &correct2); - float err = Bruteforce_Ulp_Error_Double(test, correct); - float err2 = Bruteforce_Ulp_Error_Double(test2, correct2); - int fail = !(fabsf(err) <= f->double_ulps - && fabsf(err2) <= f->double_ulps); - if (ftz) - { - // retry per section 6.5.3.2 - if (IsDoubleResultSubnormal(correct, f->double_ulps)) - { - if (IsDoubleResultSubnormal(correct2, - f->double_ulps)) - { - fail = fail && !(test == 0.0f && test2 == 0.0f); - if (!fail) - { - err = 0.0f; - err2 = 0.0f; - } - } - else - { - fail = fail - && !(test == 0.0f - && fabsf(err2) <= f->double_ulps); - if (!fail) err = 0.0f; - } - } - else if (IsDoubleResultSubnormal(correct2, - f->double_ulps)) - { - fail = fail - && !(test2 == 0.0f - && fabsf(err) <= f->double_ulps); - if (!fail) err2 = 0.0f; - } - - // retry per section 6.5.3.3 - if (IsDoubleSubnormal(s[j])) - { - long double correct2p, correct2n; - long double correctp = - f->dfunc.f_fpf(0.0, &correct2p); - long double correctn = - f->dfunc.f_fpf(-0.0, &correct2n); - float errp = - Bruteforce_Ulp_Error_Double(test, correctp); - float err2p = - Bruteforce_Ulp_Error_Double(test, correct2p); - float errn = - Bruteforce_Ulp_Error_Double(test, correctn); - float err2n = - Bruteforce_Ulp_Error_Double(test, correct2n); - fail = fail - && ((!(fabsf(errp) <= f->double_ulps)) - && (!(fabsf(err2p) <= f->double_ulps)) - && ((!(fabsf(errn) <= f->double_ulps)) - && (!(fabsf(err2n) - <= f->double_ulps)))); - if (fabsf(errp) < fabsf(err)) err = errp; - if (fabsf(errn) < fabsf(err)) err = errn; - if (fabsf(err2p) < fabsf(err2)) err2 = err2p; - if (fabsf(err2n) < fabsf(err2)) err2 = err2n; - - // retry per section 6.5.3.4 - if (IsDoubleResultSubnormal(correctp, - f->double_ulps) - || IsDoubleResultSubnormal(correctn, - f->double_ulps)) - { - if (IsDoubleResultSubnormal(correct2p, - f->double_ulps) - || IsDoubleResultSubnormal(correct2n, - f->double_ulps)) - { - fail = fail - && !(test == 0.0f && test2 == 0.0f); - if (!fail) err = err2 = 0.0f; - } 
- else - { - fail = fail - && !(test == 0.0f - && fabsf(err2) <= f->double_ulps); - if (!fail) err = 0.0f; - } - } - else if (IsDoubleResultSubnormal(correct2p, - f->double_ulps) - || IsDoubleResultSubnormal(correct2n, - f->double_ulps)) - { - fail = fail - && !(test2 == 0.0f - && (fabsf(err) <= f->double_ulps)); - if (!fail) err2 = 0.0f; - } - } - } - if (fabsf(err) > maxError0) - { - maxError0 = fabsf(err); - maxErrorVal0 = s[j]; - } - if (fabsf(err2) > maxError1) - { - maxError1 = fabsf(err2); - maxErrorVal1 = s[j]; - } - if (fail) - { - vlog_error( - "\nERROR: %sD%s: {%f, %f} ulp error at %.13la: " - "*{%.13la, %.13la} vs. {%.13la, %.13la}\n", - f->name, sizeNames[k], err, err2, - ((double *)gIn)[j], ((double *)gOut_Ref)[j], - ((double *)gOut_Ref2)[j], test, test2); - error = -1; - goto exit; - } - } - } - } - - if (0 == (i & 0x0fffffff)) - { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, - bufferSize); - } - else - { - vlog("."); - } - fflush(stdout); - } - } - - if (!gSkipCorrectnessTesting) - { - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - if (gMeasureTimes) - { - // Init input array - double *p = (double *)gIn; - for (j = 0; j < bufferSize / sizeof(double); j++) - p[j] = DoubleFromUInt32(genrand_int32(d)); - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - bufferSize, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); - return error; - } - - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - size_t vectorSize = sizeValues[j] * sizeof(cl_double); - size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), - &gOutBuffer[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), - &gOutBuffer2[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if 
((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), - &gInBuffer))) - { - LogBuildError(programs[j]); - goto exit; - } - - double sum = 0.0; - double bestTime = INFINITY; - for (k = 0; k < PERF_LOOP_COUNT; k++) - { - uint64_t startTime = GetTime(); - if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, - &localCount, NULL, 0, NULL, - NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - - // Make sure OpenCL is done - if ((error = clFinish(gQueue))) - { - vlog_error("Error %d at clFinish\n", error); - goto exit; - } - - uint64_t endTime = GetTime(); - double time = SubtractTime(endTime, startTime); - sum += time; - if (time < bestTime) bestTime = time; - } - - if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double)gDeviceFrequency - * gComputeDevices * gSimdSize * 1e6 - / (bufferSize / sizeof(double)); - vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", - f->name, sizeNames[j]); - } - for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); - } - - if (!gSkipCorrectnessTesting) - vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0, - maxErrorVal1); - vlog("\n"); - -exit: - // Release - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); - } - - return error; -} diff --git a/test_conformance/math_brute_force/unary_two_results_i.cpp b/test_conformance/math_brute_force/unary_two_results_i_double.cpp similarity index 52% rename from test_conformance/math_brute_force/unary_two_results_i.cpp rename to test_conformance/math_brute_force/unary_two_results_i_double.cpp index 2ac083d2..3fd616a4 100644 --- a/test_conformance/math_brute_force/unary_two_results_i.cpp +++ b/test_conformance/math_brute_force/unary_two_results_i_double.cpp @@ -21,91 +21,6 @@ #include #include -static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool 
relaxedMode) -{ - const char *c[] = { "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global float", - sizeNames[vectorSize], - "* out, __global int", - sizeNames[vectorSize], - "* out2, __global float", - sizeNames[vectorSize], - "* in )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " out[i] = ", - name, - "( in[i], out2 + i );\n" - "}\n" }; - - const char *c3[] = { - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global float* out, __global int* out2, __global float* in)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " float3 f0 = vload3( 0, in + 3 * i );\n" - " int3 iout = INT_MIN;\n" - " f0 = ", - name, - "( f0, &iout );\n" - " vstore3( f0, 0, out + 3*i );\n" - " vstore3( iout, 0, out2 + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two " - "buffer size \n" - " int3 iout = INT_MIN;\n" - " float3 f0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (float3)( in[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " f0 = ", - name, - "( f0, &iout );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " out2[3*i+1] = iout.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " out2[3*i] = iout.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); -} - static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, cl_program *p, bool relaxedMode) { @@ -202,15 +117,6 @@ 
typedef struct BuildKernelInfo bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. } BuildKernelInfo; -static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); -} - static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { @@ -226,367 +132,6 @@ static cl_ulong abs_cl_long(cl_long i) return (i ^ mask) - mask; } -int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode) -{ - uint64_t i; - uint32_t j, k; - int error; - cl_program programs[VECTOR_SIZE_COUNT]; - cl_kernel kernels[VECTOR_SIZE_COUNT]; - float maxError = 0.0f; - int64_t maxError2 = 0; - int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); - float maxErrorVal = 0.0f; - float maxErrorVal2 = 0.0f; - size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE; - uint64_t step = getTestStep(sizeof(float), bufferSize); - int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1); - cl_ulong maxiError; - - logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); - - float float_ulps; - if (gIsEmbedded) - float_ulps = f->float_embedded_ulps; - else - float_ulps = f->float_ulps; - - maxiError = float_ulps == INFINITY ? 
CL_ULONG_MAX : 0; - - // Init the kernels - { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode, relaxedMode }; - if ((error = ThreadPool_Do(BuildKernel_FloatFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - return error; - } - - for (i = 0; i < (1ULL << 32); i += step) - { - // Init input array - uint32_t *p = (uint32_t *)gIn; - if (gWimpyMode) - { - for (j = 0; j < bufferSize / sizeof(float); j++) - p[j] = (uint32_t)i + j * scale; - } - else - { - for (j = 0; j < bufferSize / sizeof(float); j++) - p[j] = (uint32_t)i + j; - } - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - bufferSize, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); - return error; - } - - // write garbage into output arrays - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - uint32_t pattern = 0xffffdead; - memset_pattern4(gOut[j], &pattern, bufferSize); - if ((error = - clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, - bufferSize, gOut[j], 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", - error, j); - goto exit; - } - - memset_pattern4(gOut2[j], &pattern, bufferSize); - if ((error = - clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, - bufferSize, gOut2[j], 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", - error, j); - goto exit; - } - } - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - size_t vectorSize = sizeValues[j] * sizeof(cl_float); - size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), - &gOutBuffer[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), - &gOutBuffer2[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 2, 
sizeof(gInBuffer), - &gInBuffer))) - { - LogBuildError(programs[j]); - goto exit; - } - - if ((error = - clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, - &localCount, NULL, 0, NULL, NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - } - - // Get that moving - if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); - - // Calculate the correctly rounded reference result - float *r = (float *)gOut_Ref; - int *r2 = (int *)gOut_Ref2; - float *s = (float *)gIn; - for (j = 0; j < bufferSize / sizeof(float); j++) - r[j] = (float)f->func.f_fpI(s[j], r2 + j); - - // Read the data back - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - if ((error = - clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, - bufferSize, gOut[j], 0, NULL, NULL))) - { - vlog_error("ReadArray failed %d\n", error); - goto exit; - } - if ((error = - clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, - bufferSize, gOut2[j], 0, NULL, NULL))) - { - vlog_error("ReadArray2 failed %d\n", error); - goto exit; - } - } - - if (gSkipCorrectnessTesting) break; - - // Verify data - uint32_t *t = (uint32_t *)gOut_Ref; - int32_t *t2 = (int32_t *)gOut_Ref2; - for (j = 0; j < bufferSize / sizeof(float); j++) - { - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - uint32_t *q = (uint32_t *)(gOut[k]); - int32_t *q2 = (int32_t *)(gOut2[k]); - - // If we aren't getting the correctly rounded result - if (t[j] != q[j] || t2[j] != q2[j]) - { - float test = ((float *)q)[j]; - int correct2 = INT_MIN; - double correct = f->func.f_fpI(s[j], &correct2); - float err = Ulp_Error(test, correct); - cl_long iErr = (int64_t)q2[j] - (int64_t)correct2; - int fail = !(fabsf(err) <= float_ulps - && abs_cl_long(iErr) <= maxiError); - if (ftz) - { - // retry per section 6.5.3.2 - if (IsFloatResultSubnormal(correct, float_ulps)) - { - fail = fail && !(test == 0.0f && iErr == 0); - if (!fail) err = 0.0f; - } - - // retry per section 6.5.3.3 - if 
(IsFloatSubnormal(s[j])) - { - int correct5, correct6; - double correct3 = f->func.f_fpI(0.0, &correct5); - double correct4 = f->func.f_fpI(-0.0, &correct6); - float err2 = Ulp_Error(test, correct3); - float err3 = Ulp_Error(test, correct4); - cl_long iErr2 = - (long long)q2[j] - (long long)correct5; - cl_long iErr3 = - (long long)q2[j] - (long long)correct6; - - // Did +0 work? - if (fabsf(err2) <= float_ulps - && abs_cl_long(iErr2) <= maxiError) - { - err = err2; - iErr = iErr2; - fail = 0; - } - // Did -0 work? - else if (fabsf(err3) <= float_ulps - && abs_cl_long(iErr3) <= maxiError) - { - err = err3; - iErr = iErr3; - fail = 0; - } - - // retry per section 6.5.3.4 - if (fail - && (IsFloatResultSubnormal(correct2, float_ulps) - || IsFloatResultSubnormal(correct3, - float_ulps))) - { - fail = fail - && !(test == 0.0f - && (abs_cl_long(iErr2) <= maxiError - || abs_cl_long(iErr3) - <= maxiError)); - if (!fail) - { - err = 0.0f; - iErr = 0; - } - } - } - } - if (fabsf(err) > maxError) - { - maxError = fabsf(err); - maxErrorVal = s[j]; - } - if (llabs(iErr) > maxError2) - { - maxError2 = llabs(iErr); - maxErrorVal2 = s[j]; - } - - if (fail) - { - vlog_error("\nERROR: %s%s: {%f, %d} ulp error at %a: " - "*{%a, %d} vs. 
{%a, %d}\n", - f->name, sizeNames[k], err, (int)iErr, - ((float *)gIn)[j], ((float *)gOut_Ref)[j], - ((int *)gOut_Ref2)[j], test, q2[j]); - error = -1; - goto exit; - } - } - } - } - - if (0 == (i & 0x0fffffff)) - { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, - bufferSize); - } - else - { - vlog("."); - } - fflush(stdout); - } - } - - if (!gSkipCorrectnessTesting) - { - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - if (gMeasureTimes) - { - // Init input array - uint32_t *p = (uint32_t *)gIn; - for (j = 0; j < bufferSize / sizeof(float); j++) - p[j] = genrand_int32(d); - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - bufferSize, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); - return error; - } - - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - size_t vectorSize = sizeValues[j] * sizeof(cl_float); - size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), - &gOutBuffer[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), - &gOutBuffer2[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), - &gInBuffer))) - { - LogBuildError(programs[j]); - goto exit; - } - - double sum = 0.0; - double bestTime = INFINITY; - for (k = 0; k < PERF_LOOP_COUNT; k++) - { - uint64_t startTime = GetTime(); - if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, - &localCount, NULL, 0, NULL, - NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - - // Make sure OpenCL is done - if ((error = clFinish(gQueue))) - { - vlog_error("Error %d at clFinish\n", error); - goto exit; - } - - uint64_t endTime = GetTime(); - double time = SubtractTime(endTime, startTime); - 
sum += time; - if (time < bestTime) bestTime = time; - } - - if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double)gDeviceFrequency - * gComputeDevices * gSimdSize * 1e6 - / (bufferSize / sizeof(float)); - vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", - f->name, sizeNames[j]); - } - } - - if (!gSkipCorrectnessTesting) - vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal); - vlog("\n"); - -exit: - // Release - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); - } - - return error; -} - int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode) { uint64_t i; diff --git a/test_conformance/math_brute_force/unary_two_results_i_float.cpp b/test_conformance/math_brute_force/unary_two_results_i_float.cpp new file mode 100644 index 00000000..82bbb81b --- /dev/null +++ b/test_conformance/math_brute_force/unary_two_results_i_float.cpp @@ -0,0 +1,492 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include +#include + +static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, + cl_program *p, bool relaxedMode) +{ + const char *c[] = { "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float", + sizeNames[vectorSize], + "* out, __global int", + sizeNames[vectorSize], + "* out2, __global float", + sizeNames[vectorSize], + "* in )\n" + "{\n" + " size_t i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i], out2 + i );\n" + "}\n" }; + + const char *c3[] = { + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global float* out, __global int* out2, __global float* in)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " float3 f0 = vload3( 0, in + 3 * i );\n" + " int3 iout = INT_MIN;\n" + " f0 = ", + name, + "( f0, &iout );\n" + " vstore3( f0, 0, out + 3*i );\n" + " vstore3( iout, 0, out2 + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " int3 iout = INT_MIN;\n" + " float3 f0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (float3)( in[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " f0 = ", + name, + "( f0, &iout );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " out2[3*i+1] = iout.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " out2[3*i] = iout.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); + + return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); +} + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_kernel *kernels; + cl_program *programs; + const char *nameInCode; + bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. +} BuildKernelInfo; + +static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernel(info->nameInCode, i, info->kernels + i, + info->programs + i, info->relaxedMode); +} + +static cl_ulong abs_cl_long(cl_long i) +{ + cl_long mask = i >> 63; + return (i ^ mask) - mask; +} + +int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode) +{ + uint64_t i; + uint32_t j, k; + int error; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; + float maxError = 0.0f; + int64_t maxError2 = 0; + int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); + float maxErrorVal = 0.0f; + float maxErrorVal2 = 0.0f; + size_t bufferSize = (gWimpyMode) ? 
gWimpyBufferSize : BUFFER_SIZE; + uint64_t step = getTestStep(sizeof(float), bufferSize); + int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1); + cl_ulong maxiError; + + logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); + + float float_ulps; + if (gIsEmbedded) + float_ulps = f->float_embedded_ulps; + else + float_ulps = f->float_ulps; + + maxiError = float_ulps == INFINITY ? CL_ULONG_MAX : 0; + + // Init the kernels + { + BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, + f->nameInCode, relaxedMode }; + if ((error = ThreadPool_Do(BuildKernel_FloatFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + } + + for (i = 0; i < (1ULL << 32); i += step) + { + // Init input array + uint32_t *p = (uint32_t *)gIn; + if (gWimpyMode) + { + for (j = 0; j < bufferSize / sizeof(float); j++) + p[j] = (uint32_t)i + j * scale; + } + else + { + for (j = 0; j < bufferSize / sizeof(float); j++) + p[j] = (uint32_t)i + j; + } + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + // write garbage into output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xffffdead; + memset_pattern4(gOut[j], &pattern, bufferSize); + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + goto exit; + } + + memset_pattern4(gOut2[j], &pattern, bufferSize); + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", + error, j); + goto exit; + } + } + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeValues[j] * 
sizeof(cl_float); + size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), + &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); + + // Calculate the correctly rounded reference result + float *r = (float *)gOut_Ref; + int *r2 = (int *)gOut_Ref2; + float *s = (float *)gIn; + for (j = 0; j < bufferSize / sizeof(float); j++) + r[j] = (float)f->func.f_fpI(s[j], r2 + j); + + // Read the data back + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + goto exit; + } + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) + { + vlog_error("ReadArray2 failed %d\n", error); + goto exit; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data + uint32_t *t = (uint32_t *)gOut_Ref; + int32_t *t2 = (int32_t *)gOut_Ref2; + for (j = 0; j < bufferSize / sizeof(float); j++) + { + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + uint32_t *q = (uint32_t *)(gOut[k]); + int32_t *q2 = (int32_t *)(gOut2[k]); + + // If we aren't getting the correctly rounded result + if (t[j] != q[j] || t2[j] != q2[j]) + { + float test = ((float *)q)[j]; + int correct2 = INT_MIN; + double correct = f->func.f_fpI(s[j], 
&correct2); + float err = Ulp_Error(test, correct); + cl_long iErr = (int64_t)q2[j] - (int64_t)correct2; + int fail = !(fabsf(err) <= float_ulps + && abs_cl_long(iErr) <= maxiError); + if (ftz) + { + // retry per section 6.5.3.2 + if (IsFloatResultSubnormal(correct, float_ulps)) + { + fail = fail && !(test == 0.0f && iErr == 0); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsFloatSubnormal(s[j])) + { + int correct5, correct6; + double correct3 = f->func.f_fpI(0.0, &correct5); + double correct4 = f->func.f_fpI(-0.0, &correct6); + float err2 = Ulp_Error(test, correct3); + float err3 = Ulp_Error(test, correct4); + cl_long iErr2 = + (long long)q2[j] - (long long)correct5; + cl_long iErr3 = + (long long)q2[j] - (long long)correct6; + + // Did +0 work? + if (fabsf(err2) <= float_ulps + && abs_cl_long(iErr2) <= maxiError) + { + err = err2; + iErr = iErr2; + fail = 0; + } + // Did -0 work? + else if (fabsf(err3) <= float_ulps + && abs_cl_long(iErr3) <= maxiError) + { + err = err3; + iErr = iErr3; + fail = 0; + } + + // retry per section 6.5.3.4 + if (fail + && (IsFloatResultSubnormal(correct2, float_ulps) + || IsFloatResultSubnormal(correct3, + float_ulps))) + { + fail = fail + && !(test == 0.0f + && (abs_cl_long(iErr2) <= maxiError + || abs_cl_long(iErr3) + <= maxiError)); + if (!fail) + { + err = 0.0f; + iErr = 0; + } + } + } + } + if (fabsf(err) > maxError) + { + maxError = fabsf(err); + maxErrorVal = s[j]; + } + if (llabs(iErr) > maxError2) + { + maxError2 = llabs(iErr); + maxErrorVal2 = s[j]; + } + + if (fail) + { + vlog_error("\nERROR: %s%s: {%f, %d} ulp error at %a: " + "*{%a, %d} vs. 
{%a, %d}\n", + f->name, sizeNames[k], err, (int)iErr, + ((float *)gIn)[j], ((float *)gOut_Ref)[j], + ((int *)gOut_Ref2)[j], test, q2[j]); + error = -1; + goto exit; + } + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, + bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + if (gMeasureTimes) + { + // Init input array + uint32_t *p = (uint32_t *)gIn; + for (j = 0; j < bufferSize / sizeof(float); j++) + p[j] = genrand_int32(d); + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeValues[j] * sizeof(cl_float); + size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]), + &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + + double sum = 0.0; + double bestTime = INFINITY; + for (k = 0; k < PERF_LOOP_COUNT; k++) + { + uint64_t startTime = GetTime(); + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + + // Make sure OpenCL is done + if ((error = clFinish(gQueue))) + { + vlog_error("Error %d at clFinish\n", error); + goto exit; + } + + uint64_t endTime = GetTime(); + double time = SubtractTime(endTime, startTime); + 
sum += time; + if (time < bestTime) bestTime = time; + } + + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(float)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", + f->name, sizeNames[j]); + } + } + + if (!gSkipCorrectnessTesting) + vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal); + vlog("\n"); + +exit: + // Release + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + clReleaseKernel(kernels[k]); + clReleaseProgram(programs[k]); + } + + return error; +} diff --git a/test_conformance/math_brute_force/unary_u_double.cpp b/test_conformance/math_brute_force/unary_u_double.cpp new file mode 100644 index 00000000..d3b92186 --- /dev/null +++ b/test_conformance/math_brute_force/unary_u_double.cpp @@ -0,0 +1,385 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + +static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, + cl_program *p, bool relaxedMode) +{ + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double", + sizeNames[vectorSize], + "* out, __global ulong", + sizeNames[vectorSize], + "* in )\n" + "{\n" + " size_t i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i] );\n" + "}\n" }; + + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global double* out, __global ulong* in )\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " ulong3 u0 = vload3( 0, in + 3 * i );\n" + " double3 f0 = ", + name, + "( u0 );\n" + " vstore3( f0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " ulong3 u0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " u0 = (ulong3)( in[3*i], 0xdeaddeaddeaddeadUL, " + "0xdeaddeaddeaddeadUL ); \n" + " break;\n" + " case 0:\n" + " u0 = (ulong3)( in[3*i], in[3*i+1], " + "0xdeaddeaddeaddeadUL ); \n" + " break;\n" + " }\n" + " double3 f0 = ", + name, + "( u0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); + + return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); +} + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_kernel *kernels; + cl_program *programs; + const char *nameInCode; + bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. +} BuildKernelInfo; + +static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernelDouble(info->nameInCode, i, info->kernels + i, + info->programs + i, info->relaxedMode); +} + +static cl_ulong random64(MTdata d) +{ + return (cl_ulong)genrand_int32(d) | ((cl_ulong)genrand_int32(d) << 32); +} + +int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode) +{ + uint64_t i; + uint32_t j, k; + int error; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; + float maxError = 0.0f; + int ftz = f->ftz || gForceFTZ; + double maxErrorVal = 0.0f; + size_t bufferSize = (gWimpyMode) ? 
gWimpyBufferSize : BUFFER_SIZE; + uint64_t step = getTestStep(sizeof(cl_double), bufferSize); + + logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); + + Force64BitFPUPrecision(); + + // Init the kernels + { + BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, + f->nameInCode, relaxedMode }; + if ((error = ThreadPool_Do(BuildKernel_DoubleFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + } + + for (i = 0; i < (1ULL << 32); i += step) + { + // Init input array + cl_ulong *p = (cl_ulong *)gIn; + for (j = 0; j < bufferSize / sizeof(cl_ulong); j++) p[j] = random64(d); + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + // write garbage into output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xffffdead; + memset_pattern4(gOut[j], &pattern, bufferSize); + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + goto exit; + } + } + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeValues[j] * sizeof(cl_double); + size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); + + // 
Calculate the correctly rounded reference result + double *r = (double *)gOut_Ref; + cl_ulong *s = (cl_ulong *)gIn; + for (j = 0; j < bufferSize / sizeof(cl_double); j++) + r[j] = (double)f->dfunc.f_u(s[j]); + + // Read the data back + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + goto exit; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data + uint64_t *t = (uint64_t *)gOut_Ref; + for (j = 0; j < bufferSize / sizeof(cl_double); j++) + { + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + uint64_t *q = (uint64_t *)(gOut[k]); + + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + double test = ((double *)q)[j]; + long double correct = f->dfunc.f_u(s[j]); + float err = Bruteforce_Ulp_Error_Double(test, correct); + int fail = !(fabsf(err) <= f->double_ulps); + + // half_sin/cos/tan are only valid between +-2**16, Inf, NaN + if (fail) + { + if (ftz) + { + // retry per section 6.5.3.2 + if (IsDoubleResultSubnormal(correct, + f->double_ulps)) + { + fail = fail && (test != 0.0); + if (!fail) err = 0.0f; + } + } + } + if (fabsf(err) > maxError) + { + maxError = fabsf(err); + maxErrorVal = s[j]; + } + if (fail) + { + vlog_error("\n%s%sD: %f ulp error at 0x%16.16llx: " + "*%.13la vs. 
%.13la\n", + f->name, sizeNames[k], err, + ((uint64_t *)gIn)[j], + ((double *)gOut_Ref)[j], test); + error = -1; + goto exit; + } + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, + bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + if (gMeasureTimes) + { + // Init input array + double *p = (double *)gIn; + + for (j = 0; j < bufferSize / sizeof(double); j++) p[j] = random64(d); + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeValues[j] * sizeof(cl_double); + size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + + double sum = 0.0; + double bestTime = INFINITY; + for (k = 0; k < PERF_LOOP_COUNT; k++) + { + uint64_t startTime = GetTime(); + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, + NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + + // Make sure OpenCL is done + if ((error = clFinish(gQueue))) + { + vlog_error("Error %d at clFinish\n", error); + goto exit; + } + + uint64_t endTime = GetTime(); + double time = SubtractTime(endTime, startTime); + sum += time; + if (time < bestTime) bestTime = time; + } + + if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; + double clocksPerOp = bestTime * (double)gDeviceFrequency + * 
gComputeDevices * gSimdSize * 1e6 + / (bufferSize / sizeof(double)); + vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", + f->name, sizeNames[j]); + } + for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); + } + + if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal); + vlog("\n"); + +exit: + // Release + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + clReleaseKernel(kernels[k]); + clReleaseProgram(programs[k]); + } + + return error; +} diff --git a/test_conformance/math_brute_force/unary_u.cpp b/test_conformance/math_brute_force/unary_u_float.cpp similarity index 54% rename from test_conformance/math_brute_force/unary_u.cpp rename to test_conformance/math_brute_force/unary_u_float.cpp index 3b8f1f69..74b3b760 100644 --- a/test_conformance/math_brute_force/unary_u.cpp +++ b/test_conformance/math_brute_force/unary_u_float.cpp @@ -99,88 +99,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } -static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) -{ - const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global double", - sizeNames[vectorSize], - "* out, __global ulong", - sizeNames[vectorSize], - "* in )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " out[i] = ", - name, - "( in[i] );\n" - "}\n" }; - - const char *c3[] = { - "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global double* out, __global ulong* in )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " ulong3 u0 = vload3( 0, in + 3 * i );\n" - " double3 f0 = ", - name, - "( u0 );\n" - " vstore3( f0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many 
elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two " - "buffer size \n" - " ulong3 u0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " u0 = (ulong3)( in[3*i], 0xdeaddeaddeaddeadUL, " - "0xdeaddeaddeaddeadUL ); \n" - " break;\n" - " case 0:\n" - " u0 = (ulong3)( in[3*i], in[3*i+1], " - "0xdeaddeaddeaddeadUL ); \n" - " break;\n" - " }\n" - " double3 f0 = ", - name, - "( u0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); -} - typedef struct BuildKernelInfo { cl_uint offset; // the first vector size to build @@ -199,15 +117,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED, info->programs + i, info->relaxedMode); } -static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernelDouble(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); -} - int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode) { uint64_t i; @@ -514,267 +423,3 @@ exit: return error; } - -static cl_ulong random64(MTdata d) -{ - return (cl_ulong)genrand_int32(d) | ((cl_ulong)genrand_int32(d) << 32); -} - -int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode) -{ - uint64_t i; - uint32_t j, k; - int error; - cl_program programs[VECTOR_SIZE_COUNT]; - cl_kernel kernels[VECTOR_SIZE_COUNT]; - float maxError = 0.0f; - int ftz = f->ftz || gForceFTZ; - double 
maxErrorVal = 0.0f; - size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE; - uint64_t step = getTestStep(sizeof(cl_double), bufferSize); - - logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); - - Force64BitFPUPrecision(); - - // Init the kernels - { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode, relaxedMode }; - if ((error = ThreadPool_Do(BuildKernel_DoubleFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - return error; - } - - for (i = 0; i < (1ULL << 32); i += step) - { - // Init input array - cl_ulong *p = (cl_ulong *)gIn; - for (j = 0; j < bufferSize / sizeof(cl_ulong); j++) p[j] = random64(d); - - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - bufferSize, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); - return error; - } - - // write garbage into output arrays - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - uint32_t pattern = 0xffffdead; - memset_pattern4(gOut[j], &pattern, bufferSize); - if ((error = - clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, - bufferSize, gOut[j], 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", - error, j); - goto exit; - } - } - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - size_t vectorSize = sizeValues[j] * sizeof(cl_double); - size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), - &gOutBuffer[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), - &gInBuffer))) - { - LogBuildError(programs[j]); - goto exit; - } - - if ((error = - clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, - &localCount, NULL, 0, NULL, NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - } - - // Get that moving - if ((error 
= clFlush(gQueue))) vlog("clFlush failed\n"); - - // Calculate the correctly rounded reference result - double *r = (double *)gOut_Ref; - cl_ulong *s = (cl_ulong *)gIn; - for (j = 0; j < bufferSize / sizeof(cl_double); j++) - r[j] = (double)f->dfunc.f_u(s[j]); - - // Read the data back - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - if ((error = - clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, - bufferSize, gOut[j], 0, NULL, NULL))) - { - vlog_error("ReadArray failed %d\n", error); - goto exit; - } - } - - if (gSkipCorrectnessTesting) break; - - // Verify data - uint64_t *t = (uint64_t *)gOut_Ref; - for (j = 0; j < bufferSize / sizeof(cl_double); j++) - { - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - uint64_t *q = (uint64_t *)(gOut[k]); - - // If we aren't getting the correctly rounded result - if (t[j] != q[j]) - { - double test = ((double *)q)[j]; - long double correct = f->dfunc.f_u(s[j]); - float err = Bruteforce_Ulp_Error_Double(test, correct); - int fail = !(fabsf(err) <= f->double_ulps); - - // half_sin/cos/tan are only valid between +-2**16, Inf, NaN - if (fail) - { - if (ftz) - { - // retry per section 6.5.3.2 - if (IsDoubleResultSubnormal(correct, - f->double_ulps)) - { - fail = fail && (test != 0.0); - if (!fail) err = 0.0f; - } - } - } - if (fabsf(err) > maxError) - { - maxError = fabsf(err); - maxErrorVal = s[j]; - } - if (fail) - { - vlog_error("\n%s%sD: %f ulp error at 0x%16.16llx: " - "*%.13la vs. 
%.13la\n", - f->name, sizeNames[k], err, - ((uint64_t *)gIn)[j], - ((double *)gOut_Ref)[j], test); - error = -1; - goto exit; - } - } - } - } - - if (0 == (i & 0x0fffffff)) - { - if (gVerboseBruteForce) - { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, - bufferSize); - } - else - { - vlog("."); - } - fflush(stdout); - } - } - - if (!gSkipCorrectnessTesting) - { - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - if (gMeasureTimes) - { - // Init input array - double *p = (double *)gIn; - - for (j = 0; j < bufferSize / sizeof(double); j++) p[j] = random64(d); - if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - bufferSize, gIn, 0, NULL, NULL))) - { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); - return error; - } - - - // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - size_t vectorSize = sizeValues[j] * sizeof(cl_double); - size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), - &gOutBuffer[j]))) - { - LogBuildError(programs[j]); - goto exit; - } - if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), - &gInBuffer))) - { - LogBuildError(programs[j]); - goto exit; - } - - double sum = 0.0; - double bestTime = INFINITY; - for (k = 0; k < PERF_LOOP_COUNT; k++) - { - uint64_t startTime = GetTime(); - if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, - &localCount, NULL, 0, NULL, - NULL))) - { - vlog_error("FAILED -- could not execute kernel\n"); - goto exit; - } - - // Make sure OpenCL is done - if ((error = clFinish(gQueue))) - { - vlog_error("Error %d at clFinish\n", error); - goto exit; - } - - uint64_t endTime = GetTime(); - double time = SubtractTime(endTime, startTime); - sum += time; - if (time < bestTime) bestTime = time; - } - - if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT; - double clocksPerOp = bestTime * (double)gDeviceFrequency - * 
gComputeDevices * gSimdSize * 1e6 - / (bufferSize / sizeof(double)); - vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", - f->name, sizeNames[j]); - } - for (; j < gMaxVectorSizeIndex; j++) vlog("\t -- "); - } - - if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal); - vlog("\n"); - -exit: - // Release - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); - } - - return error; -}