Files
OpenCL-CTS/test_conformance/half/Test_vLoadHalf.c
Kevin Petit d8733efc0f Synchronise with Khronos-private Gitlab branch
The maintenance of the conformance tests is moving to Github.

This commit contains all the changes that have been done in
Gitlab since the first public release of the conformance tests.

Signed-off-by: Kevin Petit <kevin.petit@arm.com>
2019-03-05 16:23:49 +00:00

628 lines
26 KiB
C

//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include <string.h>
#include <math.h>
#include "cl_utils.h"
#include "tests.h"
extern const char *addressSpaceNames[];
static inline float half2float( cl_ushort us )
{
uint32_t u = us;
uint32_t sign = (u << 16) & 0x80000000;
int32_t exponent = (u & 0x7c00) >> 10;
uint32_t mantissa = (u & 0x03ff) << 13;
union{ unsigned int u; float f;}uu;
if( exponent == 0 )
{
if( mantissa == 0 )
return sign ? -0.0f : 0.0f;
int shift = __builtin_clz( mantissa ) - 8;
exponent -= shift-1;
mantissa <<= shift;
mantissa &= 0x007fffff;
}
else
if( exponent == 31)
{
uu.u = mantissa | sign;
if( mantissa )
uu.u |= 0x7fc00000;
else
uu.u |= 0x7f800000;
return uu.f;
}
exponent += 127 - 15;
exponent <<= 23;
exponent |= mantissa;
uu.u = exponent | sign;
return uu.f;
}
int Test_vLoadHalf_private( bool aligned );
int Test_vLoadHalf_private( bool aligned )
{
cl_int error;
int vectorSize;
cl_program programs[kVectorSizeCount+kStrangeVectorSizeCount][4] = {{0}};
cl_kernel kernels[kVectorSizeCount+kStrangeVectorSizeCount][4] = {{0}};
uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
size_t q;
memset( min_time, -1, sizeof( min_time ) );
vlog( "Testing vload%s_half\n", aligned ? "a" : "" );
fflush( stdout );
const char *vector_size_names[] = {"1", "2", "4", "8", "16", "3"};
int minVectorSize = kMinVectorSize;
// There is no aligned scalar vloada_half in CL 1.1
#if ! defined( CL_VERSION_1_1 ) && ! defined(__APPLE__)
vlog("Note: testing vloada_half.\n");
if (aligned && minVectorSize == 0)
minVectorSize = 1;
#endif
for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
{
int effectiveVectorSize = g_arrVecSizes[vectorSize];
if(effectiveVectorSize == 3 && aligned) {
effectiveVectorSize = 4;
}
const char *source[] = {
"__kernel void test( const __global half *p, __global float", vector_size_name_extensions[vectorSize], " *f )\n"
"{\n"
" size_t i = get_global_id(0);\n"
" f[i] = vload", aligned ? "a" : "", "_half",vector_size_name_extensions[vectorSize],"( i, p );\n"
"}\n"
};
const char *sourceV3[] = {
"__kernel void test( const __global half *p, __global float *f,\n"
" uint extra_last_thread)\n"
"{\n"
" size_t i = get_global_id(0);\n"
" size_t last_i = get_global_size(0)-1;\n"
" if(last_i == i && extra_last_thread != 0) {\n"
" if(extra_last_thread ==2) {\n"
" f[3*i+1] = vload_half(3*i+1, p);\n"
" }\n"
" f[3*i] = vload_half(3*i, p);\n"
" } else {\n"
" vstore3(vload_half3( i, p ),i,f);\n"
" }\n"
"}\n"
};
const char *sourceV3aligned[] = {
"__kernel void test( const __global half *p, __global float3 *f )\n"
"{\n"
" size_t i = get_global_id(0);\n"
" f[i] = vloada_half3( i, p );\n"
" ((__global float *)f)[4*i+3] = vloada_half(4*i+3,p);\n"
"}\n"
};
const char *source_private1[] = {
"__kernel void test( const __global half *p, __global float *f )\n"
"{\n"
" __private ushort data[1];\n"
" __private half* hdata_p = (__private half*) data;\n"
" size_t i = get_global_id(0);\n"
" data[0] = ((__global ushort*)p)[i];\n"
" f[i] = vload", (aligned ? "a" : ""), "_half( 0, hdata_p );\n"
"}\n"
};
const char *source_private2[] = {
"__kernel void test( const __global half *p, __global float", vector_size_name_extensions[vectorSize], " *f )\n"
"{\n"
" __private ", align_types[vectorSize], " data[", vector_size_names[vectorSize], "/", align_divisors[vectorSize], "];\n"
" __private half* hdata_p = (__private half*) data;\n"
" __global ", align_types[vectorSize], "* i_p = (__global ", align_types[vectorSize], "*)p;\n"
" size_t i = get_global_id(0);\n"
" int k;\n"
" for (k=0; k<",vector_size_names[vectorSize],"/",align_divisors[vectorSize],"; k++)\n"
" data[k] = i_p[i+k];\n"
" f[i] = vload", aligned ? "a" : "", "_half",vector_size_name_extensions[vectorSize],"( 0, hdata_p );\n"
"}\n"
};
const char *source_privateV3[] = {
"__kernel void test( const __global half *p, __global float *f,"
" uint extra_last_thread )\n"
"{\n"
" __private ushort data[3];\n"
" __private half* hdata_p = (__private half*) data;\n"
" __global ushort* i_p = (__global ushort*)p;\n"
" size_t i = get_global_id(0);\n"
" int k;\n"
// " data = vload3(i, i_p);\n"
" size_t last_i = get_global_size(0)-1;\n"
" if(last_i == i && extra_last_thread != 0) {\n"
" if(extra_last_thread ==2) {\n"
" f[3*i+1] = vload_half(3*i+1, p);\n"
" }\n"
" f[3*i] = vload_half(3*i, p);\n"
" } else {\n"
" for (k=0; k<3; k++)\n"
" data[k] = i_p[i*3+k];\n"
" vstore3(vload_half3( 0, hdata_p ), i, f);\n"
" }\n"
"}\n"
};
const char *source_privateV3aligned[] = {
"__kernel void test( const __global half *p, __global float3 *f )\n"
"{\n"
" ushort4 data[4];\n" // declare as vector for alignment. Make four to check to see vloada_half3 index is working.
" half* hdata_p = (half*) &data;\n"
" size_t i = get_global_id(0);\n"
" global ushort* i_p = (global ushort*)p + i * 4;\n"
" int offset = i & 3;\n"
" data[offset] = (ushort4)( i_p[0], i_p[1], i_p[2], USHRT_MAX ); \n"
" data[offset^1] = USHRT_MAX; \n"
" data[offset^2] = USHRT_MAX; \n"
" data[offset^3] = USHRT_MAX; \n"
// test vloada_half3
" f[i] = vloada_half3( offset, hdata_p );\n"
// Fill in the 4th value so we don't have to special case this code elsewhere in the test.
" mem_fence(CLK_GLOBAL_MEM_FENCE );\n"
" ((__global float *)f)[4*i+3] = vload_half(4*i+3, p);\n"
"}\n"
};
char local_buf_size[10];
sprintf(local_buf_size, "%lld", (uint64_t)((effectiveVectorSize))*gWorkGroupSize);
const char *source_local1[] = {
"__kernel void test( const __global half *p, __global float *f )\n"
"{\n"
" __local ushort data[",local_buf_size,"];\n"
" __local half* hdata_p = (__local half*) data;\n"
" size_t i = get_global_id(0);\n"
" size_t lid = get_local_id(0);\n"
" data[lid] = ((__global ushort*)p)[i];\n"
" f[i] = vload", aligned ? "a" : "", "_half( lid, hdata_p );\n"
"}\n"
};
const char *source_local2[] = {
"__kernel void test( const __global half *p, __global float", vector_size_name_extensions[vectorSize], " *f )\n"
"{\n"
" __local ", align_types[vectorSize], " data[", local_buf_size, "/", align_divisors[vectorSize], "];\n"
" __local half* hdata_p = (__local half*) data;\n"
" __global ", align_types[vectorSize], "* i_p = (__global ", align_types[vectorSize],"*)p;\n"
" size_t i = get_global_id(0);\n"
" size_t lid = get_local_id(0);\n"
" int k;\n"
" for (k=0; k<",vector_size_names[vectorSize],"/",align_divisors[vectorSize],"; k++)\n"
" data[lid*",vector_size_names[vectorSize],"/",align_divisors[vectorSize],"+k] = i_p[i*",vector_size_names[vectorSize],"/",align_divisors[vectorSize],"+k];\n"
" f[i] = vload", aligned ? "a" : "", "_half",vector_size_name_extensions[vectorSize],"( lid, hdata_p );\n"
"}\n"
};
const char *source_localV3[] = {
"__kernel void test( const __global half *p, __global float *f,\n"
" uint extra_last_thread)\n"
"{\n"
" __local ushort data[", local_buf_size,"];\n"
" __local half* hdata_p = (__local half*) data;\n"
" __global ushort* i_p = (__global ushort*)p;\n"
" size_t i = get_global_id(0);\n"
" size_t last_i = get_global_size(0)-1;\n"
" size_t lid = get_local_id(0);\n"
" int k;\n"
" if(last_i == i && extra_last_thread != 0) {\n"
" if(extra_last_thread ==2) {\n"
" f[3*i+1] = vload_half(3*i+1, p);\n"
" }\n"
" f[3*i] = vload_half(3*i, p);\n"
" } else {\n"
" for (k=0; k<3; k++)\n"
" data[lid*3+k] = i_p[i*3+k];\n"
" vstore3( vload_half3( lid, hdata_p ),i,f);\n"
" };\n"
"}\n"
};
const char *source_localV3aligned[] = {
"__kernel void test( const __global half *p, __global float3 *f )\n"
"{\n"
" __local ushort data[", local_buf_size,"];\n"
" __local half* hdata_p = (__local half*) data;\n"
" __global ushort* i_p = (__global ushort*)p;\n"
" size_t i = get_global_id(0);\n"
" size_t lid = get_local_id(0);\n"
" int k;\n"
" for (k=0; k<4; k++)\n"
" data[lid*4+k] = i_p[i*4+k];\n"
" f[i] = vloada_half3( lid, hdata_p );\n"
" ((__global float *)f)[4*i+3] = vload_half(lid*4+3, hdata_p);\n"
"}\n"
};
const char *source_constant[] = {
"__kernel void test( __constant half *p, __global float", vector_size_name_extensions[vectorSize], " *f )\n"
"{\n"
" size_t i = get_global_id(0);\n"
" f[i] = vload", aligned ? "a" : "", "_half",vector_size_name_extensions[vectorSize],"( i, p );\n"
"}\n"
};
const char *source_constantV3[] = {
"__kernel void test( __constant half *p, __global float *f,\n"
" uint extra_last_thread)\n"
"{\n"
" size_t i = get_global_id(0);\n"
" size_t last_i = get_global_size(0)-1;\n"
" if(last_i == i && extra_last_thread != 0) {\n"
" if(extra_last_thread ==2) {\n"
" f[3*i+1] = vload_half(3*i+1, p);\n"
" }\n"
" f[3*i] = vload_half(3*i, p);\n"
" } else {\n"
" vstore3(vload_half",vector_size_name_extensions[vectorSize],"( i, p ), i, f);\n"
" }\n"
"}\n"
};
const char *source_constantV3aligned[] = {
"__kernel void test( __constant half *p, __global float3 *f )\n"
"{\n"
" size_t i = get_global_id(0);\n"
" f[i] = vloada_half3( i, p );\n"
" ((__global float *)f)[4*i+3] = vload_half(4*i+3,p);\n"
"}\n"
};
if(g_arrVecSizes[vectorSize] != 3) {
programs[vectorSize][0] = MakeProgram( source, sizeof( source) / sizeof( source[0]) );
if( NULL == programs[ vectorSize ][0] ) {
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create program.\n" );
for ( q= 0; q < sizeof( source) / sizeof( source[0]); q++)
vlog_error("%s", source[q]);
return -1;
} else {
}
} else if(aligned) {
programs[vectorSize][0] = MakeProgram( sourceV3aligned, sizeof( sourceV3aligned) / sizeof( sourceV3aligned[0]) );
if( NULL == programs[ vectorSize ][0] ) {
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create program.\n" );
for ( q= 0; q < sizeof( sourceV3aligned) / sizeof( sourceV3aligned[0]); q++)
vlog_error("%s", sourceV3aligned[q]);
return -1;
} else {
}
} else {
programs[vectorSize][0] = MakeProgram( sourceV3, sizeof( sourceV3) / sizeof( sourceV3[0]) );
if( NULL == programs[ vectorSize ][0] ) {
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create program.\n" );
for ( q= 0; q < sizeof( sourceV3) / sizeof( sourceV3[0]); q++)
vlog_error("%s", sourceV3[q]);
return -1;
}
}
kernels[ vectorSize ][0] = clCreateKernel( programs[ vectorSize ][0], "test", &error );
if( NULL == kernels[vectorSize][0] )
{
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
return -2;
}
const char** source_ptr;
uint32_t source_size;
if (vectorSize == 0) {
source_ptr = source_private1;
source_size = sizeof( source_private1) / sizeof( source_private1[0]);
} else if(g_arrVecSizes[vectorSize] == 3) {
if(aligned) {
source_ptr = source_privateV3aligned;
source_size = sizeof( source_privateV3aligned) / sizeof( source_privateV3aligned[0]);
} else {
source_ptr = source_privateV3;
source_size = sizeof( source_privateV3) / sizeof( source_privateV3[0]);
}
} else {
source_ptr = source_private2;
source_size = sizeof( source_private2) / sizeof( source_private2[0]);
}
programs[vectorSize][1] = MakeProgram( source_ptr, source_size );
if( NULL == programs[ vectorSize ][1] )
{
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create private program.\n" );
for ( q= 0; q < source_size; q++)
vlog_error("%s", source_ptr[q]);
return -1;
}
kernels[ vectorSize ][1] = clCreateKernel( programs[ vectorSize ][1], "test", &error );
if( NULL == kernels[vectorSize][1] )
{
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create private kernel. (%d)\n", error );
return -2;
}
if (vectorSize == 0) {
source_ptr = source_local1;
source_size = sizeof( source_local1) / sizeof( source_local1[0]);
} else if(g_arrVecSizes[vectorSize] == 3) {
if(aligned) {
source_ptr = source_localV3aligned;
source_size = sizeof(source_localV3aligned)/sizeof(source_localV3aligned[0]);
} else {
source_ptr = source_localV3;
source_size = sizeof(source_localV3)/sizeof(source_localV3[0]);
}
} else {
source_ptr = source_local2;
source_size = sizeof( source_local2) / sizeof( source_local2[0]);
}
programs[vectorSize][2] = MakeProgram( source_ptr, source_size );
if( NULL == programs[ vectorSize ][2] )
{
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create local program.\n" );
for ( q= 0; q < source_size; q++)
vlog_error("%s", source_ptr[q]);
return -1;
}
kernels[ vectorSize ][2] = clCreateKernel( programs[ vectorSize ][2], "test", &error );
if( NULL == kernels[vectorSize][2] )
{
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create local kernel. (%d)\n", error );
return -2;
}
if(g_arrVecSizes[vectorSize] == 3) {
if(aligned) {
programs[vectorSize][3] = MakeProgram( source_constantV3aligned, sizeof(source_constantV3aligned) / sizeof( source_constantV3aligned[0]) );
if( NULL == programs[ vectorSize ][3] )
{
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create constant program.\n" );
for ( q= 0; q < sizeof( source_constantV3aligned) / sizeof( source_constantV3aligned[0]); q++)
vlog_error("%s", source_constantV3aligned[q]);
return -1;
}
} else {
programs[vectorSize][3] = MakeProgram( source_constantV3, sizeof(source_constantV3) / sizeof( source_constantV3[0]) );
if( NULL == programs[ vectorSize ][3] )
{
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create constant program.\n" );
for ( q= 0; q < sizeof( source_constantV3) / sizeof( source_constantV3[0]); q++)
vlog_error("%s", source_constantV3[q]);
return -1;
}
}
} else {
programs[vectorSize][3] = MakeProgram( source_constant, sizeof(source_constant) / sizeof( source_constant[0]) );
if( NULL == programs[ vectorSize ][3] )
{
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create constant program.\n" );
for ( q= 0; q < sizeof( source_constant) / sizeof( source_constant[0]); q++)
vlog_error("%s", source_constant[q]);
return -1;
}
}
kernels[ vectorSize ][3] = clCreateKernel( programs[ vectorSize ][3], "test", &error );
if( NULL == kernels[vectorSize][3] )
{
gFailCount++;
vlog_error( "\t\tFAILED -- Failed to create constant kernel. (%d)\n", error );
return -2;
}
}
// Figure out how many elements are in a work block
size_t elementSize = MAX( sizeof(cl_half), sizeof(cl_float));
size_t blockCount = getBufferSize(gDevice) / elementSize; // elementSize is power of 2
uint64_t lastCase = 1ULL << (8*sizeof(cl_half)); // number of things of size cl_half
// we handle 64-bit types a bit differently.
if( lastCase == 0 )
lastCase = 0x100000000ULL;
uint64_t i, j;
uint64_t printMask = (lastCase >> 4) - 1;
uint32_t count = 0;
error = 0;
int addressSpace;
// int reported_vector_skip = 0;
for( i = 0; i < (uint64_t)lastCase; i += blockCount )
{
count = (uint32_t) MIN( blockCount, lastCase - i );
//Init the input stream
uint16_t *p = (uint16_t *)gIn_half;
for( j = 0; j < count; j++ )
p[j] = j + i;
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_half, CL_TRUE, 0, count * sizeof( cl_half ), gIn_half, 0, NULL, NULL)))
{
vlog_error( "Failure in clWriteArray\n" );
gFailCount++;
goto exit;
}
//create the reference result
const unsigned short *s = (const unsigned short *)gIn_half;
float *d = (float *)gOut_single_reference;
for( j = 0; j < count; j++ )
d[j] = half2float( s[j] );
//Check the vector lengths
for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
{ // here we loop through vector sizes, 3 is last
for ( addressSpace = 0; addressSpace < 4; addressSpace++) {
uint32_t pattern = 0x7fffdead;
/*
if (addressSpace == 3) {
vlog("Note: skipping address space %s due to small buffer size.\n", addressSpaceNames[addressSpace]);
continue;
}
*/
memset_pattern4( gOut_single, &pattern, getBufferSize(gDevice));
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer_single, CL_TRUE, 0, count * sizeof( float ), gOut_single, 0, NULL, NULL)) )
{
vlog_error( "Failure in clWriteArray\n" );
gFailCount++;
goto exit;
}
if(g_arrVecSizes[vectorSize] == 3 && !aligned) {
// now we need to add the extra const argument for how
// many elements the last thread should take care of.
}
// okay, here is where we have to be careful
if( (error = RunKernel( kernels[vectorSize][addressSpace], gInBuffer_half, gOutBuffer_single, numVecs(count, vectorSize, aligned) ,
runsOverBy(count, vectorSize, aligned) ) ) )
{
gFailCount++;
goto exit;
}
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer_single, CL_TRUE, 0, count * sizeof( float ), gOut_single, 0, NULL, NULL)) )
{
vlog_error( "Failure in clReadArray\n" );
gFailCount++;
goto exit;
}
if( memcmp( gOut_single, gOut_single_reference, count * sizeof( float )) )
{
uint32_t *u1 = (uint32_t *)gOut_single;
uint32_t *u2 = (uint32_t *)gOut_single_reference;
float *f1 = (float *)gOut_single;
float *f2 = (float *)gOut_single_reference;
for( j = 0; j < count; j++ )
{
if(isnan(f1[j]) && isnan(f2[j])) // both are nan dont compare them
continue;
if( u1[j] != u2[j])
{
vlog_error( " %lld) (of %lld) Failure at 0x%4.4x: %a vs *%a (0x%8.8x vs *0x%8.8x) vector_size = %d (%s) address space = %s, load is %s\n",
j, (uint64_t)count, ((unsigned short*)gIn_half)[j], f1[j], f2[j], u1[j], u2[j], (g_arrVecSizes[vectorSize]),
vector_size_names[vectorSize], addressSpaceNames[addressSpace],
(aligned?"aligned":"unaligned"));
gFailCount++;
break; // goto exit;
}
}
}
if( gReportTimes && addressSpace == 0)
{
//Run again for timing
for( j = 0; j < 100; j++ )
{
uint64_t startTime = ReadTime();
error =
RunKernel( kernels[vectorSize][addressSpace], gInBuffer_half, gOutBuffer_single, numVecs(count, vectorSize, aligned) ,
runsOverBy(count, vectorSize, aligned));
if(error)
{
gFailCount++;
goto exit;
}
if( (error = clFinish(gQueue)) )
{
vlog_error( "Failure in clFinish\n" );
gFailCount++;
goto exit;
}
uint64_t currentTime = ReadTime() - startTime;
time[ vectorSize ] += currentTime;
if( currentTime < min_time[ vectorSize ] )
min_time[ vectorSize ] = currentTime ;
}
}
}
}
if( ((i+blockCount) & ~printMask) == (i+blockCount) )
{
vlog( "." );
fflush( stdout );
}
}
vlog( "\tPassed\n" );
if( gReportTimes )
{
for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * 100), 0,
"average us/elem", "vLoad%sHalf avg. (%s, vector size: %d)", ( (aligned) ? "a" : ""), addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
"best us/elem", "vLoad%sHalf best (%s vector size: %d)", ( (aligned) ? "a" : ""), addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
}
exit:
//clean up
for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
{
for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
clReleaseKernel( kernels[ vectorSize ][addressSpace] );
clReleaseProgram( programs[ vectorSize ][addressSpace] );
}
}
gTestCount++;
return error;
}
int Test_vload_half( void )
{
return Test_vLoadHalf_private( false );
}
int Test_vloada_half( void )
{
return Test_vLoadHalf_private( true );
}