diff --git a/test_conformance/half/Test_vStoreHalf.cpp b/test_conformance/half/Test_vStoreHalf.cpp index 591470f0..efaceaf7 100644 --- a/test_conformance/half/Test_vStoreHalf.cpp +++ b/test_conformance/half/Test_vStoreHalf.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -70,8 +70,7 @@ typedef struct CheckResultInfoD_ int vsz; } CheckResultInfoD; -static cl_int -ReferenceF(cl_uint jid, cl_uint tid, void *userInfo) +static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo) { ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo; cl_uint lim = cri->lim; @@ -83,10 +82,10 @@ ReferenceF(cl_uint jid, cl_uint tid, void *userInfo) cl_ulong i = cri->i + off; cl_uint j; - if (off + count > lim) - count = lim - off; + if (off + count > lim) count = lim - off; - for (j = 0; j < count; ++j) { + for (j = 0; j < count; ++j) + { x[j] = as_float((cl_uint)(i + j)); r[j] = f(x[j]); } @@ -94,8 +93,7 @@ ReferenceF(cl_uint jid, cl_uint tid, void *userInfo) return 0; } -static cl_int -CheckF(cl_uint jid, cl_uint tid, void *userInfo) +static cl_int CheckF(cl_uint jid, cl_uint tid, void *userInfo) { CheckResultInfoF *cri = (CheckResultInfoF *)userInfo; cl_uint lim = cri->lim; @@ -106,33 +104,33 @@ CheckF(cl_uint jid, cl_uint tid, void *userInfo) const cl_ushort *s = cri->s + off; f2h f = cri->f; cl_uint j; - cl_ushort correct2 = f( 0.0f); + cl_ushort correct2 = f(0.0f); cl_ushort correct3 = f(-0.0f); cl_int ret = 0; - if (off + count > lim) - count = lim - off; + if (off + count > lim) count = lim - off; - if (!memcmp(r, s, count*sizeof(cl_ushort))) - return 0; + if (!memcmp(r, s, count * sizeof(cl_ushort))) return 0; - for (j = 0; j < count; j++) { + for (j = 0; j < count; j++) + { if (s[j] == r[j]) continue; // Pass any NaNs - if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00 ) - continue; + if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00) continue; // retry per section 6.5.3.3 if (IsFloatSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3)) continue; // if reference result is subnormal, pass any zero - if (gIsEmbedded && IsHalfSubnormal(r[j]) && (s[j] == 0x0000 || s[j] == 0x8000)) + if (gIsEmbedded && IsHalfSubnormal(r[j]) + && (s[j] == 0x0000 || s[j] == 0x8000)) continue; - vlog_error("\nFailure at [%u] with %.6a: *0x%04x vs 0x%04x, vector_size = %d, address_space = %s\n", - j+off, x[j], r[j], s[j], cri->vsz, cri->aspace); + vlog_error("\nFailure at [%u] with %.6a: *0x%04x vs 0x%04x, " + "vector_size = %d, address_space = %s\n", + j + off, x[j], r[j], s[j], cri->vsz, cri->aspace); ret = 1; break; @@ -141,8 +139,7 @@ CheckF(cl_uint jid, cl_uint tid, void *userInfo) return ret; } -static cl_int -ReferenceD(cl_uint jid, cl_uint tid, void *userInfo) +static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo) { ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo; cl_uint lim = cri->lim; @@ -154,10 +151,10 @@ ReferenceD(cl_uint jid, cl_uint tid, void *userInfo) cl_uint j; cl_ulong i = cri->i + off; - if (off + count > lim) - count = lim - off; + if (off + count > lim) count = lim - off; - for (j = 0; j < count; ++j) { + for (j = 0; j < count; ++j) + { x[j] = as_double(DoubleFromUInt((cl_uint)(i + j))); r[j] = f(x[j]); } @@ -165,8 +162,7 @@ ReferenceD(cl_uint jid, cl_uint tid, void *userInfo) return 0; } -static cl_int -CheckD(cl_uint jid, cl_uint tid, void *userInfo) +static cl_int CheckD(cl_uint jid, cl_uint tid, void *userInfo) { CheckResultInfoD *cri = (CheckResultInfoD *)userInfo; cl_uint lim = cri->lim; @@ -177,35 +173,35 @@ CheckD(cl_uint jid, cl_uint tid, void *userInfo) const cl_ushort *s = cri->s + off; d2h f = cri->f; cl_uint j; - cl_ushort correct2 = f( 0.0); + cl_ushort correct2 = f(0.0); cl_ushort correct3 = f(-0.0); cl_int ret = 0; - if (off + count > lim) - count = lim - off; + if (off + count > lim) count = lim - off; - if (!memcmp(r, s, count*sizeof(cl_ushort))) - return 0; + if (!memcmp(r, s, count * sizeof(cl_ushort))) return 0; - for (j = 0; j < count; j++) { + for (j = 0; j < count; j++) + { if (s[j] == r[j]) continue; // Pass any NaNs - if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00) - continue; + if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00) continue; if (IsDoubleSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3)) continue; // if reference result is subnormal, pass any zero result - if (gIsEmbedded && IsHalfSubnormal(r[j]) && (s[j] == 0x0000 || s[j] == 0x8000)) + if (gIsEmbedded && IsHalfSubnormal(r[j]) + && (s[j] == 0x0000 || s[j] == 0x8000)) continue; - vlog_error("\nFailure at [%u] with %.13la: *0x%04x vs 0x%04x, vector_size = %d, address space = %s (double precision)\n", - j+off, x[j], r[j], s[j], cri->vsz, cri->aspace); + vlog_error("\nFailure at [%u] with %.13la: *0x%04x vs 0x%04x, " + "vector_size = %d, address space = %s (double precision)\n", + j + off, x[j], r[j], s[j], cri->vsz, cri->aspace); ret = 1; - break; + break; } return ret; @@ -251,100 +247,129 @@ static cl_half double2half_rtn(double f) return cl_half_from_double(f, CL_HALF_RTN); } -int test_vstore_half( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_vstore_half(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { switch (get_default_rounding_mode(deviceID)) { case CL_FP_ROUND_TO_ZERO: - return Test_vStoreHalf_private(deviceID, float2half_rtz, double2half_rte, ""); - case 0: - return -1; + return Test_vStoreHalf_private(deviceID, float2half_rtz, + double2half_rte, ""); + case 0: return -1; default: - return Test_vStoreHalf_private(deviceID, float2half_rte, double2half_rte, ""); + return Test_vStoreHalf_private(deviceID, float2half_rte, + double2half_rte, ""); } } -int test_vstore_half_rte( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_vstore_half_rte(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return Test_vStoreHalf_private(deviceID, float2half_rte, double2half_rte, "_rte"); + return Test_vStoreHalf_private(deviceID, float2half_rte, double2half_rte, + "_rte"); } -int test_vstore_half_rtz( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_vstore_half_rtz(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return Test_vStoreHalf_private(deviceID, float2half_rtz, double2half_rtz, "_rtz"); + return Test_vStoreHalf_private(deviceID, float2half_rtz, double2half_rtz, + "_rtz"); } -int test_vstore_half_rtp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_vstore_half_rtp(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return Test_vStoreHalf_private(deviceID, float2half_rtp, double2half_rtp, "_rtp"); + return Test_vStoreHalf_private(deviceID, float2half_rtp, double2half_rtp, + "_rtp"); } -int test_vstore_half_rtn( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_vstore_half_rtn(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return Test_vStoreHalf_private(deviceID, float2half_rtn, double2half_rtn, "_rtn"); + return Test_vStoreHalf_private(deviceID, float2half_rtn, double2half_rtn, + "_rtn"); } -int test_vstorea_half( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_vstorea_half(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { switch (get_default_rounding_mode(deviceID)) { case CL_FP_ROUND_TO_ZERO: - return Test_vStoreaHalf_private(deviceID,float2half_rtz, double2half_rte, ""); - case 0: - return -1; + return Test_vStoreaHalf_private(deviceID, float2half_rtz, + double2half_rte, ""); + case 0: return -1; default: - return Test_vStoreaHalf_private(deviceID, float2half_rte, double2half_rte, ""); + return Test_vStoreaHalf_private(deviceID, float2half_rte, + double2half_rte, ""); } } -int test_vstorea_half_rte( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_vstorea_half_rte(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return Test_vStoreaHalf_private(deviceID, float2half_rte, double2half_rte, "_rte"); + return Test_vStoreaHalf_private(deviceID, float2half_rte, double2half_rte, + "_rte"); } -int test_vstorea_half_rtz( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_vstorea_half_rtz(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return Test_vStoreaHalf_private(deviceID, float2half_rtz, double2half_rtz, "_rtz"); + return Test_vStoreaHalf_private(deviceID, float2half_rtz, double2half_rtz, + "_rtz"); } -int test_vstorea_half_rtp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_vstorea_half_rtp(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return Test_vStoreaHalf_private(deviceID, float2half_rtp, double2half_rtp, "_rtp"); + return Test_vStoreaHalf_private(deviceID, float2half_rtp, double2half_rtp, + "_rtp"); } -int test_vstorea_half_rtn( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_vstorea_half_rtn(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return Test_vStoreaHalf_private(deviceID, float2half_rtn, double2half_rtn, "_rtn"); + return Test_vStoreaHalf_private(deviceID, float2half_rtn, double2half_rtn, + "_rtn"); } #pragma mark - -int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleReferenceFunc, const char *roundName ) +int Test_vStoreHalf_private(cl_device_id device, f2h referenceFunc, + d2h doubleReferenceFunc, const char *roundName) { int vectorSize, error; - cl_program programs[kVectorSizeCount+kStrangeVectorSizeCount][3]; - cl_kernel kernels[kVectorSizeCount+kStrangeVectorSizeCount][3]; + cl_program programs[kVectorSizeCount + kStrangeVectorSizeCount][3]; + cl_kernel kernels[kVectorSizeCount + kStrangeVectorSizeCount][3]; + cl_program resetProgram; + cl_kernel resetKernel; - uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0}; - uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0}; - memset( min_time, -1, sizeof( min_time ) ); - cl_program doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount][3]; - cl_kernel doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount][3]; - uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0}; - uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0}; - memset( min_double_time, -1, sizeof( min_double_time ) ); + uint64_t time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 }; + uint64_t min_time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 }; + memset(min_time, -1, sizeof(min_time)); + cl_program doublePrograms[kVectorSizeCount + kStrangeVectorSizeCount][3]; + cl_kernel doubleKernels[kVectorSizeCount + kStrangeVectorSizeCount][3]; + uint64_t doubleTime[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 }; + uint64_t min_double_time[kVectorSizeCount + kStrangeVectorSizeCount] = { + 0 + }; + memset(min_double_time, -1, sizeof(min_double_time)); - bool aligned= false; + bool aligned = false; - for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) + for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; + vectorSize++) { - const char *source[] = { - "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n" - "}\n" - }; + const char *source[] = { "__kernel void test( __global float", + vector_size_name_extensions[vectorSize], + " *p, __global half *f )\n" + "{\n" + " size_t i = get_global_id(0);\n" + " vstore_half", + vector_size_name_extensions[vectorSize], + roundName, + "( p[i], i, f );\n" + "}\n" }; const char *source_v3[] = { "__kernel void test( __global float *p, __global half *f,\n" @@ -356,21 +381,29 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR " if(last_i == i && extra_last_thread != 0) {\n" " adjust = 3-extra_last_thread;\n" " } " - " vstore_half3",roundName,"( vload3(i, p-adjust), i, f-adjust );\n" + " vstore_half3", + roundName, + "( vload3(i, p-adjust), i, f-adjust );\n" "}\n" }; const char *source_private_store[] = { - "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n" + "__kernel void test( __global float", + vector_size_name_extensions[vectorSize], + " *p, __global half *f )\n" "{\n" " __private ushort data[16];\n" " size_t i = get_global_id(0);\n" " size_t offset = 0;\n" " size_t vecsize = vec_step(p[i]);\n" - " vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], 0, (__private half *)(&data[0]) );\n" + " vstore_half", + vector_size_name_extensions[vectorSize], + roundName, + "( p[i], 0, (__private half *)(&data[0]) );\n" " for(offset = 0; offset < vecsize; offset++)\n" " {\n" - " vstore_half(vload_half(offset, (__private half *)data), 0, &f[vecsize*i+offset]);\n" + " vstore_half(vload_half(offset, (__private half *)data), 0, " + "&f[vecsize*i+offset]);\n" " }\n" "}\n" }; @@ -388,10 +421,13 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR " if(last_i == i && extra_last_thread != 0) {\n" " adjust = 3-extra_last_thread;\n" " } " - " vstore_half3",roundName,"( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n" + " vstore_half3", + roundName, + "( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n" " for(offset = 0; offset < 3; offset++)\n" " {\n" - " vstore_half(vload_half(offset, (__private half *) data), 0, &f[3*i+offset-adjust]);\n" + " vstore_half(vload_half(offset, (__private half *) data), " + "0, &f[3*i+offset-adjust]);\n" " }\n" "}\n" }; @@ -401,17 +437,26 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR const char *source_local_store[] = { - "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n" + "__kernel void test( __global float", + vector_size_name_extensions[vectorSize], + " *p, __global half *f )\n" "{\n" - " __local ushort data[16*", local_buf_size, "];\n" + " __local ushort data[16*", + local_buf_size, + "];\n" " size_t i = get_global_id(0);\n" " size_t lid = get_local_id(0);\n" " size_t lsize = get_local_size(0);\n" " size_t vecsize = vec_step(p[0]);\n" " event_t async_event;\n" - " vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], lid, (__local half *)(&data[0]) );\n" + " vstore_half", + vector_size_name_extensions[vectorSize], + roundName, + "( p[i], lid, (__local half *)(&data[0]) );\n" " barrier( CLK_LOCAL_MEM_FENCE ); \n" - " async_event = async_work_group_copy((__global ushort *)f+vecsize*(i-lid), (__local ushort *)(&data[0]), vecsize*lsize, 0);\n" // investigate later + " async_event = async_work_group_copy((__global ushort " + "*)f+vecsize*(i-lid), (__local ushort *)(&data[0]), vecsize*lsize, " + "0);\n" // investigate later " wait_group_events(1, &async_event);\n" "}\n" }; @@ -450,25 +495,36 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR const char *double_source[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" - "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n" + "__kernel void test( __global double", + vector_size_name_extensions[vectorSize], + " *p, __global half *f )\n" "{\n" " size_t i = get_global_id(0);\n" - " vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n" + " vstore_half", + vector_size_name_extensions[vectorSize], + roundName, + "( p[i], i, f );\n" "}\n" }; const char *double_source_private_store[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" - "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n" + "__kernel void test( __global double", + vector_size_name_extensions[vectorSize], + " *p, __global half *f )\n" "{\n" " __private ushort data[16];\n" " size_t i = get_global_id(0);\n" " size_t offset = 0;\n" " size_t vecsize = vec_step(p[i]);\n" - " vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], 0, (__private half *)(&data[0]) );\n" + " vstore_half", + vector_size_name_extensions[vectorSize], + roundName, + "( p[i], 0, (__private half *)(&data[0]) );\n" " for(offset = 0; offset < vecsize; offset++)\n" " {\n" - " vstore_half(vload_half(offset, (__private half *)data), 0, &f[vecsize*i+offset]);\n" + " vstore_half(vload_half(offset, (__private half *)data), 0, " + "&f[vecsize*i+offset]);\n" " }\n" "}\n" }; @@ -476,17 +532,26 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR const char *double_source_local_store[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" - "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n" + "__kernel void test( __global double", + vector_size_name_extensions[vectorSize], + " *p, __global half *f )\n" "{\n" - " __local ushort data[16*", local_buf_size, "];\n" + " __local ushort data[16*", + local_buf_size, + "];\n" " size_t i = get_global_id(0);\n" " size_t lid = get_local_id(0);\n" " size_t vecsize = vec_step(p[0]);\n" " size_t lsize = get_local_size(0);\n" " event_t async_event;\n" - " vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], lid, (__local half *)(&data[0]) );\n" + " vstore_half", + vector_size_name_extensions[vectorSize], + roundName, + "( p[i], lid, (__local half *)(&data[0]) );\n" " barrier( CLK_LOCAL_MEM_FENCE ); \n" - " async_event = async_work_group_copy((__global ushort *)(f+vecsize*(i-lid)), (__local ushort *)(&data[0]), vecsize*lsize, 0);\n" // investigate later + " async_event = async_work_group_copy((__global ushort " + "*)(f+vecsize*(i-lid)), (__local ushort *)(&data[0]), " + "vecsize*lsize, 0);\n" // investigate later " wait_group_events(1, &async_event);\n" "}\n" }; @@ -503,7 +568,9 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR " if(last_i == i && extra_last_thread != 0) {\n" " adjust = 3-extra_last_thread;\n" " } " - " vstore_half3",roundName,"( vload3(i,p-adjust), i, f -adjust);\n" + " vstore_half3", + roundName, + "( vload3(i,p-adjust), i, f -adjust);\n" "}\n" }; @@ -520,10 +587,13 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR " if(last_i == i && extra_last_thread != 0) {\n" " adjust = 3-extra_last_thread;\n" " } " - " vstore_half3",roundName,"( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n" + " vstore_half3", + roundName, + "( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n" " for(offset = 0; offset < 3; offset++)\n" " {\n" - " vstore_half(vload_half(offset, (__private half *)data), 0, &f[3*i+offset-adjust]);\n" + " vstore_half(vload_half(offset, (__private half *)data), 0, " + "&f[3*i+offset-adjust]);\n" " }\n" "}\n" }; @@ -562,151 +632,235 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR }; - if(g_arrVecSizes[vectorSize] == 3) { - programs[vectorSize][0] = MakeProgram( device, source_v3, sizeof(source_v3) / sizeof( source_v3[0]) ); - } else { - programs[vectorSize][0] = MakeProgram( device, source, sizeof(source) / sizeof( source[0]) ); + if (g_arrVecSizes[vectorSize] == 3) + { + programs[vectorSize][0] = MakeProgram( + device, source_v3, sizeof(source_v3) / sizeof(source_v3[0])); } - if( NULL == programs[ vectorSize ][0] ) + else + { + programs[vectorSize][0] = + MakeProgram(device, source, sizeof(source) / sizeof(source[0])); + } + if (NULL == programs[vectorSize][0]) { gFailCount++; return -1; } - kernels[ vectorSize ][0] = clCreateKernel( programs[ vectorSize ][0], "test", &error ); - if( NULL == kernels[vectorSize][0] ) + kernels[vectorSize][0] = + clCreateKernel(programs[vectorSize][0], "test", &error); + if (NULL == kernels[vectorSize][0]) { gFailCount++; - vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error ); + vlog_error("\t\tFAILED -- Failed to create kernel. (%d)\n", error); return error; } - if(g_arrVecSizes[vectorSize] == 3) { - programs[vectorSize][1] = MakeProgram( device, source_private_store_v3, sizeof(source_private_store_v3) / sizeof( source_private_store_v3[0]) ); - } else { - programs[vectorSize][1] = MakeProgram( device, source_private_store, sizeof(source_private_store) / sizeof( source_private_store[0]) ); + if (g_arrVecSizes[vectorSize] == 3) + { + programs[vectorSize][1] = + MakeProgram(device, source_private_store_v3, + sizeof(source_private_store_v3) + / sizeof(source_private_store_v3[0])); } - if( NULL == programs[ vectorSize ][1] ) + else + { + programs[vectorSize][1] = MakeProgram( + device, source_private_store, + sizeof(source_private_store) / sizeof(source_private_store[0])); + } + if (NULL == programs[vectorSize][1]) { gFailCount++; return -1; } - kernels[ vectorSize ][1] = clCreateKernel( programs[ vectorSize ][1], "test", &error ); - if( NULL == kernels[vectorSize][1] ) + kernels[vectorSize][1] = + clCreateKernel(programs[vectorSize][1], "test", &error); + if (NULL == kernels[vectorSize][1]) { gFailCount++; - vlog_error( "\t\tFAILED -- Failed to create private kernel. (%d)\n", error ); + vlog_error("\t\tFAILED -- Failed to create private kernel. (%d)\n", + error); return error; } - if(g_arrVecSizes[vectorSize] == 3) { - programs[vectorSize][2] = MakeProgram( device, source_local_store_v3, sizeof(source_local_store_v3) / sizeof( source_local_store_v3[0]) ); - if( NULL == programs[ vectorSize ][2] ) + if (g_arrVecSizes[vectorSize] == 3) + { + programs[vectorSize][2] = + MakeProgram(device, source_local_store_v3, + sizeof(source_local_store_v3) + / sizeof(source_local_store_v3[0])); + if (NULL == programs[vectorSize][2]) { unsigned q; - for ( q= 0; q < sizeof( source_local_store_v3) / sizeof( source_local_store_v3[0]); q++) + for (q = 0; q < sizeof(source_local_store_v3) + / sizeof(source_local_store_v3[0]); + q++) vlog_error("%s", source_local_store_v3[q]); gFailCount++; return -1; - } - } else { - programs[vectorSize][2] = MakeProgram( device, source_local_store, sizeof(source_local_store) / sizeof( source_local_store[0]) ); - if( NULL == programs[ vectorSize ][2] ) + } + else + { + programs[vectorSize][2] = MakeProgram( + device, source_local_store, + sizeof(source_local_store) / sizeof(source_local_store[0])); + if (NULL == programs[vectorSize][2]) { unsigned q; - for ( q= 0; q < sizeof( source_local_store) / sizeof( source_local_store[0]); q++) + for (q = 0; q < sizeof(source_local_store) + / sizeof(source_local_store[0]); + q++) vlog_error("%s", source_local_store[q]); gFailCount++; return -1; - } } - kernels[ vectorSize ][2] = clCreateKernel( programs[ vectorSize ][2], "test", &error ); - if( NULL == kernels[vectorSize][2] ) + kernels[vectorSize][2] = + clCreateKernel(programs[vectorSize][2], "test", &error); + if (NULL == kernels[vectorSize][2]) { gFailCount++; - vlog_error( "\t\tFAILED -- Failed to create local kernel. (%d)\n", error ); + vlog_error("\t\tFAILED -- Failed to create local kernel. (%d)\n", + error); return error; } - if( gTestDouble ) + if (gTestDouble) { - if(g_arrVecSizes[vectorSize] == 3) { - doublePrograms[vectorSize][0] = MakeProgram( device, double_source_v3, sizeof(double_source_v3) / sizeof( double_source_v3[0]) ); - } else { - doublePrograms[vectorSize][0] = MakeProgram( device, double_source, sizeof(double_source) / sizeof( double_source[0]) ); - } - if( NULL == doublePrograms[ vectorSize ][0] ) + if (g_arrVecSizes[vectorSize] == 3) { - gFailCount++; - return -1; + doublePrograms[vectorSize][0] = MakeProgram( + device, double_source_v3, + sizeof(double_source_v3) / sizeof(double_source_v3[0])); } - - doubleKernels[ vectorSize ][0] = clCreateKernel( doublePrograms[ vectorSize ][0], "test", &error ); - if( NULL == kernels[vectorSize][0] ) - { - gFailCount++; - vlog_error( "\t\tFAILED -- Failed to create double kernel. (%d)\n", error ); - return error; - } - - if(g_arrVecSizes[vectorSize] == 3) - doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private_store_v3, sizeof(double_source_private_store_v3) / sizeof( double_source_private_store_v3[0]) ); else - doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private_store, sizeof(double_source_private_store) / sizeof( double_source_private_store[0]) ); - - if( NULL == doublePrograms[ vectorSize ][1] ) + { + doublePrograms[vectorSize][0] = MakeProgram( + device, double_source, + sizeof(double_source) / sizeof(double_source[0])); + } + if (NULL == doublePrograms[vectorSize][0]) { gFailCount++; return -1; } - doubleKernels[ vectorSize ][1] = clCreateKernel( doublePrograms[ vectorSize ][1], "test", &error ); - if( NULL == kernels[vectorSize][1] ) + doubleKernels[vectorSize][0] = + clCreateKernel(doublePrograms[vectorSize][0], "test", &error); + if (NULL == kernels[vectorSize][0]) { gFailCount++; - vlog_error( "\t\tFAILED -- Failed to create double private kernel. (%d)\n", error ); + vlog_error( + "\t\tFAILED -- Failed to create double kernel. (%d)\n", + error); return error; } - if(g_arrVecSizes[vectorSize] == 3) { - doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local_store_v3, sizeof(double_source_local_store_v3) / sizeof( double_source_local_store_v3[0]) ); - } else { - doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local_store, sizeof(double_source_local_store) / sizeof( double_source_local_store[0]) ); - } - if( NULL == doublePrograms[ vectorSize ][2] ) + if (g_arrVecSizes[vectorSize] == 3) + doublePrograms[vectorSize][1] = MakeProgram( + device, double_source_private_store_v3, + sizeof(double_source_private_store_v3) + / sizeof(double_source_private_store_v3[0])); + else + doublePrograms[vectorSize][1] = + MakeProgram(device, double_source_private_store, + sizeof(double_source_private_store) + / sizeof(double_source_private_store[0])); + + if (NULL == doublePrograms[vectorSize][1]) { gFailCount++; return -1; } - doubleKernels[ vectorSize ][2] = clCreateKernel( doublePrograms[ vectorSize ][2], "test", &error ); - if( NULL == kernels[vectorSize][2] ) + doubleKernels[vectorSize][1] = + clCreateKernel(doublePrograms[vectorSize][1], "test", &error); + if (NULL == kernels[vectorSize][1]) { gFailCount++; - vlog_error( "\t\tFAILED -- Failed to create double local kernel. (%d)\n", error ); + vlog_error("\t\tFAILED -- Failed to create double private " + "kernel. (%d)\n", + error); + return error; + } + + if (g_arrVecSizes[vectorSize] == 3) + { + doublePrograms[vectorSize][2] = + MakeProgram(device, double_source_local_store_v3, + sizeof(double_source_local_store_v3) + / sizeof(double_source_local_store_v3[0])); + } + else + { + doublePrograms[vectorSize][2] = + MakeProgram(device, double_source_local_store, + sizeof(double_source_local_store) + / sizeof(double_source_local_store[0])); + } + if (NULL == doublePrograms[vectorSize][2]) + { + gFailCount++; + return -1; + } + + doubleKernels[vectorSize][2] = + clCreateKernel(doublePrograms[vectorSize][2], "test", &error); + if (NULL == kernels[vectorSize][2]) + { + gFailCount++; + vlog_error("\t\tFAILED -- Failed to create double local " + "kernel. (%d)\n", + error); return error; } } } // end for vector size + const char *reset[] = { + "__kernel void reset( __global float *p, __global ushort *f,\n" + " uint extra_last_thread)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " *(f + i) = 0xdead;" + "}\n" + }; + + if (!gHostReset) + { + resetProgram = + MakeProgram(device, reset, sizeof(reset) / sizeof(reset[0])); + if (NULL == resetProgram) + { + gFailCount++; + return -1; + } + resetKernel = clCreateKernel(resetProgram, "reset", &error); + if (NULL == resetKernel) + { + gFailCount++; + return -1; + } + } + // Figure out how many elements are in a work block size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float)); size_t blockCount = BUFFER_SIZE / elementSize; // elementSize is power of 2 - uint64_t lastCase = 1ULL << (8*sizeof(float)); // number of floats. + uint64_t lastCase = 1ULL << (8 * sizeof(float)); // number of floats. size_t stride = blockCount; if (gWimpyMode) stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor; // we handle 64-bit types a bit differently. - if( lastCase == 0 ) - lastCase = 0x100000000ULL; + if (lastCase == 0) lastCase = 0x100000000ULL; uint64_t i, j; error = 0; @@ -746,7 +900,7 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR dchk.lim = blockCount; dchk.count = (blockCount + threadCount - 1) / threadCount; - for( i = 0; i < lastCase; i += stride ) + for (i = 0; i < lastCase; i += stride) { count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i); fref.i = i; @@ -755,50 +909,71 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR // Compute the input and reference ThreadPool_Do(ReferenceF, threadCount, &fref); - error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0, count * sizeof(float ), gIn_single, 0, NULL, NULL); - if (error) { - vlog_error( "Failure in clWriteBuffer\n" ); + error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0, + count * sizeof(float), gIn_single, 0, NULL, + NULL); + if (error) + { + vlog_error("Failure in clWriteBuffer\n"); gFailCount++; goto exit; } - if (gTestDouble) { + if (gTestDouble) + { ThreadPool_Do(ReferenceD, threadCount, &dref); - error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0, count * sizeof(double ), gIn_double, 0, NULL, NULL); - if (error) { - vlog_error( "Failure in clWriteBuffer\n" ); + error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0, + count * sizeof(double), gIn_double, 0, + NULL, NULL); + if (error) + { + vlog_error("Failure in clWriteBuffer\n"); gFailCount++; goto exit; } } - for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) { + for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; + vectorSize++) + { // Loop through vector sizes fchk.vsz = g_arrVecSizes[vectorSize]; dchk.vsz = g_arrVecSizes[vectorSize]; - for ( addressSpace = 0; addressSpace < 3; addressSpace++) { + for (addressSpace = 0; addressSpace < 3; addressSpace++) + { // Loop over address spaces fchk.aspace = addressSpaceNames[addressSpace]; dchk.aspace = addressSpaceNames[addressSpace]; - cl_uint pattern = 0xdeaddead; - memset_pattern4( gOut_half, &pattern, BUFFER_SIZE/2); + if (!gHostReset) + { + error = RunKernel(device, resetKernel, gInBuffer_single, + gOutBuffer_half, count, 0); + } + else + { + cl_uint pattern = 0xdeaddead; + memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2); - error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_FALSE, - 0, count * sizeof(cl_half), - gOut_half, 0, NULL, NULL); - if (error) { - vlog_error( "Failure in clWriteArray\n" ); + error = clEnqueueWriteBuffer( + gQueue, gOutBuffer_half, CL_FALSE, 0, + count * sizeof(cl_half), gOut_half, 0, NULL, NULL); + } + if (error) + { + vlog_error("Failure in clWriteArray\n"); gFailCount++; goto exit; } - error = RunKernel(device, kernels[vectorSize][addressSpace], gInBuffer_single, gOutBuffer_half, - numVecs(count, vectorSize, aligned) , + error = RunKernel(device, kernels[vectorSize][addressSpace], + gInBuffer_single, gOutBuffer_half, + numVecs(count, vectorSize, aligned), runsOverBy(count, vectorSize, aligned)); - if (error) { + if (error) + { gFailCount++; goto exit; } @@ -806,34 +981,51 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL); - if (error) { - vlog_error( "Failure in clReadArray\n" ); + if (error) + { + vlog_error("Failure in clReadArray\n"); gFailCount++; goto exit; } error = ThreadPool_Do(CheckF, threadCount, &fchk); - if (error) { - gFailCount++; - goto exit; - } + if (error) + { + gFailCount++; + goto exit; + } - if (gTestDouble) { - memset_pattern4( gOut_half, &pattern, BUFFER_SIZE/2); + if (gTestDouble) + { - error = clEnqueueWriteBuffer( - gQueue, gOutBuffer_half, CL_FALSE, 0, - count * sizeof(cl_half), gOut_half, 0, NULL, NULL); - if (error) { - vlog_error( "Failure in clWriteArray\n" ); + if (!gHostReset) + { + error = RunKernel(device, resetKernel, gInBuffer_double, + gOutBuffer_half, count, 0); + } + else + { + cl_uint pattern = 0xdeaddead; + memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2); + + error = clEnqueueWriteBuffer( + gQueue, gOutBuffer_half, CL_FALSE, 0, + count * sizeof(cl_half), gOut_half, 0, NULL, NULL); + } + if (error) + { + vlog_error("Failure in clWriteArray\n"); gFailCount++; goto exit; } - error = RunKernel(device, doubleKernels[vectorSize][addressSpace], gInBuffer_double, gOutBuffer_half, + error = RunKernel(device, + doubleKernels[vectorSize][addressSpace], + gInBuffer_double, gOutBuffer_half, numVecs(count, vectorSize, aligned), runsOverBy(count, vectorSize, aligned)); - if (error) { + if (error) + { gFailCount++; goto exit; } @@ -841,148 +1033,185 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR error = clEnqueueReadBuffer( gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL); - if (error) { - vlog_error( "Failure in clReadArray\n" ); + if (error) + { + vlog_error("Failure in clReadArray\n"); gFailCount++; goto exit; } + error = ThreadPool_Do(CheckD, threadCount, &dchk); - if (error) { - gFailCount++; - goto exit; - } - } + if (error) + { + gFailCount++; + goto exit; } } - - if( ((i+blockCount) & ~printMask) == (i+blockCount) ) - { - vlog( "." ); - fflush( stdout ); + } } - } // end last case + + if (((i + blockCount) & ~printMask) == (i + blockCount)) + { + vlog("."); + fflush(stdout); + } + } // end last case loopCount = count == blockCount ? 1 : 100; - if( gReportTimes ) + if (gReportTimes) { - //Init the input stream + // Init the input stream cl_float *p = (cl_float *)gIn_single; - for( j = 0; j < count; j++ ) - p[j] = (float)((double) (rand() - RAND_MAX/2) / (RAND_MAX/2)); + for (j = 0; j < count; j++) + p[j] = (float)((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2)); - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0, count * sizeof( float ), gIn_single, 0, NULL, NULL)) ) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0, + count * sizeof(float), gIn_single, 0, + NULL, NULL))) { - vlog_error( "Failure in clWriteArray\n" ); + vlog_error("Failure in clWriteArray\n"); gFailCount++; goto exit; } - if( gTestDouble ) + if (gTestDouble) { - //Init the input stream + // Init the input stream cl_double *q = (cl_double *)gIn_double; - for( j = 0; j < count; j++ ) - q[j] = ((double) (rand() - RAND_MAX/2) / (RAND_MAX/2)); + for (j = 0; j < count; j++) + q[j] = ((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2)); - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE, 0, count * sizeof( double ), gIn_double, 0, NULL, NULL)) ) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE, + 0, count * sizeof(double), + gIn_double, 0, NULL, NULL))) { - vlog_error( "Failure in clWriteArray\n" ); + vlog_error("Failure in clWriteArray\n"); gFailCount++; goto exit; } } - //Run again for timing - for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) + // Run again for timing + for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; + vectorSize++) { uint64_t bestTime = -1ULL; - for( j = 0; j < loopCount; j++ ) + for (j = 0; j < loopCount; j++) { uint64_t startTime = ReadTime(); - if( (error = RunKernel(device, kernels[vectorSize][0], gInBuffer_single, gOutBuffer_half, numVecs(count, vectorSize, aligned) , - runsOverBy(count, vectorSize, aligned)) ) ) + if ((error = RunKernel(device, kernels[vectorSize][0], + gInBuffer_single, gOutBuffer_half, + numVecs(count, vectorSize, aligned), + runsOverBy(count, vectorSize, aligned)))) { gFailCount++; goto exit; } - if( (error = clFinish(gQueue)) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Failure in clFinish\n" ); + vlog_error("Failure in clFinish\n"); gFailCount++; goto exit; } uint64_t currentTime = ReadTime() - startTime; - if( currentTime < bestTime ) - bestTime = currentTime; - time[ vectorSize ] += currentTime; + if (currentTime < bestTime) bestTime = currentTime; + time[vectorSize] += currentTime; } - if( bestTime < min_time[ vectorSize ] ) - min_time[ vectorSize ] = bestTime ; + if (bestTime < min_time[vectorSize]) + min_time[vectorSize] = bestTime; - if( gTestDouble ) + if (gTestDouble) { bestTime = -1ULL; - for( j = 0; j < loopCount; j++ ) + for (j = 0; j < loopCount; j++) { uint64_t startTime = ReadTime(); - if( (error = RunKernel(device, doubleKernels[vectorSize][0], gInBuffer_double, gOutBuffer_half, numVecs(count, vectorSize, aligned) , - runsOverBy(count, vectorSize, aligned)) ) ) + if ((error = + RunKernel(device, doubleKernels[vectorSize][0], + gInBuffer_double, gOutBuffer_half, + numVecs(count, vectorSize, aligned), + runsOverBy(count, vectorSize, aligned)))) { gFailCount++; goto exit; } - if( (error = clFinish(gQueue)) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Failure in clFinish\n" ); + vlog_error("Failure in clFinish\n"); gFailCount++; goto exit; } uint64_t currentTime = ReadTime() - startTime; - if( currentTime < bestTime ) - bestTime = currentTime; - doubleTime[ vectorSize ] += currentTime; + if (currentTime < bestTime) bestTime = currentTime; + doubleTime[vectorSize] += currentTime; } - if( bestTime < min_double_time[ vectorSize ] ) - min_double_time[ vectorSize ] = bestTime; + if (bestTime < min_double_time[vectorSize]) + min_double_time[vectorSize] = bestTime; } } } - if( gReportTimes ) + if (gReportTimes) { - for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) - vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0, - "average us/elem", "vStoreHalf%s avg. (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) ); - for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) - vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0, - "best us/elem", "vStoreHalf%s best (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) ); - if( gTestDouble ) + for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; + vectorSize++) + vlog_perf(SubtractTime(time[vectorSize], 0) * 1e6 * gDeviceFrequency + * gComputeDevices / (double)(count * loopCount), + 0, "average us/elem", + "vStoreHalf%s avg. (%s vector size: %d)", roundName, + addressSpaceNames[0], (g_arrVecSizes[vectorSize])); + for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; + vectorSize++) + vlog_perf(SubtractTime(min_time[vectorSize], 0) * 1e6 + * gDeviceFrequency * gComputeDevices / (double)count, + 0, "best us/elem", + "vStoreHalf%s best (%s vector size: %d)", roundName, + addressSpaceNames[0], (g_arrVecSizes[vectorSize])); + if (gTestDouble) { - for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) - vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0, - "average us/elem (double)", "vStoreHalf%s avg. d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) ); - for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) - vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0, - "best us/elem (double)", "vStoreHalf%s best d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) ); + for (vectorSize = kMinVectorSize; + vectorSize < kLastVectorSizeToTest; vectorSize++) + vlog_perf(SubtractTime(doubleTime[vectorSize], 0) * 1e6 + * gDeviceFrequency * gComputeDevices + / (double)(count * loopCount), + 0, "average us/elem (double)", + "vStoreHalf%s avg. d (%s vector size: %d)", roundName, + addressSpaceNames[0], (g_arrVecSizes[vectorSize])); + for (vectorSize = kMinVectorSize; + vectorSize < kLastVectorSizeToTest; vectorSize++) + vlog_perf(SubtractTime(min_double_time[vectorSize], 0) * 1e6 + * gDeviceFrequency * gComputeDevices + / (double)count, + 0, "best us/elem (double)", + "vStoreHalf%s best d (%s vector size: %d)", roundName, + addressSpaceNames[0], (g_arrVecSizes[vectorSize])); } } exit: - //clean up - for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) + // clean up + if (!gHostReset) { - for ( addressSpace = 0; addressSpace < 3; addressSpace++) { - clReleaseKernel( kernels[ vectorSize ][ addressSpace ] ); - clReleaseProgram( programs[ vectorSize ][ addressSpace ] ); - if( gTestDouble ) + clReleaseKernel(resetKernel); + clReleaseProgram(resetProgram); + } + + for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; + vectorSize++) + { + for (addressSpace = 0; addressSpace < 3; addressSpace++) + { + clReleaseKernel(kernels[vectorSize][addressSpace]); + clReleaseProgram(programs[vectorSize][addressSpace]); + if (gTestDouble) { - clReleaseKernel( doubleKernels[ vectorSize ][addressSpace] ); - clReleaseProgram( doublePrograms[ vectorSize ][addressSpace] ); + clReleaseKernel(doubleKernels[vectorSize][addressSpace]); + clReleaseProgram(doublePrograms[vectorSize][addressSpace]); } } } @@ -990,321 +1219,495 @@ exit: return error; } -int Test_vStoreaHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleReferenceFunc, const char *roundName ) +int Test_vStoreaHalf_private(cl_device_id device, f2h referenceFunc, + d2h doubleReferenceFunc, const char *roundName) { int vectorSize, error; - cl_program programs[kVectorSizeCount+kStrangeVectorSizeCount][3]; - cl_kernel kernels[kVectorSizeCount+kStrangeVectorSizeCount][3]; + cl_program programs[kVectorSizeCount + kStrangeVectorSizeCount][3]; + cl_kernel kernels[kVectorSizeCount + kStrangeVectorSizeCount][3]; + cl_program resetProgram; + cl_kernel resetKernel; - uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0}; - uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0}; - memset( min_time, -1, sizeof( min_time ) ); - cl_program doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount][3]; - cl_kernel doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount][3]; - uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0}; - uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0}; - memset( min_double_time, -1, sizeof( min_double_time ) ); + uint64_t time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 }; + uint64_t min_time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 }; + memset(min_time, -1, sizeof(min_time)); + cl_program doublePrograms[kVectorSizeCount + kStrangeVectorSizeCount][3]; + cl_kernel doubleKernels[kVectorSizeCount + kStrangeVectorSizeCount][3]; + uint64_t doubleTime[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 }; + uint64_t min_double_time[kVectorSizeCount + kStrangeVectorSizeCount] = { + 0 + }; + memset(min_double_time, -1, sizeof(min_double_time)); bool aligned = true; int minVectorSize = kMinVectorSize; // There is no aligned scalar vstorea_half - if( 0 == minVectorSize ) - minVectorSize = 1; + if (0 == minVectorSize) minVectorSize = 1; - //Loop over vector sizes - for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) + // Loop over vector sizes + for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; + vectorSize++) { - const char *source[] = { - "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n" - "}\n" - }; + const char *source[] = { "__kernel void test( __global float", + vector_size_name_extensions[vectorSize], + " *p, __global half *f )\n" + "{\n" + " size_t i = get_global_id(0);\n" + " vstorea_half", + vector_size_name_extensions[vectorSize], + roundName, + "( p[i], i, f );\n" + "}\n" }; const char *source_v3[] = { "__kernel void test( __global float3 *p, __global half *f )\n" "{\n" " size_t i = get_global_id(0);\n" - " vstorea_half3",roundName,"( p[i], i, f );\n" - " vstore_half",roundName,"( ((__global float *)p)[4*i+3], 4*i+3, f);\n" + " vstorea_half3", + roundName, + "( p[i], i, f );\n" + " vstore_half", + roundName, + "( ((__global float *)p)[4*i+3], 4*i+3, f);\n" "}\n" }; const char *source_private[] = { - "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n" + "__kernel void test( __global float", + vector_size_name_extensions[vectorSize], + " *p, __global half *f )\n" "{\n" - " __private float", vector_size_name_extensions[vectorSize], " data;\n" + " __private float", + vector_size_name_extensions[vectorSize], + " data;\n" " size_t i = get_global_id(0);\n" " data = p[i];\n" - " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n" + " vstorea_half", + vector_size_name_extensions[vectorSize], + roundName, + "( data, i, f );\n" "}\n" }; const char *source_private_v3[] = { "__kernel void test( __global float3 *p, __global half *f )\n" "{\n" - " __private float", vector_size_name_extensions[vectorSize], " data;\n" + " __private float", + vector_size_name_extensions[vectorSize], + " data;\n" " size_t i = get_global_id(0);\n" " data = p[i];\n" - " vstorea_half3",roundName,"( data, i, f );\n" - " vstore_half",roundName,"( ((__global float *)p)[4*i+3], 4*i+3, f);\n" + " vstorea_half3", + roundName, + "( data, i, f );\n" + " vstore_half", + roundName, + "( ((__global float *)p)[4*i+3], 4*i+3, f);\n" "}\n" }; char local_buf_size[10]; sprintf(local_buf_size, "%lld", (uint64_t)gWorkGroupSize); - const char *source_local[] = { - "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n" - "{\n" - " __local float", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n" - " size_t i = get_global_id(0);\n" - " size_t lid = get_local_id(0);\n" - " data[lid] = p[i];\n" - " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n" - "}\n" - }; + const char *source_local[] = { "__kernel void test( __global float", + vector_size_name_extensions[vectorSize], + " *p, __global half *f )\n" + "{\n" + " __local float", + vector_size_name_extensions[vectorSize], + " data[", + local_buf_size, + "];\n" + " size_t i = get_global_id(0);\n" + " size_t lid = get_local_id(0);\n" + " data[lid] = p[i];\n" + " vstorea_half", + vector_size_name_extensions[vectorSize], + roundName, + "( data[lid], i, f );\n" + "}\n" }; const char *source_local_v3[] = { - "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n" + "__kernel void test( __global float", + vector_size_name_extensions[vectorSize], + " *p, __global half *f )\n" "{\n" - " __local float", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n" + " __local float", + vector_size_name_extensions[vectorSize], + " data[", + local_buf_size, + "];\n" " size_t i = get_global_id(0);\n" " size_t lid = get_local_id(0);\n" " data[lid] = p[i];\n" - " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n" - " vstore_half",roundName,"( ((__global float *)p)[4*i+3], 4*i+3, f);\n" + " vstorea_half", + vector_size_name_extensions[vectorSize], + roundName, + "( data[lid], i, f );\n" + " vstore_half", + roundName, + "( ((__global float *)p)[4*i+3], 4*i+3, f);\n" "}\n" }; const char *double_source[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" - "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n" + "__kernel void test( __global double", + vector_size_name_extensions[vectorSize], + " *p, __global half *f )\n" "{\n" " size_t i = get_global_id(0);\n" - " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n" + " vstorea_half", + vector_size_name_extensions[vectorSize], + roundName, + "( p[i], i, f );\n" "}\n" }; const char *double_source_v3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" - "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n" + "__kernel void test( __global double", + vector_size_name_extensions[vectorSize], + " *p, __global half *f )\n" "{\n" " size_t i = get_global_id(0);\n" - " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n" - " vstore_half",roundName,"( ((__global double *)p)[4*i+3], 4*i+3, f);\n" + " vstorea_half", + vector_size_name_extensions[vectorSize], + roundName, + "( p[i], i, f );\n" + " vstore_half", + roundName, + "( ((__global double *)p)[4*i+3], 4*i+3, f);\n" "}\n" }; const char *double_source_private[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" - "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n" + "__kernel void test( __global double", + vector_size_name_extensions[vectorSize], + " *p, __global half *f )\n" "{\n" - " __private double", vector_size_name_extensions[vectorSize], " data;\n" + " __private double", + vector_size_name_extensions[vectorSize], + " data;\n" " size_t i = get_global_id(0);\n" " data = p[i];\n" - " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n" + " vstorea_half", + vector_size_name_extensions[vectorSize], + roundName, + "( data, i, f );\n" "}\n" }; const char *double_source_private_v3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" - "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n" + "__kernel void test( __global double", + vector_size_name_extensions[vectorSize], + " *p, __global half *f )\n" "{\n" - " __private double", vector_size_name_extensions[vectorSize], " data;\n" + " __private double", + vector_size_name_extensions[vectorSize], + " data;\n" " size_t i = get_global_id(0);\n" " data = p[i];\n" - " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n" - " vstore_half",roundName,"( ((__global double *)p)[4*i+3], 4*i+3, f);\n" + " vstorea_half", + vector_size_name_extensions[vectorSize], + roundName, + "( data, i, f );\n" + " vstore_half", + roundName, + "( ((__global double *)p)[4*i+3], 4*i+3, f);\n" "}\n" }; const char *double_source_local[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" - "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n" + "__kernel void test( __global double", + vector_size_name_extensions[vectorSize], + " *p, __global half *f )\n" "{\n" - " __local double", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n" + " __local double", + vector_size_name_extensions[vectorSize], + " data[", + local_buf_size, + "];\n" " size_t i = get_global_id(0);\n" " size_t lid = get_local_id(0);\n" " data[lid] = p[i];\n" - " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n" + " vstorea_half", + vector_size_name_extensions[vectorSize], + roundName, + "( data[lid], i, f );\n" "}\n" }; const char *double_source_local_v3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" - "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n" + "__kernel void test( __global double", + vector_size_name_extensions[vectorSize], + " *p, __global half *f )\n" "{\n" - " __local double", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n" + " __local double", + vector_size_name_extensions[vectorSize], + " data[", + local_buf_size, + "];\n" " size_t i = get_global_id(0);\n" " size_t lid = get_local_id(0);\n" " data[lid] = p[i];\n" - " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n" - " vstore_half",roundName,"( ((__global double *)p)[4*i+3], 4*i+3, f);\n" + " vstorea_half", + vector_size_name_extensions[vectorSize], + roundName, + "( data[lid], i, f );\n" + " vstore_half", + roundName, + "( ((__global double *)p)[4*i+3], 4*i+3, f);\n" "}\n" }; - if(g_arrVecSizes[vectorSize] == 3) { - programs[vectorSize][0] = MakeProgram( device, source_v3, sizeof(source_v3) / sizeof( source_v3[0]) ); - if( NULL == programs[ vectorSize ][0] ) + if (g_arrVecSizes[vectorSize] == 3) + { + programs[vectorSize][0] = MakeProgram( + device, source_v3, sizeof(source_v3) / sizeof(source_v3[0])); + if (NULL == programs[vectorSize][0]) { gFailCount++; return -1; } - } else { - programs[vectorSize][0] = MakeProgram( device, source, sizeof(source) / sizeof( source[0]) ); - if( NULL == programs[ vectorSize ][0] ) + } + else + { + programs[vectorSize][0] = + MakeProgram(device, source, sizeof(source) / sizeof(source[0])); + if (NULL == programs[vectorSize][0]) { gFailCount++; return -1; } } - kernels[ vectorSize ][0] = clCreateKernel( programs[ vectorSize ][0], "test", &error ); - if( NULL == kernels[vectorSize][0] ) + kernels[vectorSize][0] = + clCreateKernel(programs[vectorSize][0], "test", &error); + if (NULL == kernels[vectorSize][0]) { gFailCount++; - vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error ); + vlog_error("\t\tFAILED -- Failed to create kernel. (%d)\n", error); return error; } - if(g_arrVecSizes[vectorSize] == 3) { - programs[vectorSize][1] = MakeProgram( device, source_private_v3, sizeof(source_private_v3) / sizeof( source_private_v3[0]) ); - if( NULL == programs[ vectorSize ][1] ) + if (g_arrVecSizes[vectorSize] == 3) + { + programs[vectorSize][1] = MakeProgram( + device, source_private_v3, + sizeof(source_private_v3) / sizeof(source_private_v3[0])); + if (NULL == programs[vectorSize][1]) { gFailCount++; return -1; } - } else { - programs[vectorSize][1] = MakeProgram( device, source_private, sizeof(source_private) / sizeof( source_private[0]) ); - if( NULL == programs[ vectorSize ][1] ) + } + else + { + programs[vectorSize][1] = + MakeProgram(device, source_private, + sizeof(source_private) / sizeof(source_private[0])); + if (NULL == programs[vectorSize][1]) { gFailCount++; return -1; } } - kernels[ vectorSize ][1] = clCreateKernel( programs[ vectorSize ][1], "test", &error ); - if( NULL == kernels[vectorSize][1] ) + kernels[vectorSize][1] = + clCreateKernel(programs[vectorSize][1], "test", &error); + if (NULL == kernels[vectorSize][1]) { gFailCount++; - vlog_error( "\t\tFAILED -- Failed to create private kernel. (%d)\n", error ); + vlog_error("\t\tFAILED -- Failed to create private kernel. (%d)\n", + error); return error; } - if(g_arrVecSizes[vectorSize] == 3) { - programs[vectorSize][2] = MakeProgram( device, source_local_v3, sizeof(source_local_v3) / sizeof( source_local_v3[0]) ); - if( NULL == programs[ vectorSize ][2] ) + if (g_arrVecSizes[vectorSize] == 3) + { + programs[vectorSize][2] = MakeProgram( + device, source_local_v3, + sizeof(source_local_v3) / sizeof(source_local_v3[0])); + if (NULL == programs[vectorSize][2]) { gFailCount++; return -1; } - } else { - programs[vectorSize][2] = MakeProgram( device, source_local, sizeof(source_local) / sizeof( source_local[0]) ); - if( NULL == programs[ vectorSize ][2] ) + } + else + { + programs[vectorSize][2] = + MakeProgram(device, source_local, + sizeof(source_local) / sizeof(source_local[0])); + if (NULL == programs[vectorSize][2]) { gFailCount++; return -1; } } - kernels[ vectorSize ][2] = clCreateKernel( programs[ vectorSize ][2], "test", &error ); - if( NULL == kernels[vectorSize][2] ) + kernels[vectorSize][2] = + clCreateKernel(programs[vectorSize][2], "test", &error); + if (NULL == kernels[vectorSize][2]) { gFailCount++; - vlog_error( "\t\tFAILED -- Failed to create local kernel. (%d)\n", error ); + vlog_error("\t\tFAILED -- Failed to create local kernel. (%d)\n", + error); return error; } - if( gTestDouble ) + if (gTestDouble) { - if(g_arrVecSizes[vectorSize] == 3) { - doublePrograms[vectorSize][0] = MakeProgram( device, double_source_v3, sizeof(double_source_v3) / sizeof( double_source_v3[0]) ); - if( NULL == doublePrograms[ vectorSize ][0] ) + if (g_arrVecSizes[vectorSize] == 3) + { + doublePrograms[vectorSize][0] = MakeProgram( + device, double_source_v3, + sizeof(double_source_v3) / sizeof(double_source_v3[0])); + if (NULL == doublePrograms[vectorSize][0]) { gFailCount++; return -1; } - } else { - doublePrograms[vectorSize][0] = MakeProgram( device, double_source, sizeof(double_source) / sizeof( double_source[0]) ); - if( NULL == doublePrograms[ vectorSize ][0] ) + } + else + { + doublePrograms[vectorSize][0] = MakeProgram( + device, double_source, + sizeof(double_source) / sizeof(double_source[0])); + if (NULL == doublePrograms[vectorSize][0]) { gFailCount++; return -1; } } - doubleKernels[ vectorSize ][0] = clCreateKernel( doublePrograms[ vectorSize ][0], "test", &error ); - if( NULL == kernels[vectorSize][0] ) + doubleKernels[vectorSize][0] = + clCreateKernel(doublePrograms[vectorSize][0], "test", &error); + if (NULL == kernels[vectorSize][0]) { gFailCount++; - vlog_error( "\t\tFAILED -- Failed to create double kernel. (%d)\n", error ); + vlog_error( + "\t\tFAILED -- Failed to create double kernel. (%d)\n", + error); return error; } - if(g_arrVecSizes[vectorSize] == 3) { - doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private_v3, sizeof(double_source_private_v3) / sizeof( double_source_private_v3[0]) ); - if( NULL == doublePrograms[ vectorSize ][1] ) + if (g_arrVecSizes[vectorSize] == 3) + { + doublePrograms[vectorSize][1] = + MakeProgram(device, double_source_private_v3, + sizeof(double_source_private_v3) + / sizeof(double_source_private_v3[0])); + if (NULL == doublePrograms[vectorSize][1]) { gFailCount++; return -1; } - } else { - doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private, sizeof(double_source_private) / sizeof( double_source_private[0]) ); - if( NULL == doublePrograms[ vectorSize ][1] ) + } + else + { + doublePrograms[vectorSize][1] = + MakeProgram(device, double_source_private, + sizeof(double_source_private) + / sizeof(double_source_private[0])); + if (NULL == doublePrograms[vectorSize][1]) { gFailCount++; return -1; } } - doubleKernels[ vectorSize ][1] = clCreateKernel( doublePrograms[ vectorSize ][1], "test", &error ); - if( NULL == kernels[vectorSize][1] ) + doubleKernels[vectorSize][1] = + clCreateKernel(doublePrograms[vectorSize][1], "test", &error); + if (NULL == kernels[vectorSize][1]) { gFailCount++; - vlog_error( "\t\tFAILED -- Failed to create double private kernel. (%d)\n", error ); + vlog_error("\t\tFAILED -- Failed to create double private " + "kernel. (%d)\n", + error); return error; } - if(g_arrVecSizes[vectorSize] == 3) { - doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local_v3, sizeof(double_source_local_v3) / sizeof( double_source_local_v3[0]) ); - if( NULL == doublePrograms[ vectorSize ][2] ) + if (g_arrVecSizes[vectorSize] == 3) + { + doublePrograms[vectorSize][2] = + MakeProgram(device, double_source_local_v3, + sizeof(double_source_local_v3) + / sizeof(double_source_local_v3[0])); + if (NULL == doublePrograms[vectorSize][2]) { gFailCount++; return -1; } - } else { - doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local, sizeof(double_source_local) / sizeof( double_source_local[0]) ); - if( NULL == doublePrograms[ vectorSize ][2] ) + } + else + { + doublePrograms[vectorSize][2] = + MakeProgram(device, double_source_local, + sizeof(double_source_local) + / sizeof(double_source_local[0])); + if (NULL == doublePrograms[vectorSize][2]) { gFailCount++; return -1; } } - doubleKernels[ vectorSize ][2] = clCreateKernel( doublePrograms[ vectorSize ][2], "test", &error ); - if( NULL == kernels[vectorSize][2] ) + doubleKernels[vectorSize][2] = + clCreateKernel(doublePrograms[vectorSize][2], "test", &error); + if (NULL == kernels[vectorSize][2]) { gFailCount++; - vlog_error( "\t\tFAILED -- Failed to create double local kernel. (%d)\n", error ); + vlog_error("\t\tFAILED -- Failed to create double local " + "kernel. (%d)\n", + error); return error; } } } + const char *reset[] = { + "__kernel void reset( __global float *p, __global ushort *f,\n" + " uint extra_last_thread)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " *(f + i) = 0xdead;" + "}\n" + }; + + if (!gHostReset) + { + resetProgram = + MakeProgram(device, reset, sizeof(reset) / sizeof(reset[0])); + if (NULL == resetProgram) + { + gFailCount++; + return -1; + } + resetKernel = clCreateKernel(resetProgram, "reset", &error); + if (NULL == resetKernel) + { + gFailCount++; + return -1; + } + } + // Figure out how many elements are in a work block size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float)); size_t blockCount = BUFFER_SIZE / elementSize; - uint64_t lastCase = 1ULL << (8*sizeof(float)); + uint64_t lastCase = 1ULL << (8 * sizeof(float)); size_t stride = blockCount; if (gWimpyMode) stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor; // we handle 64-bit types a bit differently. - if( lastCase == 0 ) - lastCase = 0x100000000ULL; + if (lastCase == 0) lastCase = 0x100000000ULL; uint64_t i, j; error = 0; uint64_t printMask = (lastCase >> 4) - 1; @@ -1343,7 +1746,7 @@ int Test_vStoreaHalf_private( cl_device_id device, f2h referenceFunc, d2h double dchk.lim = blockCount; dchk.count = (blockCount + threadCount - 1) / threadCount; - for( i = 0; i < (uint64_t)lastCase; i += stride ) + for (i = 0; i < (uint64_t)lastCase; i += stride) { count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i); fref.i = i; @@ -1352,50 +1755,71 @@ int Test_vStoreaHalf_private( cl_device_id device, f2h referenceFunc, d2h double // Create the input and reference ThreadPool_Do(ReferenceF, threadCount, &fref); - error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0, count * sizeof(float ), gIn_single, 0, NULL, NULL); - if (error) { - vlog_error( "Failure in clWriteArray\n" ); + error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0, + count * sizeof(float), gIn_single, 0, NULL, + NULL); + if (error) + { + vlog_error("Failure in clWriteArray\n"); gFailCount++; goto exit; } - if (gTestDouble) { + if (gTestDouble) + { ThreadPool_Do(ReferenceD, threadCount, &dref); - error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0, count * sizeof(double ), gIn_double, 0, NULL, NULL); - if (error) { - vlog_error( "Failure in clWriteArray\n" ); + error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0, + count * sizeof(double), gIn_double, 0, + NULL, NULL); + if (error) + { + vlog_error("Failure in clWriteArray\n"); gFailCount++; goto exit; } } - for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) { + for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; + vectorSize++) + { // Loop over vector legths fchk.vsz = g_arrVecSizes[vectorSize]; dchk.vsz = g_arrVecSizes[vectorSize]; - for ( addressSpace = 0; addressSpace < 3; addressSpace++) { + for (addressSpace = 0; addressSpace < 3; addressSpace++) + { // Loop over address spaces fchk.aspace = addressSpaceNames[addressSpace]; dchk.aspace = addressSpaceNames[addressSpace]; - cl_uint pattern = 0xdeaddead; - memset_pattern4(gOut_half, &pattern, BUFFER_SIZE/2); + if (!gHostReset) + { + error = RunKernel(device, resetKernel, gInBuffer_single, + gOutBuffer_half, count, 0); + } + else + { + cl_uint pattern = 0xdeaddead; + memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2); - error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_FALSE, - 0, count * sizeof(cl_half), - gOut_half, 0, NULL, NULL); - if (error) { - vlog_error( "Failure in clWriteArray\n" ); + error = clEnqueueWriteBuffer( + gQueue, gOutBuffer_half, CL_FALSE, 0, + count * sizeof(cl_half), gOut_half, 0, NULL, NULL); + } + if (error) + { + vlog_error("Failure in clWriteArray\n"); gFailCount++; goto exit; } - error = RunKernel(device, kernels[vectorSize][addressSpace], gInBuffer_single, gOutBuffer_half, + error = RunKernel(device, kernels[vectorSize][addressSpace], + gInBuffer_single, gOutBuffer_half, numVecs(count, vectorSize, aligned), runsOverBy(count, vectorSize, aligned)); - if (error) { + if (error) + { gFailCount++; goto exit; } @@ -1403,34 +1827,51 @@ int Test_vStoreaHalf_private( cl_device_id device, f2h referenceFunc, d2h double error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL); - if (error) { - vlog_error( "Failure in clReadArray\n" ); + if (error) + { + vlog_error("Failure in clReadArray\n"); gFailCount++; goto exit; } error = ThreadPool_Do(CheckF, threadCount, &fchk); - if (error) { - gFailCount++; - goto exit; - } + if (error) + { + gFailCount++; + goto exit; + } - if (gTestDouble) { - memset_pattern4(gOut_half, &pattern, BUFFER_SIZE/2); + if (gTestDouble) + { - error = clEnqueueWriteBuffer( - gQueue, gOutBuffer_half, CL_FALSE, 0, - count * sizeof(cl_half), gOut_half, 0, NULL, NULL); - if (error) { - vlog_error( "Failure in clWriteArray\n" ); + if (!gHostReset) + { + error = RunKernel(device, resetKernel, gInBuffer_single, + gOutBuffer_half, count, 0); + } + else + { + cl_uint pattern = 0xdeaddead; + memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2); + + error = clEnqueueWriteBuffer( + gQueue, gOutBuffer_half, CL_FALSE, 0, + count * sizeof(cl_half), gOut_half, 0, NULL, NULL); + } + if (error) + { + vlog_error("Failure in clWriteArray\n"); gFailCount++; goto exit; } - error = RunKernel(device, doubleKernels[vectorSize][addressSpace], gInBuffer_double, gOutBuffer_half, + error = RunKernel(device, + doubleKernels[vectorSize][addressSpace], + gInBuffer_double, gOutBuffer_half, numVecs(count, vectorSize, aligned), runsOverBy(count, vectorSize, aligned)); - if (error) { + if (error) + { gFailCount++; goto exit; } @@ -1438,149 +1879,186 @@ int Test_vStoreaHalf_private( cl_device_id device, f2h referenceFunc, d2h double error = clEnqueueReadBuffer( gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL); - if (error) { - vlog_error( "Failure in clReadArray\n" ); + if (error) + { + vlog_error("Failure in clReadArray\n"); gFailCount++; goto exit; } error = ThreadPool_Do(CheckD, threadCount, &dchk); - if (error) { - gFailCount++; - goto exit; - } - } + if (error) + { + gFailCount++; + goto exit; } - } // end for vector size + } + } + } // end for vector size - if( ((i+blockCount) & ~printMask) == (i+blockCount) ) { - vlog( "." ); - fflush( stdout ); + if (((i + blockCount) & ~printMask) == (i + blockCount)) + { + vlog("."); + fflush(stdout); } - } // for end lastcase + } // for end lastcase loopCount = count == blockCount ? 1 : 100; - if( gReportTimes ) + if (gReportTimes) { - //Init the input stream + // Init the input stream cl_float *p = (cl_float *)gIn_single; - for( j = 0; j < count; j++ ) - p[j] = (float)((double) (rand() - RAND_MAX/2) / (RAND_MAX/2)); + for (j = 0; j < count; j++) + p[j] = (float)((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2)); - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0, count * sizeof( float ), gIn_single, 0, NULL, NULL)) ) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0, + count * sizeof(float), gIn_single, 0, + NULL, NULL))) { - vlog_error( "Failure in clWriteArray\n" ); + vlog_error("Failure in clWriteArray\n"); gFailCount++; goto exit; } - if( gTestDouble ) + if (gTestDouble) { - //Init the input stream + // Init the input stream cl_double *q = (cl_double *)gIn_double; - for( j = 0; j < count; j++ ) - q[j] = ((double) (rand() - RAND_MAX/2) / (RAND_MAX/2)); + for (j = 0; j < count; j++) + q[j] = ((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2)); - if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE, 0, count * sizeof( double ), gIn_double, 0, NULL, NULL)) ) + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE, + 0, count * sizeof(double), + gIn_double, 0, NULL, NULL))) { - vlog_error( "Failure in clWriteArray\n" ); + vlog_error("Failure in clWriteArray\n"); gFailCount++; goto exit; } } - //Run again for timing - for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) + // Run again for timing + for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; + vectorSize++) { uint64_t bestTime = -1ULL; - for( j = 0; j < loopCount; j++ ) + for (j = 0; j < loopCount; j++) { uint64_t startTime = ReadTime(); - if( (error = RunKernel(device, kernels[vectorSize][0], gInBuffer_single, gOutBuffer_half, numVecs(count, vectorSize, aligned) , - runsOverBy(count, vectorSize, aligned)) ) ) + if ((error = RunKernel(device, kernels[vectorSize][0], + gInBuffer_single, gOutBuffer_half, + numVecs(count, vectorSize, aligned), + runsOverBy(count, vectorSize, aligned)))) { gFailCount++; goto exit; } - if( (error = clFinish(gQueue)) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Failure in clFinish\n" ); + vlog_error("Failure in clFinish\n"); gFailCount++; goto exit; } uint64_t currentTime = ReadTime() - startTime; - if( currentTime < bestTime ) - bestTime = currentTime; - time[ vectorSize ] += currentTime; + if (currentTime < bestTime) bestTime = currentTime; + time[vectorSize] += currentTime; } - if( bestTime < min_time[ vectorSize ] ) - min_time[ vectorSize ] = bestTime ; + if (bestTime < min_time[vectorSize]) + min_time[vectorSize] = bestTime; - if( gTestDouble ) + if (gTestDouble) { bestTime = -1ULL; - for( j = 0; j < loopCount; j++ ) + for (j = 0; j < loopCount; j++) { uint64_t startTime = ReadTime(); - if( (error = RunKernel(device, doubleKernels[vectorSize][0], gInBuffer_double, gOutBuffer_half, numVecs(count, vectorSize, aligned) , - runsOverBy(count, vectorSize, aligned)) ) ) + if ((error = + RunKernel(device, doubleKernels[vectorSize][0], + gInBuffer_double, gOutBuffer_half, + numVecs(count, vectorSize, aligned), + runsOverBy(count, vectorSize, aligned)))) { gFailCount++; goto exit; } - if( (error = clFinish(gQueue)) ) + if ((error = clFinish(gQueue))) { - vlog_error( "Failure in clFinish\n" ); + vlog_error("Failure in clFinish\n"); gFailCount++; goto exit; } uint64_t currentTime = ReadTime() - startTime; - if( currentTime < bestTime ) - bestTime = currentTime; - doubleTime[ vectorSize ] += currentTime; + if (currentTime < bestTime) bestTime = currentTime; + doubleTime[vectorSize] += currentTime; } - if( bestTime < min_double_time[ vectorSize ] ) - min_double_time[ vectorSize ] = bestTime; + if (bestTime < min_double_time[vectorSize]) + min_double_time[vectorSize] = bestTime; } } } - if( gReportTimes ) + if (gReportTimes) { - for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) - vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0, - "average us/elem", "vStoreaHalf%s avg. (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) ); - for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) - vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0, - "best us/elem", "vStoreaHalf%s best (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) ); - if( gTestDouble ) + for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; + vectorSize++) + vlog_perf(SubtractTime(time[vectorSize], 0) * 1e6 * gDeviceFrequency + * gComputeDevices / (double)(count * loopCount), + 0, "average us/elem", + "vStoreaHalf%s avg. (%s vector size: %d)", roundName, + addressSpaceNames[0], (g_arrVecSizes[vectorSize])); + for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; + vectorSize++) + vlog_perf(SubtractTime(min_time[vectorSize], 0) * 1e6 + * gDeviceFrequency * gComputeDevices / (double)count, + 0, "best us/elem", + "vStoreaHalf%s best (%s vector size: %d)", roundName, + addressSpaceNames[0], (g_arrVecSizes[vectorSize])); + if (gTestDouble) { - for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) - vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0, - "average us/elem (double)", "vStoreaHalf%s avg. d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) ); - for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) - vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0, - "best us/elem (double)", "vStoreaHalf%s best d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) ); + for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; + vectorSize++) + vlog_perf(SubtractTime(doubleTime[vectorSize], 0) * 1e6 + * gDeviceFrequency * gComputeDevices + / (double)(count * loopCount), + 0, "average us/elem (double)", + "vStoreaHalf%s avg. d (%s vector size: %d)", + roundName, addressSpaceNames[0], + (g_arrVecSizes[vectorSize])); + for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; + vectorSize++) + vlog_perf( + SubtractTime(min_double_time[vectorSize], 0) * 1e6 + * gDeviceFrequency * gComputeDevices / (double)count, + 0, "best us/elem (double)", + "vStoreaHalf%s best d (%s vector size: %d)", roundName, + addressSpaceNames[0], (g_arrVecSizes[vectorSize])); } } exit: - //clean up - for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) + // clean up + if (!gHostReset) { - for ( addressSpace = 0; addressSpace < 3; addressSpace++) { - clReleaseKernel( kernels[ vectorSize ][addressSpace] ); - clReleaseProgram( programs[ vectorSize ][addressSpace] ); - if( gTestDouble ) + clReleaseKernel(resetKernel); + clReleaseProgram(resetProgram); + } + + for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; + vectorSize++) + { + for (addressSpace = 0; addressSpace < 3; addressSpace++) + { + clReleaseKernel(kernels[vectorSize][addressSpace]); + clReleaseProgram(programs[vectorSize][addressSpace]); + if (gTestDouble) { - clReleaseKernel( doubleKernels[ vectorSize ][addressSpace] ); - clReleaseProgram( doublePrograms[ vectorSize ][addressSpace] ); + clReleaseKernel(doubleKernels[vectorSize][addressSpace]); + clReleaseProgram(doublePrograms[vectorSize][addressSpace]); } } } return error; } - diff --git a/test_conformance/half/cl_utils.cpp b/test_conformance/half/cl_utils.cpp index 68f7b9cd..04770210 100644 --- a/test_conformance/half/cl_utils.cpp +++ b/test_conformance/half/cl_utils.cpp @@ -35,37 +35,38 @@ const char *align_divisors[kVectorSizeCount+kStrangeVectorSizeCount] = { "1", "2 const char *align_types[kVectorSizeCount+kStrangeVectorSizeCount] = { "half", "int", "int2", "int4", "int8", "int2" }; -void *gIn_half = NULL; -void *gOut_half = NULL; -void *gOut_half_reference = NULL; -void *gOut_half_reference_double = NULL; -void *gIn_single = NULL; -void *gOut_single = NULL; -void *gOut_single_reference = NULL; -void *gIn_double = NULL; -// void *gOut_double = NULL; -// void *gOut_double_reference = NULL; -cl_mem gInBuffer_half = NULL; -cl_mem gOutBuffer_half = NULL; -cl_mem gInBuffer_single = NULL; -cl_mem gOutBuffer_single = NULL; -cl_mem gInBuffer_double = NULL; -// cl_mem gOutBuffer_double = NULL; +void *gIn_half = NULL; +void *gOut_half = NULL; +void *gOut_half_reference = NULL; +void *gOut_half_reference_double = NULL; +void *gIn_single = NULL; +void *gOut_single = NULL; +void *gOut_single_reference = NULL; +void *gIn_double = NULL; +// void *gOut_double = NULL; +// void *gOut_double_reference = NULL; +cl_mem gInBuffer_half = NULL; +cl_mem gOutBuffer_half = NULL; +cl_mem gInBuffer_single = NULL; +cl_mem gOutBuffer_single = NULL; +cl_mem gInBuffer_double = NULL; +// cl_mem gOutBuffer_double = NULL; -cl_context gContext = NULL; +cl_context gContext = NULL; cl_command_queue gQueue = NULL; -uint32_t gDeviceFrequency = 0; -uint32_t gComputeDevices = 0; -size_t gMaxThreadGroupSize = 0; -size_t gWorkGroupSize = 0; -bool gWimpyMode = false; -int gWimpyReductionFactor = 512; -int gTestDouble = 0; +uint32_t gDeviceFrequency = 0; +uint32_t gComputeDevices = 0; +size_t gMaxThreadGroupSize = 0; +size_t gWorkGroupSize = 0; +bool gWimpyMode = false; +int gWimpyReductionFactor = 512; +int gTestDouble = 0; +bool gHostReset = false; #if defined( __APPLE__ ) -int gReportTimes = 1; +int gReportTimes = 1; #else -int gReportTimes = 0; +int gReportTimes = 0; #endif #pragma mark - diff --git a/test_conformance/half/cl_utils.h b/test_conformance/half/cl_utils.h index 50d8af3d..da6073cf 100644 --- a/test_conformance/half/cl_utils.h +++ b/test_conformance/half/cl_utils.h @@ -44,37 +44,38 @@ #include #endif -extern void *gIn_half; -extern void *gOut_half; -extern void *gOut_half_reference; -extern void *gOut_half_reference_double; -extern void *gIn_single; -extern void *gOut_single; -extern void *gOut_single_reference; -extern void *gIn_double; -// extern void *gOut_double; -// extern void *gOut_double_reference; -extern cl_mem gInBuffer_half; -extern cl_mem gOutBuffer_half; -extern cl_mem gInBuffer_single; -extern cl_mem gOutBuffer_single; -extern cl_mem gInBuffer_double; -// extern cl_mem gOutBuffer_double; +extern void *gIn_half; +extern void *gOut_half; +extern void *gOut_half_reference; +extern void *gOut_half_reference_double; +extern void *gIn_single; +extern void *gOut_single; +extern void *gOut_single_reference; +extern void *gIn_double; +// extern void *gOut_double; +// extern void *gOut_double_reference; +extern cl_mem gInBuffer_half; +extern cl_mem gOutBuffer_half; +extern cl_mem gInBuffer_single; +extern cl_mem gOutBuffer_single; +extern cl_mem gInBuffer_double; +// extern cl_mem gOutBuffer_double; -extern cl_context gContext; +extern cl_context gContext; extern cl_command_queue gQueue; -extern uint32_t gDeviceFrequency; -extern uint32_t gComputeDevices; -extern size_t gMaxThreadGroupSize; -extern size_t gWorkGroupSize; -extern int gTestDouble; -extern int gReportTimes; +extern uint32_t gDeviceFrequency; +extern uint32_t gComputeDevices; +extern size_t gMaxThreadGroupSize; +extern size_t gWorkGroupSize; +extern int gTestDouble; +extern int gReportTimes; +extern bool gHostReset; // gWimpyMode indicates if we run the test in wimpy mode where we limit the // size of 32 bit ranges to a much smaller set. This is meant to be used // as a smoke test -extern bool gWimpyMode; -extern int gWimpyReductionFactor; +extern bool gWimpyMode; +extern int gWimpyReductionFactor; uint64_t ReadTime( void ); double SubtractTime( uint64_t endTime, uint64_t startTime ); diff --git a/test_conformance/half/main.cpp b/test_conformance/half/main.cpp index 6bc7db95..ee44fb2d 100644 --- a/test_conformance/half/main.cpp +++ b/test_conformance/half/main.cpp @@ -194,6 +194,8 @@ static int ParseArgs( int argc, const char **argv ) gReportTimes ^= 1; break; + case 'r': gHostReset = true; break; + case 'w': // Wimpy mode gWimpyMode = true; break; @@ -235,13 +237,17 @@ static int ParseArgs( int argc, const char **argv ) static void PrintUsage( void ) { - vlog( "%s [-dthw]: \n", appName ); - vlog( "\t\t-d\tToggle double precision testing (default: on if double supported)\n" ); - vlog( "\t\t-t\tToggle reporting performance data.\n" ); - vlog( "\t\t-w\tRun in wimpy mode\n" ); - vlog( "\t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is 1-12, default factor(%u)\n", gWimpyReductionFactor); - vlog( "\t\t-h\tHelp\n" ); - for( int i = 0; i < test_num; i++ ) + vlog("%s [-dthw]: \n", appName); + vlog("\t\t-d\tToggle double precision testing (default: on if double " + "supported)\n"); + vlog("\t\t-t\tToggle reporting performance data.\n"); + vlog("\t\t-r\tReset buffers on host instead of on device.\n"); + vlog("\t\t-w\tRun in wimpy mode\n"); + vlog("\t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is " + "1-12, default factor(%u)\n", + gWimpyReductionFactor); + vlog("\t\t-h\tHelp\n"); + for (int i = 0; i < test_num; i++) { vlog("\t\t%s\n", test_list[i].name ); }