diff --git a/test_conformance/basic/main.cpp b/test_conformance/basic/main.cpp index f6608958..e89b2747 100644 --- a/test_conformance/basic/main.cpp +++ b/test_conformance/basic/main.cpp @@ -99,6 +99,8 @@ test_definition test_list[] = { ADD_TEST(enqueue_map_image), ADD_TEST(work_item_functions), + ADD_TEST(work_item_functions_out_of_range), + ADD_TEST(work_item_functions_out_of_range_hardcoded), ADD_TEST(astype), diff --git a/test_conformance/basic/procs.h b/test_conformance/basic/procs.h index cf3e8c63..69529b1f 100644 --- a/test_conformance/basic/procs.h +++ b/test_conformance/basic/procs.h @@ -98,6 +98,13 @@ extern int test_enqueue_map_buffer(cl_device_id deviceID, cl_context contex extern int test_enqueue_map_image(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); extern int test_work_item_functions(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); +extern int test_work_item_functions_out_of_range(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_work_item_functions_out_of_range_hardcoded( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements); extern int test_astype(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); diff --git a/test_conformance/basic/test_work_item_functions.cpp b/test_conformance/basic/test_work_item_functions.cpp index d326bb8b..6c2b436f 100644 --- a/test_conformance/basic/test_work_item_functions.cpp +++ b/test_conformance/basic/test_work_item_functions.cpp @@ -15,6 +15,7 @@ // #include "harness/compat.h" +#include #include #include #include @@ -26,6 +27,8 @@ #include "harness/conversions.h" #include "harness/typeWrappers.h" +namespace { + struct work_item_data { cl_uint workDim; @@ -35,146 +38,542 @@ struct work_item_data cl_uint localID[ 3 ]; cl_uint numGroups[ 3 ]; cl_uint groupID[ 3 ]; + cl_uint globalOffset[3]; + cl_uint enqueuedLocalSize[3]; }; -static const char *workItemKernelCode = -"typedef struct {\n" -" uint workDim;\n" -" uint globalSize[ 3 ];\n" -" uint globalID[ 3 ];\n" -" uint localSize[ 3 ];\n" -" uint localID[ 3 ];\n" -" uint numGroups[ 3 ];\n" -" uint groupID[ 3 ];\n" -" } work_item_data;\n" -"\n" -"__kernel void sample_kernel( __global work_item_data *outData )\n" -"{\n" -" int id = get_global_id(0);\n" -" outData[ id ].workDim = (uint)get_work_dim();\n" -" for( uint i = 0; i < get_work_dim(); i++ )\n" -" {\n" -" outData[ id ].globalSize[ i ] = (uint)get_global_size( i );\n" -" outData[ id ].globalID[ i ] = (uint)get_global_id( i );\n" -" outData[ id ].localSize[ i ] = (uint)get_local_size( i );\n" -" outData[ id ].localID[ i ] = (uint)get_local_id( i );\n" -" outData[ id ].numGroups[ i ] = (uint)get_num_groups( i );\n" -" outData[ id ].groupID[ i ] = (uint)get_group_id( i );\n" -" }\n" -"}"; +const char *workItemKernelCode = + R"(typedef struct { + uint workDim; + uint globalSize[ 3 ]; + uint globalID[ 3 ]; + uint localSize[ 3 ]; + uint localID[ 3 ]; + uint numGroups[ 3 ]; + uint groupID[ 3 ]; + uint globalOffset[ 3 ]; + uint enqueuedLocalSize[ 3 ]; + } work_item_data; + +__kernel void sample_kernel( __global work_item_data *outData ) +{ + int id = get_global_id(0); + outData[ id ].workDim = (uint)get_work_dim(); + for( uint i = 0; i < get_work_dim(); i++ ) + { + outData[ id ].globalSize[ i ] = (uint)get_global_size( i ); + outData[ id ].globalID[ i ] = (uint)get_global_id( i ); + outData[ id ].localSize[ i ] = (uint)get_local_size( i ); + outData[ id ].localID[ i ] = (uint)get_local_id( i ); + outData[ id ].numGroups[ i ] = (uint)get_num_groups( i ); + outData[ id ].groupID[ i ] = (uint)get_group_id( i ); + } +})"; + +struct work_item_data_out_of_range +{ + cl_uint workDim; + cl_uint globalSize; + cl_uint globalID; + cl_uint localSize; + cl_uint localID; + cl_uint numGroups; + cl_uint groupID; + cl_uint globalOffset; + cl_uint enqueuedLocalSize; +}; + +const char *outOfRangeWorkItemKernelCode = + R"(typedef struct { + uint workDim; + uint globalSize; + uint globalID; + uint localSize; + uint localID; + uint numGroups; + uint groupID; + uint globalOffset; + uint enqueuedLocalSize; + } work_item_data; + +__kernel void sample_kernel( __global work_item_data *outData, int dim_param ) +{ + int ind_mul=1; + int ind=0; + for( uint i = 0; i < get_work_dim(); i++ ) + { + ind += (uint)get_global_id(i) * ind_mul; + ind_mul *= get_global_size(i); + } + outData[ind].workDim = (uint)get_work_dim(); + + uint dimindx=dim_param; + outData[ind].globalSize = (uint)get_global_size(dimindx); + outData[ind].globalID = (uint)get_global_id(dimindx); + outData[ind].localSize = (uint)get_local_size(dimindx); + outData[ind].localID = (uint)get_local_id(dimindx); + outData[ind].numGroups = (uint)get_num_groups(dimindx); + outData[ind].groupID = (uint)get_group_id(dimindx); +#if __OPENCL_VERSION__ >= CL_VERSION_2_0 + outData[ind].enqueuedLocalSize = (uint)get_enqueued_local_size(dimindx); + outData[ind].globalOffset = (uint)get_global_offset(dimindx); +#elif __OPENCL_VERSION__ >= CL_VERSION_1_1 + outData[ind].globalOffset = (uint)get_global_offset(dimindx); +#endif +})"; + +const char *outOfRangeWorkItemHardcodedKernelCode = + R"(typedef struct { + uint workDim; + uint globalSize; + uint globalID; + uint localSize; + uint localID; + uint numGroups; + uint groupID; + uint globalOffset; + uint enqueuedLocalSize; + } work_item_data; + +__kernel void sample_kernel( __global work_item_data *outData, int dim_param ) +{ + int ind_mul=1; + int ind=0; + for( uint i = 0; i < get_work_dim(); i++ ) + { + ind += (uint)get_global_id(i) * ind_mul; + ind_mul *= get_global_size(i); + } + outData[ind].workDim = (uint)get_work_dim(); + outData[ind].globalSize = (uint)get_global_size(4); + outData[ind].globalID = (uint)get_global_id(4); + outData[ind].localSize = (uint)get_local_size(4); + outData[ind].localID = (uint)get_local_id(4); + outData[ind].numGroups = (uint)get_num_groups(4); + outData[ind].groupID = (uint)get_group_id(4); +#if __OPENCL_VERSION__ >= CL_VERSION_2_0 + outData[ind].enqueuedLocalSize = (uint)get_enqueued_local_size(4); + outData[ind].globalOffset = (uint)get_global_offset(4); +#elif __OPENCL_VERSION__ >= CL_VERSION_1_1 + outData[ind].globalOffset = (uint)get_global_offset(4); +#endif +})"; #define NUM_TESTS 1 -int test_work_item_functions(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +struct TestWorkItemFns { - int error; + TestWorkItemFns(cl_device_id deviceID, cl_context context, + cl_command_queue queue) + : device(deviceID), context(context), queue(queue), program(nullptr), + kernel(nullptr), outData(nullptr), d_holder(gRandomSeed), + testData(10240) + {} - clProgramWrapper program; - clKernelWrapper kernel; - clMemWrapper outData; - std::vector testData(10240); - size_t threads[3], localThreads[3]; - MTdata d; - - - error = create_single_kernel_helper( context, &program, &kernel, 1, &workItemKernelCode, "sample_kernel" ); - test_error( error, "Unable to create testing kernel" ); - - outData = - clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(work_item_data) * testData.size(), NULL, &error); - test_error( error, "Unable to create output buffer" ); - - error = clSetKernelArg( kernel, 0, sizeof( outData ), &outData ); - test_error( error, "Unable to set kernel arg" ); - - d = init_genrand( gRandomSeed ); - for( size_t dim = 1; dim <= 3; dim++ ) + cl_int SetUp(const char *src) { - for( int i = 0; i < NUM_TESTS; i++ ) + cl_int error = create_single_kernel_helper(context, &program, &kernel, + 1, &src, "sample_kernel"); + test_error(error, "Unable to create testing kernel"); + + outData = clCreateBuffer(context, CL_MEM_READ_WRITE, + sizeof(work_item_data) * testData.size(), NULL, + &error); + test_error(error, "Unable to create output buffer"); + + error = clSetKernelArg(kernel, 0, sizeof(outData), &outData); + test_error(error, "Unable to set kernel arg"); + + return CL_SUCCESS; + } + + cl_int Run() + { + cl_int error = SetUp(workItemKernelCode); + test_error(error, "SetUp failed"); + + size_t threads[3] = { 0, 0, 0 }; + size_t localThreads[3] = { 0, 0, 0 }; + for (size_t dim = 1; dim <= 3; dim++) { - for( size_t j = 0; j < dim; j++ ) + for (int i = 0; i < NUM_TESTS; i++) { - // All of our thread sizes should be within the max local sizes, since they're all <= 20 - threads[ j ] = (size_t)random_in_range( 1, 20, d ); - localThreads[ j ] = threads[ j ] / (size_t)random_in_range( 1, (int)threads[ j ], d ); - while( localThreads[ j ] > 1 && ( threads[ j ] % localThreads[ j ] != 0 ) ) - localThreads[ j ]--; - - // Hack for now: localThreads > 1 are iffy - localThreads[ j ] = 1; - } - error = clEnqueueNDRangeKernel( queue, kernel, (cl_uint)dim, NULL, threads, localThreads, 0, NULL, NULL ); - test_error( error, "Unable to run kernel" ); - - error = - clEnqueueReadBuffer(queue, outData, CL_TRUE, 0, - sizeof(work_item_data) * testData.size(), - testData.data(), 0, NULL, NULL); - test_error( error, "Unable to read results" ); - - // Validate - for( size_t q = 0; q < threads[0]; q++ ) - { - // We can't really validate the actual value of each one, but we can validate that they're within a sane range - if( testData[ q ].workDim != (cl_uint)dim ) + for (size_t j = 0; j < dim; j++) { - log_error( "ERROR: get_work_dim() did not return proper value for %d dimensions (expected %d, got %d)\n", (int)dim, (int)dim, (int)testData[ q ].workDim ); - free_mtdata(d); - return -1; + // All of our thread sizes should be within the max local + // sizes, since they're all <= 20 + threads[j] = (size_t)random_in_range(1, 20, d_holder); + localThreads[j] = threads[j] + / (size_t)random_in_range(1, (int)threads[j], d_holder); + while (localThreads[j] > 1 + && (threads[j] % localThreads[j] != 0)) + localThreads[j]--; + + // Hack for now: localThreads > 1 are iffy + localThreads[j] = 1; } - for( size_t j = 0; j < dim; j++ ) + error = clEnqueueNDRangeKernel(queue, kernel, (cl_uint)dim, + NULL, threads, localThreads, 0, + NULL, NULL); + test_error(error, "Unable to run kernel"); + + error = clEnqueueReadBuffer(queue, outData, CL_TRUE, 0, + sizeof(work_item_data) + * testData.size(), + testData.data(), 0, NULL, NULL); + test_error(error, "Unable to read results"); + + // Validate + for (size_t q = 0; q < threads[0]; q++) { - if( testData[ q ].globalSize[ j ] != (cl_uint)threads[ j ] ) + // We can't really validate the actual value of each one, + // but we can validate that they're within a sane range + if (testData[q].workDim != (cl_uint)dim) { - log_error( "ERROR: get_global_size(%d) did not return proper value for %d dimensions (expected %d, got %d)\n", - (int)j, (int)dim, (int)threads[ j ], (int)testData[ q ].globalSize[ j ] ); - free_mtdata(d); + log_error( + "ERROR: get_work_dim() did not return proper value " + "for %d dimensions (expected %d, got %d)\n", + (int)dim, (int)dim, (int)testData[q].workDim); return -1; } - if (testData[q].globalID[j] >= (cl_uint)threads[j]) + for (size_t j = 0; j < dim; j++) { - log_error( "ERROR: get_global_id(%d) did not return proper value for %d dimensions (max %d, got %d)\n", - (int)j, (int)dim, (int)threads[ j ], (int)testData[ q ].globalID[ j ] ); - free_mtdata(d); - return -1; - } - if( testData[ q ].localSize[ j ] != (cl_uint)localThreads[ j ] ) - { - log_error( "ERROR: get_local_size(%d) did not return proper value for %d dimensions (expected %d, got %d)\n", - (int)j, (int)dim, (int)localThreads[ j ], (int)testData[ q ].localSize[ j ] ); - free_mtdata(d); - return -1; - } - if (testData[q].localID[j] >= (cl_uint)localThreads[j]) - { - log_error( "ERROR: get_local_id(%d) did not return proper value for %d dimensions (max %d, got %d)\n", - (int)j, (int)dim, (int)localThreads[ j ], (int)testData[ q ].localID[ j ] ); - free_mtdata(d); - return -1; - } - size_t groupCount = ( threads[ j ] + localThreads[ j ] - 1 ) / localThreads[ j ]; - if( testData[ q ].numGroups[ j ] != (cl_uint)groupCount ) - { - log_error( "ERROR: get_num_groups(%d) did not return proper value for %d dimensions (expected %d with global dim %d and local dim %d, got %d)\n", - (int)j, (int)dim, (int)groupCount, (int)threads[ j ], (int)localThreads[ j ], (int)testData[ q ].numGroups[ j ] ); - free_mtdata(d); - return -1; - } - if (testData[q].groupID[j] >= (cl_uint)groupCount) - { - log_error( "ERROR: get_group_id(%d) did not return proper value for %d dimensions (max %d, got %d)\n", - (int)j, (int)dim, (int)groupCount, (int)testData[ q ].groupID[ j ] ); - free_mtdata(d); - return -1; + if (testData[q].globalSize[j] != (cl_uint)threads[j]) + { + log_error("ERROR: get_global_size(%d) did not " + "return proper value for %d dimensions " + "(expected %d, got %d)\n", + (int)j, (int)dim, (int)threads[j], + (int)testData[q].globalSize[j]); + return -1; + } + if (testData[q].globalID[j] >= (cl_uint)threads[j]) + { + log_error("ERROR: get_global_id(%d) did not return " + "proper value for %d dimensions (max %d, " + "got %d)\n", + (int)j, (int)dim, (int)threads[j], + (int)testData[q].globalID[j]); + return -1; + } + if (testData[q].localSize[j] + != (cl_uint)localThreads[j]) + { + log_error("ERROR: get_local_size(%d) did not " + "return proper value for %d dimensions " + "(expected %d, got %d)\n", + (int)j, (int)dim, (int)localThreads[j], + (int)testData[q].localSize[j]); + return -1; + } + if (testData[q].localID[j] >= (cl_uint)localThreads[j]) + { + log_error( + "ERROR: get_local_id(%d) did not return proper " + "value for %d dimensions (max %d, got %d)\n", + (int)j, (int)dim, (int)localThreads[j], + (int)testData[q].localID[j]); + return -1; + } + size_t groupCount = (threads[j] + localThreads[j] - 1) + / localThreads[j]; + if (testData[q].numGroups[j] != (cl_uint)groupCount) + { + log_error("ERROR: get_num_groups(%d) did not " + "return proper value for %d dimensions " + "(expected %d with global dim %d and " + "local dim %d, got %d)\n", + (int)j, (int)dim, (int)groupCount, + (int)threads[j], (int)localThreads[j], + (int)testData[q].numGroups[j]); + return -1; + } + if (testData[q].groupID[j] >= (cl_uint)groupCount) + { + log_error( + "ERROR: get_group_id(%d) did not return proper " + "value for %d dimensions (max %d, got %d)\n", + (int)j, (int)dim, (int)groupCount, + (int)testData[q].groupID[j]); + return -1; + } } } } } + return 0; } - free_mtdata(d); - return 0; + cl_device_id device; + cl_context context; + cl_command_queue queue; + clProgramWrapper program; + clKernelWrapper kernel; + clMemWrapper outData; + MTdataHolder d_holder; + + std::vector testData; +}; + +struct TestWorkItemFnsOutOfRange +{ + size_t threads[3] = { 0, 0, 0 }; + + TestWorkItemFnsOutOfRange(cl_device_id deviceID, cl_context context, + cl_command_queue queue, const char *ksrc) + : device(deviceID), context(context), queue(queue), program(nullptr), + kernel(nullptr), outData(nullptr), d_holder(gRandomSeed), + testData(10240), max_workgroup_size(0), kernel_src(ksrc) + {} + + virtual cl_int SetUp(const char *src) + { + cl_int error = create_single_kernel_helper(context, &program, &kernel, + 1, &src, "sample_kernel"); + test_error(error, "Unable to create testing kernel"); + + outData = clCreateBuffer(context, CL_MEM_READ_WRITE, + sizeof(work_item_data_out_of_range) + * testData.size(), + NULL, &error); + test_error(error, "Unable to create output buffer"); + + error = clSetKernelArg(kernel, 0, sizeof(outData), &outData); + test_error(error, "Unable to set kernel arg"); + + error = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, + sizeof(size_t) * maxWorkItemSizes.size(), + maxWorkItemSizes.data(), NULL); + test_error(error, + "clDeviceInfo for CL_DEVICE_MAX_WORK_ITEM_SIZES failed"); + + error = clGetKernelWorkGroupInfo( + kernel, device, CL_KERNEL_WORK_GROUP_SIZE, + sizeof(max_workgroup_size), &max_workgroup_size, NULL); + test_error(error, "clGetKernelWorkgroupInfo failed."); + + return CL_SUCCESS; + } + + bool Validate(const cl_uint dim) + { + cl_uint threads_to_verify = 1; + for (size_t j = 0; j < dim; j++) threads_to_verify *= threads[j]; + + for (size_t q = 0; q < threads_to_verify; q++) + { + if (testData[q].workDim != (cl_uint)dim) + { + log_error("ERROR: get_work_dim() did not return proper value " + "for %d dimensions (expected %d, got %d)\n", + (int)dim, (int)dim, (int)testData[q].workDim); + return false; + } + if (testData[q].globalSize != 1) + { + log_error("ERROR: get_global_size(%d) did not return " + "proper value for the argument out of range " + "(expected 1, got %d)\n", + (int)dim, (int)testData[q].globalSize); + return false; + } + if (testData[q].globalID != 0) + { + log_error("ERROR: get_global_id(%d) did not return " + "proper value for the argument out of range " + "(expected 0, got %d)\n", + (int)dim, (int)testData[q].globalID); + return false; + } + if (testData[q].localSize != 1) + { + log_error("ERROR: get_local_size(%d) did not return " + "proper value for the argument out of range " + "(expected 1, got %d)\n", + (int)dim, (int)testData[q].localSize); + return false; + } + if (testData[q].localID != 0) + { + log_error("ERROR: get_local_id(%d) did not return " + "proper value for the argument out of range " + "(expected 0, got %d)\n", + (int)dim, (int)testData[q].localID); + return false; + } + if (testData[q].numGroups != 1) + { + log_error("ERROR: get_num_groups(%d) did not return " + "proper value for the argument out of range " + "(expected 1, got %d)\n", + (int)dim, (int)testData[q].numGroups); + return false; + } + if (testData[q].groupID != 0) + { + log_error("ERROR: get_group_id(%d) did not return " + "proper value for the argument out of range " + "(expected 0, got %d)\n", + (int)dim, (int)testData[q].groupID); + return false; + } + } + + const Version version = get_device_cl_version(device); + if (version >= Version(2, 0)) + { + for (size_t q = 0; q < threads_to_verify; q++) + { + if (testData[q].globalOffset != 0) + { + log_error( + "ERROR: get_global_offset(%d) did not return " + "proper value " + "for the argument out of range (expected 0, got %d)\n", + (int)dim, (int)testData[q].globalOffset); + return false; + } + if (testData[q].enqueuedLocalSize != 1) + { + log_error( + "ERROR: get_enqueued_local_size(%d) did not return " + "proper value for the argument out of range " + "(expected 1, got %d)\n", + (int)dim, (int)testData[q].globalSize); + return false; + } + } + } + else if (version >= Version(1, 1)) + { + for (size_t q = 0; q < threads_to_verify; q++) + { + if (testData[q].globalOffset != 0) + { + log_error( + "ERROR: get_global_offset(%d) did not return " + "proper value " + "for the argument out of range (expected 0, got %d)\n", + (int)dim, (int)testData[q].globalOffset); + return false; + } + } + } + + return true; + } + + cl_int Run() + { + cl_int error = SetUp(kernel_src); + test_error(error, "SetUp failed"); + + size_t localThreads[3] = { 0, 0, 0 }; + + for (size_t dim = 1; dim <= 3; dim++) + { + size_t local_workgroup_size[3] = { maxWorkItemSizes[0], + maxWorkItemSizes[1], + maxWorkItemSizes[2] }; + // check if maximum work group size for current dimention is not + // exceeded + cl_uint work_group_size = max_workgroup_size + 1; + while (max_workgroup_size < work_group_size && work_group_size != 1) + { + work_group_size = 1; + for (size_t j = 0; j < dim; j++) + work_group_size *= local_workgroup_size[j]; + if (max_workgroup_size < work_group_size) + { + for (size_t j = 0; j < dim; j++) + local_workgroup_size[j] = + std::max(1, (int)local_workgroup_size[j] / 2); + } + }; + + // compute max number of work groups based on buffer size and max + // group size + cl_uint max_work_groups = testData.size() / work_group_size; + // take into account number of dimentions + cl_uint work_groups_per_dim = + std::max(1, (int)pow(max_work_groups, 1.f / dim)); + + for (size_t j = 0; j < dim; j++) + { + // generate ranges for uniform work group size + localThreads[j] = + random_in_range(1, (int)local_workgroup_size[j], d_holder); + size_t num_groups = + (size_t)random_in_range(1, work_groups_per_dim, d_holder); + threads[j] = num_groups * localThreads[j]; + } + + cl_int dim_param = dim + 1; + error = clSetKernelArg(kernel, 1, sizeof(cl_int), &dim_param); + test_error(error, "Unable to set kernel arg"); + + error = + clEnqueueNDRangeKernel(queue, kernel, (cl_uint)dim, NULL, + threads, localThreads, 0, NULL, NULL); + test_error(error, "Unable to run kernel"); + + error = clEnqueueReadBuffer(queue, outData, CL_TRUE, 0, + sizeof(work_item_data_out_of_range) + * testData.size(), + testData.data(), 0, NULL, NULL); + test_error(error, "Unable to read results"); + + // Validate + if (!Validate(dim)) + { + log_error("Validation failed"); + return TEST_FAIL; + } + } + return TEST_PASS; + } + + cl_device_id device; + cl_context context; + cl_command_queue queue; + clProgramWrapper program; + clKernelWrapper kernel; + clMemWrapper outData; + MTdataHolder d_holder; + + std::vector testData; + + std::array maxWorkItemSizes; + size_t max_workgroup_size; + + const char *kernel_src; +}; + +} // anonymous namespace + +int test_work_item_functions(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) +{ + TestWorkItemFns fnct(deviceID, context, queue); + return fnct.Run(); } +int test_work_item_functions_out_of_range(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements) +{ + TestWorkItemFnsOutOfRange fnct(deviceID, context, queue, + outOfRangeWorkItemKernelCode); + return fnct.Run(); +} +int test_work_item_functions_out_of_range_hardcoded(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements) +{ + TestWorkItemFnsOutOfRange fnct(deviceID, context, queue, + outOfRangeWorkItemHardcodedKernelCode); + return fnct.Run(); +}