diff --git a/test_conformance/thread_dimensions/main.cpp b/test_conformance/thread_dimensions/main.cpp index 9a1ce609..236d7731 100644 --- a/test_conformance/thread_dimensions/main.cpp +++ b/test_conformance/thread_dimensions/main.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -19,25 +19,74 @@ #include #include "procs.h" +// Additional parameters to limit test scope (-n,-b,-x) +cl_uint maxThreadDimension = 0; +cl_uint bufferSize = 0; +cl_uint bufferStep = 0; + test_definition test_list[] = { - ADD_TEST( quick_1d_explicit_local ), - ADD_TEST( quick_2d_explicit_local ), - ADD_TEST( quick_3d_explicit_local ), - ADD_TEST( quick_1d_implicit_local ), - ADD_TEST( quick_2d_implicit_local ), - ADD_TEST( quick_3d_implicit_local ), - ADD_TEST( full_1d_explicit_local ), - ADD_TEST( full_2d_explicit_local ), - ADD_TEST( full_3d_explicit_local ), - ADD_TEST( full_1d_implicit_local ), - ADD_TEST( full_2d_implicit_local ), - ADD_TEST( full_3d_implicit_local ), + ADD_TEST(quick_1d_explicit_local), ADD_TEST(quick_2d_explicit_local), + ADD_TEST(quick_3d_explicit_local), ADD_TEST(quick_1d_implicit_local), + ADD_TEST(quick_2d_implicit_local), ADD_TEST(quick_3d_implicit_local), + ADD_TEST(full_1d_explicit_local), ADD_TEST(full_2d_explicit_local), + ADD_TEST(full_3d_explicit_local), ADD_TEST(full_1d_implicit_local), + ADD_TEST(full_2d_implicit_local), ADD_TEST(full_3d_implicit_local), }; -const int test_num = ARRAY_SIZE( test_list ); +const int test_num = ARRAY_SIZE(test_list); int main(int argc, const char *argv[]) { + int delArg = 0; + for (auto i = 0; i < argc; i++) + { + delArg = 0; + + if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) + { + log_info("Thread dimensions options:\n"); + log_info("\t-n\tMaximum thread dimension value\n"); + log_info("\t-b\tSpecifies a buffer size for calculations\n"); + log_info("\t-x\tSpecifies a step for calculations\n"); + } + if (strcmp(argv[i], "-n") == 0) + { + delArg++; + if (atoi(argv[i + 1]) < 1) + { + log_info("ERROR: -n Maximum thread dimension value must be " + "greater than 0"); + return TEST_FAIL; + } + maxThreadDimension = atoi(argv[i + 1]); + delArg++; + } + if (strcmp(argv[i], "-b") == 0) + { + delArg++; + if (atoi(argv[i + 1]) < 1) + { + log_info("ERROR: -b Buffer size must be greater than 0"); + return TEST_FAIL; + } + bufferSize = atoi(argv[i + 1]); + delArg++; + } + if (strcmp(argv[i], "-x") == 0) + { + delArg++; + if (atoi(argv[i + 1]) < 1) + { + log_info("ERROR: -x Buffer step must be greater than 0"); + return TEST_FAIL; + } + bufferStep = atoi(argv[i + 1]); + delArg++; + } + for (int j = i; j < argc - delArg; j++) argv[j] = argv[j + delArg]; + argc -= delArg; + i -= delArg; + } + return runTestHarness(argc, argv, test_num, test_list, false, 0); } - diff --git a/test_conformance/thread_dimensions/procs.h b/test_conformance/thread_dimensions/procs.h index d01d3c50..261d8bf2 100644 --- a/test_conformance/thread_dimensions/procs.h +++ b/test_conformance/thread_dimensions/procs.h @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -21,17 +21,52 @@ extern const int kVectorSizeCount; -extern int test_quick_1d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_quick_2d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_quick_3d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_quick_1d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_quick_2d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_quick_3d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); - -extern int test_full_1d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_full_2d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_full_3d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_full_1d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_full_2d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_full_3d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); +extern int test_quick_1d_explicit_local(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_quick_2d_explicit_local(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_quick_3d_explicit_local(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_quick_1d_implicit_local(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_quick_2d_implicit_local(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_quick_3d_implicit_local(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_full_1d_explicit_local(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_full_2d_explicit_local(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_full_3d_explicit_local(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_full_1d_implicit_local(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_full_2d_implicit_local(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_full_3d_implicit_local(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); diff --git a/test_conformance/thread_dimensions/test_thread_dimensions.cpp b/test_conformance/thread_dimensions/test_thread_dimensions.cpp index fa8e2847..8eec15c1 100644 --- a/test_conformance/thread_dimensions/test_thread_dimensions.cpp +++ b/test_conformance/thread_dimensions/test_thread_dimensions.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -25,38 +25,46 @@ #define ITERATIONS 4 #define DEBUG 0 -// If the environment variable DO_NOT_LIMIT_THREAD_SIZE is not set, the test will limit the maximum total -// global dimensions tested to this value. -#define MAX_TOTAL_GLOBAL_THREADS_FOR_TEST (1<<24) +// If the environment variable DO_NOT_LIMIT_THREAD_SIZE is not set, the test +// will limit the maximum total global dimensions tested to this value. +#define MAX_TOTAL_GLOBAL_THREADS_FOR_TEST (1 << 24) int limit_size = 0; -static int -get_maximums(cl_kernel kernel, cl_context context, - size_t *max_workgroup_size_result, - cl_ulong *max_allcoation_result, - cl_ulong *max_physical_result) { +extern cl_uint maxThreadDimension; +extern cl_uint bufferSize; +extern cl_uint bufferStep; + +static int get_maximums(cl_kernel kernel, cl_context context, + size_t *max_workgroup_size_result, + cl_ulong *max_allcoation_result, + cl_ulong *max_physical_result) +{ int err = 0; cl_uint i; cl_device_id *devices; // Get all the devices in the device group size_t num_devices_returned; - err = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &num_devices_returned); - if(err != CL_SUCCESS) + err = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, + &num_devices_returned); + if (err != CL_SUCCESS) { log_error("clGetContextInfo() failed (%d).\n", err); return -10; } devices = (cl_device_id *)malloc(num_devices_returned); - err = clGetContextInfo(context, CL_CONTEXT_DEVICES, num_devices_returned, devices, NULL); - if(err != CL_SUCCESS) + err = clGetContextInfo(context, CL_CONTEXT_DEVICES, num_devices_returned, + devices, NULL); + if (err != CL_SUCCESS) { log_error("clGetContextInfo() failed (%d).\n", err); return -10; } num_devices_returned /= sizeof(cl_device_id); - if (num_devices_returned > 1) log_info("%d devices in device group.\n", (int)num_devices_returned); - if (num_devices_returned < 1) { + if (num_devices_returned > 1) + log_info("%d devices in device group.\n", (int)num_devices_returned); + if (num_devices_returned < 1) + { log_error("0 devices found for this kernel.\n"); return -1; } @@ -69,12 +77,16 @@ get_maximums(cl_kernel kernel, cl_context context, cl_ulong max_physical = 0; cl_ulong current_physical = 0; - for (i=0; i= final_x_size)\n" -" error = 64;\n" -" if (get_global_id(1) >= final_y_size)\n" -" error = 128;\n" -" if (get_global_id(2) >= final_z_size)\n" -" error = 256;\n" -"\n" -" unsigned long t_address = (unsigned long)get_global_id(2)*(unsigned long)final_y_size*(unsigned long)final_x_size + \n" -" (unsigned long)get_global_id(1)*(unsigned long)final_x_size + (unsigned long)get_global_id(0);\n" -" if ((t_address >= start_address) && (t_address < end_address))\n" -" atom_add(&dst[t_address-start_address], 1u);\n" -" if (error)\n" -" atom_or(&dst[t_address-start_address], error);\n" -"\n" -"}\n"; + "\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" + "__kernel void test_thread_dimension_atomic(__global uint *dst, \n" + " uint final_x_size, uint final_y_size, uint final_z_size,\n" + " ulong start_address, ulong end_address)\n" + "{\n" + " uint error = 0;\n" + " if (get_global_id(0) >= final_x_size)\n" + " error = 64;\n" + " if (get_global_id(1) >= final_y_size)\n" + " error = 128;\n" + " if (get_global_id(2) >= final_z_size)\n" + " error = 256;\n" + "\n" + " unsigned long t_address = (unsigned " + "long)get_global_id(2)*(unsigned long)final_y_size*(unsigned " + "long)final_x_size + \n" + " (unsigned long)get_global_id(1)*(unsigned " + "long)final_x_size + (unsigned long)get_global_id(0);\n" + " if ((t_address >= start_address) && (t_address < end_address))\n" + " atom_add(&dst[t_address-start_address], 1u);\n" + " if (error)\n" + " atom_or(&dst[t_address-start_address], error);\n" + "\n" + "}\n"; static const char *thread_dimension_kernel_code_not_atomic_long = -"\n" -"__kernel void test_thread_dimension_not_atomic(__global uint *dst, \n" -" uint final_x_size, uint final_y_size, uint final_z_size,\n" -" ulong start_address, ulong end_address)\n" -"{\n" -" uint error = 0;\n" -" if (get_global_id(0) >= final_x_size)\n" -" error = 64;\n" -" if (get_global_id(1) >= final_y_size)\n" -" error = 128;\n" -" if (get_global_id(2) >= final_z_size)\n" -" error = 256;\n" -"\n" -" unsigned long t_address = (unsigned long)get_global_id(2)*(unsigned long)final_y_size*(unsigned long)final_x_size + \n" -" (unsigned long)get_global_id(1)*(unsigned long)final_x_size + (unsigned long)get_global_id(0);\n" -" if ((t_address >= start_address) && (t_address < end_address))\n" -" dst[t_address-start_address]++;\n" -" if (error)\n" -" dst[t_address-start_address]|=error;\n" -"\n" -"}\n"; + "\n" + "__kernel void test_thread_dimension_not_atomic(__global uint *dst, \n" + " uint final_x_size, uint final_y_size, uint final_z_size,\n" + " ulong start_address, ulong end_address)\n" + "{\n" + " uint error = 0;\n" + " if (get_global_id(0) >= final_x_size)\n" + " error = 64;\n" + " if (get_global_id(1) >= final_y_size)\n" + " error = 128;\n" + " if (get_global_id(2) >= final_z_size)\n" + " error = 256;\n" + "\n" + " unsigned long t_address = (unsigned " + "long)get_global_id(2)*(unsigned long)final_y_size*(unsigned " + "long)final_x_size + \n" + " (unsigned long)get_global_id(1)*(unsigned " + "long)final_x_size + (unsigned long)get_global_id(0);\n" + " if ((t_address >= start_address) && (t_address < end_address))\n" + " dst[t_address-start_address]++;\n" + " if (error)\n" + " dst[t_address-start_address]|=error;\n" + "\n" + "}\n"; static const char *thread_dimension_kernel_code_atomic_not_long = -"\n" -"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" -"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" -"__kernel void test_thread_dimension_atomic(__global uint *dst, \n" -" uint final_x_size, uint final_y_size, uint final_z_size,\n" -" uint start_address, uint end_address)\n" -"{\n" -" uint error = 0;\n" -" if (get_global_id(0) >= final_x_size)\n" -" error = 64;\n" -" if (get_global_id(1) >= final_y_size)\n" -" error = 128;\n" -" if (get_global_id(2) >= final_z_size)\n" -" error = 256;\n" -"\n" -" unsigned int t_address = (unsigned int)get_global_id(2)*(unsigned int)final_y_size*(unsigned int)final_x_size + \n" -" (unsigned int)get_global_id(1)*(unsigned int)final_x_size + (unsigned int)get_global_id(0);\n" -" if ((t_address >= start_address) && (t_address < end_address))\n" -" atom_add(&dst[t_address-start_address], 1u);\n" -" if (error)\n" -" atom_or(&dst[t_address-start_address], error);\n" -"\n" -"}\n"; + "\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n" + "__kernel void test_thread_dimension_atomic(__global uint *dst, \n" + " uint final_x_size, uint final_y_size, uint final_z_size,\n" + " uint start_address, uint end_address)\n" + "{\n" + " uint error = 0;\n" + " if (get_global_id(0) >= final_x_size)\n" + " error = 64;\n" + " if (get_global_id(1) >= final_y_size)\n" + " error = 128;\n" + " if (get_global_id(2) >= final_z_size)\n" + " error = 256;\n" + "\n" + " unsigned int t_address = (unsigned int)get_global_id(2)*(unsigned " + "int)final_y_size*(unsigned int)final_x_size + \n" + " (unsigned int)get_global_id(1)*(unsigned int)final_x_size " + "+ (unsigned int)get_global_id(0);\n" + " if ((t_address >= start_address) && (t_address < end_address))\n" + " atom_add(&dst[t_address-start_address], 1u);\n" + " if (error)\n" + " atom_or(&dst[t_address-start_address], error);\n" + "\n" + "}\n"; static const char *thread_dimension_kernel_code_not_atomic_not_long = -"\n" -"__kernel void test_thread_dimension_not_atomic(__global uint *dst, \n" -" uint final_x_size, uint final_y_size, uint final_z_size,\n" -" uint start_address, uint end_address)\n" -"{\n" -" uint error = 0;\n" -" if (get_global_id(0) >= final_x_size)\n" -" error = 64;\n" -" if (get_global_id(1) >= final_y_size)\n" -" error = 128;\n" -" if (get_global_id(2) >= final_z_size)\n" -" error = 256;\n" -"\n" -" unsigned int t_address = (unsigned int)get_global_id(2)*(unsigned int)final_y_size*(unsigned int)final_x_size + \n" -" (unsigned int)get_global_id(1)*(unsigned int)final_x_size + (unsigned int)get_global_id(0);\n" -" if ((t_address >= start_address) && (t_address < end_address))\n" -" dst[t_address-start_address]++;\n" -" if (error)\n" -" dst[t_address-start_address]|=error;\n" -"\n" -"}\n"; + "\n" + "__kernel void test_thread_dimension_not_atomic(__global uint *dst, \n" + " uint final_x_size, uint final_y_size, uint final_z_size,\n" + " uint start_address, uint end_address)\n" + "{\n" + " uint error = 0;\n" + " if (get_global_id(0) >= final_x_size)\n" + " error = 64;\n" + " if (get_global_id(1) >= final_y_size)\n" + " error = 128;\n" + " if (get_global_id(2) >= final_z_size)\n" + " error = 256;\n" + "\n" + " unsigned int t_address = (unsigned int)get_global_id(2)*(unsigned " + "int)final_y_size*(unsigned int)final_x_size + \n" + " (unsigned int)get_global_id(1)*(unsigned int)final_x_size " + "+ (unsigned int)get_global_id(0);\n" + " if ((t_address >= start_address) && (t_address < end_address))\n" + " dst[t_address-start_address]++;\n" + " if (error)\n" + " dst[t_address-start_address]|=error;\n" + "\n" + "}\n"; char dim_str[128]; -char * -print_dimensions(size_t x, size_t y, size_t z, cl_uint dim) { +char *print_dimensions(size_t x, size_t y, size_t z, cl_uint dim) +{ // Not thread safe... - if (dim == 1) { + if (dim == 1) + { snprintf(dim_str, 128, "[%d]", (int)x); - } else if (dim == 2) { + } + else if (dim == 2) + { snprintf(dim_str, 128, "[%d x %d]", (int)x, (int)y); - } else if (dim == 3) { + } + else if (dim == 3) + { snprintf(dim_str, 128, "[%d x %d x %d]", (int)x, (int)y, (int)z); - } else { + } + else + { snprintf(dim_str, 128, "INVALID DIM: %d", dim); } return dim_str; } char dim_str2[128]; -char * -print_dimensions2(size_t x, size_t y, size_t z, cl_uint dim) { +char *print_dimensions2(size_t x, size_t y, size_t z, cl_uint dim) +{ // Not thread safe... - if (dim == 1) { + if (dim == 1) + { snprintf(dim_str2, 128, "[%d]", (int)x); - } else if (dim == 2) { + } + else if (dim == 2) + { snprintf(dim_str2, 128, "[%d x %d]", (int)x, (int)y); - } else if (dim == 3) { + } + else if (dim == 3) + { snprintf(dim_str2, 128, "[%d x %d x %d]", (int)x, (int)y, (int)z); - } else { + } + else + { snprintf(dim_str2, 128, "INVALID DIM: %d", dim); } return dim_str2; @@ -246,57 +293,64 @@ print_dimensions2(size_t x, size_t y, size_t z, cl_uint dim) { /* - This tests thread dimensions by executing a kernel across a range of dimensions. - Each kernel instance does an atomic write into a specific location in a buffer to - ensure that the correct dimensions are run. To handle large dimensions, the kernel - masks its execution region internally. This allows a small (128MB) buffer to be used - for very large executions by running the kernel multiple times. + This tests thread dimensions by executing a kernel across a range of + dimensions. Each kernel instance does an atomic write into a specific location + in a buffer to ensure that the correct dimensions are run. To handle large + dimensions, the kernel masks its execution region internally. This allows a + small (128MB) buffer to be used for very large executions by running the kernel + multiple times. */ -int run_test(cl_context context, cl_command_queue queue, cl_kernel kernel, cl_mem array, cl_uint memory_size, cl_uint dimensions, +int run_test(cl_context context, cl_command_queue queue, cl_kernel kernel, + cl_mem array, cl_uint memory_size, cl_uint dimensions, cl_uint final_x_size, cl_uint final_y_size, cl_uint final_z_size, cl_uint local_x_size, cl_uint local_y_size, cl_uint local_z_size, int explict_local) { cl_uint errors = 0; size_t global_size[3], local_size[3]; - global_size[0] = final_x_size; local_size[0] = local_x_size; - global_size[1] = final_y_size; local_size[1] = local_y_size; - global_size[2] = final_z_size; local_size[2] = local_z_size; + global_size[0] = final_x_size; + local_size[0] = local_x_size; + global_size[1] = final_y_size; + local_size[1] = local_y_size; + global_size[2] = final_z_size; + local_size[2] = local_z_size; cl_ulong start_valid_memory_address = 0; cl_ulong end_valid_memory_address = memory_size; - cl_ulong last_memory_address = (cl_ulong)final_x_size*(cl_ulong)final_y_size*(cl_ulong)final_z_size*sizeof(cl_uint); + cl_ulong last_memory_address = (cl_ulong)final_x_size + * (cl_ulong)final_y_size * (cl_ulong)final_z_size * sizeof(cl_uint); if (end_valid_memory_address > last_memory_address) end_valid_memory_address = last_memory_address; - int number_of_iterations_required = (int)ceil((double)last_memory_address/(double)memory_size); - log_info("\t\tTest requires %gMB (%d test iterations using an allocation of %gMB).\n", - (double)last_memory_address/(1024.0*1024.0), number_of_iterations_required, (double)memory_size/(1024.0*1024.0)); - //log_info("Last memory address: %llu, memory_size: %llu\n", last_memory_address, memory_size); + int number_of_iterations_required = + (int)ceil((double)last_memory_address / (double)memory_size); + log_info("\t\tTest requires %gMB (%d test iterations using an allocation " + "of %gMB).\n", + (double)last_memory_address / (1024.0 * 1024.0), + number_of_iterations_required, + (double)memory_size / (1024.0 * 1024.0)); + // log_info("Last memory address: %llu, memory_size: %llu\n", + // last_memory_address, memory_size); while (end_valid_memory_address <= last_memory_address) { int err; const int fill_pattern = 0x0; - err = clEnqueueFillBuffer(queue, - array, - (void*)&fill_pattern, - sizeof(fill_pattern), - 0, - memory_size, - 0, - NULL, + err = clEnqueueFillBuffer(queue, array, (void *)&fill_pattern, + sizeof(fill_pattern), 0, memory_size, 0, NULL, NULL); - if (err != CL_SUCCESS) { - print_error( err, "Failed to set fill buffer."); + if (err != CL_SUCCESS) + { + print_error(err, "Failed to set fill buffer."); return -3; } - cl_ulong start_valid_index = start_valid_memory_address/sizeof(cl_uint); - cl_ulong end_valid_index = end_valid_memory_address/sizeof(cl_uint); + cl_ulong start_valid_index = + start_valid_memory_address / sizeof(cl_uint); + cl_ulong end_valid_index = end_valid_memory_address / sizeof(cl_uint); - cl_uint start_valid_index_int = (cl_uint) start_valid_index; - cl_uint end_valid_index_int = (cl_uint) end_valid_index; + cl_uint start_valid_index_int = (cl_uint)start_valid_index; + cl_uint end_valid_index_int = (cl_uint)end_valid_index; // Set the arguments err = clSetKernelArg(kernel, 0, sizeof(array), &array); @@ -305,115 +359,149 @@ int run_test(cl_context context, cl_command_queue queue, cl_kernel kernel, cl_me err |= clSetKernelArg(kernel, 3, sizeof(final_z_size), &final_z_size); if (gHasLong) { - err |= clSetKernelArg(kernel, 4, sizeof(start_valid_index), &start_valid_index); - err |= clSetKernelArg(kernel, 5, sizeof(end_valid_index), &end_valid_index); + err |= clSetKernelArg(kernel, 4, sizeof(start_valid_index), + &start_valid_index); + err |= clSetKernelArg(kernel, 5, sizeof(end_valid_index), + &end_valid_index); } else { - err |= clSetKernelArg(kernel, 4, sizeof(start_valid_index_int), &start_valid_index_int); - err |= clSetKernelArg(kernel, 5, sizeof(end_valid_index_int), &end_valid_index_int); + err |= clSetKernelArg(kernel, 4, sizeof(start_valid_index_int), + &start_valid_index_int); + err |= clSetKernelArg(kernel, 5, sizeof(end_valid_index_int), + &end_valid_index_int); } - if (err != CL_SUCCESS) { - print_error( err, "Failed to set arguments."); + if (err != CL_SUCCESS) + { + print_error(err, "Failed to set arguments."); return -3; } // Execute the kernel - if (explict_local == 0) { - err = clEnqueueNDRangeKernel(queue, kernel, dimensions, NULL, global_size, NULL, 0, NULL, NULL); - if (DEBUG) log_info("\t\t\tExecuting kernel with global %s, NULL local, %d dim, start address %llu, end address %llu.\n", - print_dimensions(global_size[0], global_size[1], global_size[2], dimensions), - dimensions, start_valid_memory_address, end_valid_memory_address); - } else { - err = clEnqueueNDRangeKernel(queue, kernel, dimensions, NULL, global_size, local_size, 0, NULL, NULL); - if (DEBUG) log_info("\t\t\tExecuting kernel with global %s, local %s, %d dim, start address %llu, end address %llu.\n", - print_dimensions(global_size[0], global_size[1], global_size[2], dimensions), print_dimensions2(local_size[0], local_size[1], local_size[2], dimensions), - dimensions, start_valid_memory_address, end_valid_memory_address); + if (explict_local == 0) + { + err = clEnqueueNDRangeKernel(queue, kernel, dimensions, NULL, + global_size, NULL, 0, NULL, NULL); + if (DEBUG) + log_info("\t\t\tExecuting kernel with global %s, NULL local, " + "%d dim, start address %llu, end address %llu.\n", + print_dimensions(global_size[0], global_size[1], + global_size[2], dimensions), + dimensions, start_valid_memory_address, + end_valid_memory_address); } - if (err == CL_OUT_OF_RESOURCES) { - log_info("WARNING: kernel reported CL_OUT_OF_RESOURCES, indicating the global dimensions are too large. Skipping this size.\n"); + else + { + err = + clEnqueueNDRangeKernel(queue, kernel, dimensions, NULL, + global_size, local_size, 0, NULL, NULL); + if (DEBUG) + log_info("\t\t\tExecuting kernel with global %s, local %s, %d " + "dim, start address %llu, end address %llu.\n", + print_dimensions(global_size[0], global_size[1], + global_size[2], dimensions), + print_dimensions2(local_size[0], local_size[1], + local_size[2], dimensions), + dimensions, start_valid_memory_address, + end_valid_memory_address); + } + if (err == CL_OUT_OF_RESOURCES) + { + log_info( + "WARNING: kernel reported CL_OUT_OF_RESOURCES, indicating the " + "global dimensions are too large. Skipping this size.\n"); return 0; } - if (err != CL_SUCCESS) { - print_error( err, "Failed to execute kernel\n"); + if (err != CL_SUCCESS) + { + print_error(err, "Failed to execute kernel\n"); return -3; } - void* mapped = clEnqueueMapBuffer(queue, array, CL_TRUE, CL_MAP_READ, 0, memory_size, 0, NULL, NULL, &err ); - if (err != CL_SUCCESS) { - print_error( err, "Failed to map results\n"); + void *mapped = clEnqueueMapBuffer(queue, array, CL_TRUE, CL_MAP_READ, 0, + memory_size, 0, NULL, NULL, &err); + if (err != CL_SUCCESS) + { + print_error(err, "Failed to map results\n"); return -4; } - cl_uint* data = (cl_uint*)mapped; + cl_uint *data = (cl_uint *)mapped; // Verify the data cl_uint i; - cl_uint last_address = (cl_uint)(end_valid_memory_address - start_valid_memory_address)/(cl_uint)sizeof(cl_uint); - for (i=0; i last_memory_address) end_valid_memory_address = last_memory_address; } - if (errors) - log_error("%d errors.\n", errors); + if (errors) log_error("%d errors.\n", errors); return errors; } +static cl_uint max_x_size = 1, min_x_size = 1, max_y_size = 1, min_y_size = 1, + max_z_size = 1, min_z_size = 1; - -static cl_uint max_x_size=1, min_x_size=1, max_y_size=1, min_y_size=1, max_z_size=1, min_z_size=1; - -static void set_min(cl_uint *x, cl_uint *y, cl_uint *z) { - if (*x < min_x_size) - *x = min_x_size; - if (*y < min_y_size) - *y = min_y_size; - if (*z < min_z_size) - *z = min_z_size; - if (*x > max_x_size) - *x = max_x_size; - if (*y > max_y_size) - *y = max_y_size; - if (*z > max_z_size) - *z = max_z_size; +static void set_min(cl_uint *x, cl_uint *y, cl_uint *z) +{ + if (*x < min_x_size) *x = min_x_size; + if (*y < min_y_size) *y = min_y_size; + if (*z < min_z_size) *z = min_z_size; + if (*x > max_x_size) *x = max_x_size; + if (*y > max_y_size) *y = max_y_size; + if (*z > max_z_size) *z = max_z_size; } -int -test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue queue, cl_uint dimensions, cl_uint min_dim, cl_uint max_dim, cl_uint quick_test, cl_uint size_increase_per_iteration, int explicit_local) { +int test_thread_dimensions(cl_device_id device, cl_context context, + cl_command_queue queue, cl_uint dimensions, + cl_uint min_dim, cl_uint max_dim, cl_uint quick_test, + cl_uint size_increase_per_iteration, + int explicit_local) +{ cl_mem array; cl_program program; cl_kernel kernel; @@ -424,9 +512,10 @@ test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue int use_atomics = 1; MTdata d; - if (getenv("CL_WIMPY_MODE") && !quick_test) { - log_info("CL_WIMPY_MODE enabled, skipping test\n"); - return 0; + if (getenv("CL_WIMPY_MODE") && !quick_test) + { + log_info("CL_WIMPY_MODE enabled, skipping test\n"); + return 0; } // Unconditionally test larger sizes for CL 1.1 @@ -434,41 +523,74 @@ test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue limit_size = 0; /* Check if atomics are supported. */ - if (!is_extension_available(device, "cl_khr_global_int32_base_atomics")) { - log_info("WARNING: Base atomics not supported (cl_khr_global_int32_base_atomics). Test will not be guaranteed to catch overlaping thread dimensions.\n"); + if (!is_extension_available(device, "cl_khr_global_int32_base_atomics")) + { + log_info("WARNING: Base atomics not supported " + "(cl_khr_global_int32_base_atomics). Test will not be " + "guaranteed to catch overlaping thread dimensions.\n"); use_atomics = 0; } if (quick_test) - log_info("WARNING: Running quick test. This will only test the base dimensions (power of two) and base-1 with all local threads fixed in one dim.\n"); + log_info("WARNING: Running quick test. This will only test the base " + "dimensions (power of two) and base-1 with all local threads " + "fixed in one dim.\n"); // Verify that we can test this many dimensions - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(device_max_dimensions), &device_max_dimensions, NULL); - test_error(err, "clGetDeviceInfo for CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed"); + err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, + sizeof(device_max_dimensions), &device_max_dimensions, + NULL); + test_error(err, + "clGetDeviceInfo for CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed"); - if (dimensions > device_max_dimensions) { - log_info("Can not test %d dimensions when device only supports %d.\n", dimensions, device_max_dimensions); + if (dimensions > device_max_dimensions) + { + log_info("Can not test %d dimensions when device only supports %d.\n", + dimensions, device_max_dimensions); return 0; } log_info("Setting random seed to 0.\n"); - if (gHasLong) { - if (use_atomics) { - err = create_single_kernel_helper( context, &program, &kernel, 1, &thread_dimension_kernel_code_atomic_long, "test_thread_dimension_atomic" ); - } else { - err = create_single_kernel_helper( context, &program, &kernel, 1, &thread_dimension_kernel_code_not_atomic_long, "test_thread_dimension_not_atomic" ); + if (gHasLong) + { + if (use_atomics) + { + err = create_single_kernel_helper( + context, &program, &kernel, 1, + &thread_dimension_kernel_code_atomic_long, + "test_thread_dimension_atomic"); } - } else { - if (use_atomics) { - err = create_single_kernel_helper( context, &program, &kernel, 1, &thread_dimension_kernel_code_atomic_not_long, "test_thread_dimension_atomic" ); - } else { - err = create_single_kernel_helper( context, &program, &kernel, 1, &thread_dimension_kernel_code_not_atomic_not_long, "test_thread_dimension_not_atomic" ); + else + { + err = create_single_kernel_helper( + context, &program, &kernel, 1, + &thread_dimension_kernel_code_not_atomic_long, + "test_thread_dimension_not_atomic"); } } - test_error( err, "Unable to create testing kernel" ); + else + { + if (use_atomics) + { + err = create_single_kernel_helper( + context, &program, &kernel, 1, + &thread_dimension_kernel_code_atomic_not_long, + "test_thread_dimension_atomic"); + } + else + { + err = create_single_kernel_helper( + context, &program, &kernel, 1, + &thread_dimension_kernel_code_not_atomic_not_long, + "test_thread_dimension_not_atomic"); + } + } + test_error(err, "Unable to create testing kernel"); - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_local_workgroup_size), max_local_workgroup_size, NULL); + err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, + sizeof(max_local_workgroup_size), + max_local_workgroup_size, NULL); test_error(err, "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES"); // Get the maximum sizes supported by this device @@ -477,50 +599,68 @@ test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue cl_ulong max_physical = 0; int found_size = 0; - err = get_maximums(kernel, context, - &max_workgroup_size, &max_allocation, &max_physical); + err = get_maximums(kernel, context, &max_workgroup_size, &max_allocation, + &max_physical); - // Make sure we don't try to allocate more than half the physical memory present. - if (max_allocation > (max_physical/2)) { - log_info("Limiting max allocation to half of the maximum physical memory (%gMB of %gMB physical).\n", - (max_physical/2/(1024.0*1024.0)), (max_physical/(1024.0*1024.0))); - max_allocation = max_physical/2; + // Make sure we don't try to allocate more than half the physical memory + // present. + if (max_allocation > (max_physical / 2)) + { + log_info("Limiting max allocation to half of the maximum physical " + "memory (%gMB of %gMB physical).\n", + (max_physical / 2 / (1024.0 * 1024.0)), + (max_physical / (1024.0 * 1024.0))); + max_allocation = max_physical / 2; } // Limit the maximum we'll allocate for this test to 512 to be reasonable. - if (max_allocation > 1024*1024*512) { - log_info("Limiting max allocation to 512MB from device maximum allocation of %gMB.\n", (max_allocation/1024.0/1024.0)); - max_allocation = 1024*1024*512; + if (max_allocation > 1024 * 1024 * 512) + { + log_info("Limiting max allocation to 512MB from device maximum " + "allocation of %gMB.\n", + (max_allocation / 1024.0 / 1024.0)); + max_allocation = 1024 * 1024 * 512; } - max_memory_size = (cl_uint)(max_allocation); - if (max_memory_size > 512*1024*1024) - max_memory_size = 512*1024*1024; + max_memory_size = bufferSize ? bufferSize : (cl_uint)(max_allocation); + if (max_memory_size > 512 * 1024 * 1024) + max_memory_size = 512 * 1024 * 1024; memory_size = max_memory_size; - log_info("Memory allocation size to use is %gMB, max workgroup size is %d.\n", max_memory_size/(1024.0*1024.0), (int)max_workgroup_size); + log_info( + "Memory allocation size to use is %gMB, max workgroup size is %d.\n", + max_memory_size / (1024.0 * 1024.0), (int)max_workgroup_size); - while (!found_size && memory_size >= max_memory_size/8) { + while (!found_size && memory_size >= max_memory_size / 8) + { array = clCreateBuffer(context, CL_MEM_READ_WRITE, memory_size, NULL, &err); - if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE || err == CL_OUT_OF_HOST_MEMORY) { - memory_size -= max_memory_size/16; + if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE + || err == CL_OUT_OF_HOST_MEMORY) + { + memory_size -= max_memory_size / 16; continue; } - if (err) { - print_error( err, "clCreateBuffer failed"); + if (err) + { + print_error(err, "clCreateBuffer failed"); return -1; } found_size = 1; } - if (!found_size) { - log_error("Failed to find a working size greater than 1/8th of the reported allocation size.\n"); + if (!found_size) + { + log_error("Failed to find a working size greater than 1/8th of the " + "reported allocation size.\n"); return -1; } - if (memory_size < max_memory_size) { - log_info("Note: failed to allocate %gMB, using %gMB instead.\n", max_memory_size/(1024.0*1024.0), memory_size/(1024.0*1024.0)); + if (memory_size < max_memory_size) + { + log_info("Note: failed to allocate %gMB, using %gMB instead.\n", + max_memory_size / (1024.0 * 1024.0), + memory_size / (1024.0 * 1024.0)); } int errors = 0; @@ -530,171 +670,290 @@ test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue // 2 tests with each dimensions +/- 1 // 2 tests with all dimensions +/- 1 // 2 random tests - cl_uint tests_per_size = 1 + 2*dimensions + 2 + 2; + cl_uint tests_per_size = 1 + 2 * dimensions + 2 + 2; // 1 test with 1 as the local threads in each dimensions // 1 test with all the local threads in each dimension // 2 random tests cl_uint local_tests_per_size = 1 + dimensions + 2; - if (explicit_local == 0) - local_tests_per_size = 1; + if (explicit_local == 0) local_tests_per_size = 1; - max_x_size=1, min_x_size=1, max_y_size=1, min_y_size=1, max_z_size=1, min_z_size=1; + max_x_size = 1, min_x_size = 1, max_y_size = 1, min_y_size = 1, + max_z_size = 1, min_z_size = 1; - if (dimensions > 3) { + if (dimensions > 3) + { log_error("Invalid dimensions: %d\n", dimensions); return -1; } max_x_size = max_dim; min_x_size = min_dim; - if (dimensions > 1) { + if (dimensions > 1) + { max_y_size = max_dim; min_y_size = min_dim; } - if (dimensions > 2) { + if (dimensions > 2) + { max_z_size = max_dim; min_z_size = min_dim; } - log_info("Testing with dimensions up to %s.\n", print_dimensions(max_x_size, max_y_size, max_z_size, dimensions)); + log_info("Testing with dimensions up to %s.\n", + print_dimensions(max_x_size, max_y_size, max_z_size, dimensions)); + if (bufferSize) + { + log_info("Testing with buffer size %d.\n", bufferSize); + } + if (bufferStep) + { + log_info("Testing with buffer step %d.\n", bufferStep); + } cl_uint x_size, y_size, z_size; - d = init_genrand( gRandomSeed ); + d = init_genrand(gRandomSeed); z_size = min_z_size; - while (z_size <= max_z_size) { + while (z_size <= max_z_size) + { y_size = min_y_size; - while (y_size <= max_y_size) { + while (y_size <= max_y_size) + { x_size = min_x_size; - while (x_size <= max_x_size) { + while (x_size <= max_x_size) + { - log_info("Base test size %s:\n", print_dimensions(x_size, y_size, z_size, dimensions)); + log_info("Base test size %s:\n", + print_dimensions(x_size, y_size, z_size, dimensions)); cl_uint sub_test; cl_uint final_x_size, final_y_size, final_z_size; - for (sub_test = 0; sub_test < tests_per_size; sub_test++) { + for (sub_test = 0; sub_test < tests_per_size; sub_test++) + { final_x_size = x_size; final_y_size = y_size; final_z_size = z_size; - if (sub_test == 0) { - if (DEBUG) log_info("\tTesting with base dimensions %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions)); - } else if (quick_test) { - // If we are in quick mode just do 1 run with x-1, y-1, and z-1. - if (sub_test > 1) - break; + if (sub_test == 0) + { + if (DEBUG) + log_info( + "\tTesting with base dimensions %s.\n", + print_dimensions(final_x_size, final_y_size, + final_z_size, dimensions)); + } + else if (quick_test) + { + // If we are in quick mode just do 1 run with x-1, y-1, + // and z-1. + if (sub_test > 1) break; final_x_size--; final_y_size--; final_z_size--; set_min(&final_x_size, &final_y_size, &final_z_size); - if (DEBUG) log_info("\tTesting with all base dimensions - 1 %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions)); - } else if (sub_test <= dimensions*2) { - int dim_to_change = (sub_test-1)%dimensions; - //log_info ("dim_to_change: %d (sub_test:%d) dimensions %d\n", dim_to_change,sub_test, dimensions); + if (DEBUG) + log_info( + "\tTesting with all base dimensions - 1 %s.\n", + print_dimensions(final_x_size, final_y_size, + final_z_size, dimensions)); + } + else if (sub_test <= dimensions * 2) + { + int dim_to_change = (sub_test - 1) % dimensions; + // log_info ("dim_to_change: %d (sub_test:%d) dimensions + // %d\n", dim_to_change,sub_test, dimensions); int up_down = (sub_test > dimensions) ? 0 : 1; - if (dim_to_change == 0) { + if (dim_to_change == 0) + { final_x_size += (up_down) ? -1 : +1; - } else if (dim_to_change == 1) { + } + else if (dim_to_change == 1) + { final_y_size += (up_down) ? -1 : +1; - } else if (dim_to_change == 2) { + } + else if (dim_to_change == 2) + { final_z_size += (up_down) ? -1 : +1; - } else { - log_error("Invalid dim_to_change: %d\n", dim_to_change); + } + else + { + log_error("Invalid dim_to_change: %d\n", + dim_to_change); return -1; } set_min(&final_x_size, &final_y_size, &final_z_size); - if (DEBUG) log_info("\tTesting with one base dimension +/- 1 %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions)); - } else if (sub_test == (dimensions*2+1)) { - if (dimensions == 1) - continue; + if (DEBUG) + log_info( + "\tTesting with one base dimension +/- 1 %s.\n", + print_dimensions(final_x_size, final_y_size, + final_z_size, dimensions)); + } + else if (sub_test == (dimensions * 2 + 1)) + { + if (dimensions == 1) continue; final_x_size--; final_y_size--; final_z_size--; set_min(&final_x_size, &final_y_size, &final_z_size); - if (DEBUG) log_info("\tTesting with all base dimensions - 1 %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions)); - } else if (sub_test == (dimensions*2+2)) { - if (dimensions == 1) - continue; + if (DEBUG) + log_info( + "\tTesting with all base dimensions - 1 %s.\n", + print_dimensions(final_x_size, final_y_size, + final_z_size, dimensions)); + } + else if (sub_test == (dimensions * 2 + 2)) + { + if (dimensions == 1) continue; final_x_size++; final_y_size++; final_z_size++; set_min(&final_x_size, &final_y_size, &final_z_size); - if (DEBUG) log_info("\tTesting with all base dimensions + 1 %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions)); - } else { - final_x_size = (int)get_random_float(0, (x_size/size_increase_per_iteration), d)+x_size/size_increase_per_iteration; - final_y_size = (int)get_random_float(0, (y_size/size_increase_per_iteration), d)+y_size/size_increase_per_iteration; - final_z_size = (int)get_random_float(0, (z_size/size_increase_per_iteration), d)+z_size/size_increase_per_iteration; + if (DEBUG) + log_info( + "\tTesting with all base dimensions + 1 %s.\n", + print_dimensions(final_x_size, final_y_size, + final_z_size, dimensions)); + } + else + { + final_x_size = + (int)get_random_float( + 0, (x_size / size_increase_per_iteration), d) + + x_size / size_increase_per_iteration; + final_y_size = + (int)get_random_float( + 0, (y_size / size_increase_per_iteration), d) + + y_size / size_increase_per_iteration; + final_z_size = + (int)get_random_float( + 0, (z_size / size_increase_per_iteration), d) + + z_size / size_increase_per_iteration; set_min(&final_x_size, &final_y_size, &final_z_size); - if (DEBUG) log_info("\tTesting with random dimensions %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions)); + if (DEBUG) + log_info( + "\tTesting with random dimensions %s.\n", + print_dimensions(final_x_size, final_y_size, + final_z_size, dimensions)); } - if (limit_size && final_x_size*final_y_size*final_z_size >= MAX_TOTAL_GLOBAL_THREADS_FOR_TEST) { - log_info("Skipping size %s as it exceeds max test threads of %d.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions), MAX_TOTAL_GLOBAL_THREADS_FOR_TEST); + if (limit_size + && final_x_size * final_y_size * final_z_size + >= MAX_TOTAL_GLOBAL_THREADS_FOR_TEST) + { + log_info("Skipping size %s as it exceeds max test " + "threads of %d.\n", + print_dimensions(final_x_size, final_y_size, + final_z_size, dimensions), + MAX_TOTAL_GLOBAL_THREADS_FOR_TEST); continue; } cl_uint local_test; cl_uint local_x_size, local_y_size, local_z_size; - cl_uint previous_local_x_size=0, previous_local_y_size=0, previous_local_z_size=0; - for (local_test = 0; local_test < local_tests_per_size; local_test++) { + cl_uint previous_local_x_size = 0, + previous_local_y_size = 0, + previous_local_z_size = 0; + for (local_test = 0; local_test < local_tests_per_size; + local_test++) + { local_x_size = 1; local_y_size = 1; local_z_size = 1; - if (local_test == 0) { - } else if (local_test <= dimensions) { - int dim_to_change = (local_test-1)%dimensions; - if (dim_to_change == 0) { + if (local_test == 0) + { + } + else if (local_test <= dimensions) + { + int dim_to_change = (local_test - 1) % dimensions; + if (dim_to_change == 0) + { local_x_size = (cl_uint)max_workgroup_size; - } else if (dim_to_change == 1) { + } + else if (dim_to_change == 1) + { local_y_size = (cl_uint)max_workgroup_size; - } else if (dim_to_change == 2) { + } + else if (dim_to_change == 2) + { local_z_size = (cl_uint)max_workgroup_size; - } else { - log_error("Invalid dim_to_change: %d\n", dim_to_change); + } + else + { + log_error("Invalid dim_to_change: %d\n", + dim_to_change); free_mtdata(d); return -1; } - } else { - local_x_size = (int)get_random_float(1, (int)max_workgroup_size, d); - while ((local_x_size > 1) && (final_x_size%local_x_size != 0)) + } + else + { + local_x_size = (int)get_random_float( + 1, (int)max_workgroup_size, d); + while ((local_x_size > 1) + && (final_x_size % local_x_size != 0)) local_x_size--; - int remainder = (int)floor((double)max_workgroup_size/local_x_size); + int remainder = (int)floor( + (double)max_workgroup_size / local_x_size); // Evenly prefer dimensions 2 and 1 first - if (local_test % 2) { - if (dimensions > 1) { - local_y_size = (int)get_random_float(1, (int)remainder, d); - while ((local_y_size > 1) && (final_y_size%local_y_size != 0)) + if (local_test % 2) + { + if (dimensions > 1) + { + local_y_size = (int)get_random_float( + 1, (int)remainder, d); + while ( + (local_y_size > 1) + && (final_y_size % local_y_size != 0)) local_y_size--; - remainder = (int)floor((double)remainder/local_y_size); + remainder = (int)floor((double)remainder + / local_y_size); } - if (dimensions > 2) { - local_z_size = (int)get_random_float(1, (int)remainder, d); - while ((local_z_size > 1) && (final_z_size%local_z_size != 0)) + if (dimensions > 2) + { + local_z_size = (int)get_random_float( + 1, (int)remainder, d); + while ( + (local_z_size > 1) + && (final_z_size % local_z_size != 0)) local_z_size--; } - } else { - if (dimensions > 2) { - local_z_size = (int)get_random_float(1, (int)remainder, d); - while ((local_z_size > 1) && (final_z_size%local_z_size != 0)) + } + else + { + if (dimensions > 2) + { + local_z_size = (int)get_random_float( + 1, (int)remainder, d); + while ( + (local_z_size > 1) + && (final_z_size % local_z_size != 0)) local_z_size--; - remainder = (int)floor((double)remainder/local_z_size); + remainder = (int)floor((double)remainder + / local_z_size); } - if (dimensions > 1) { - local_y_size = (int)get_random_float(1, (int)remainder, d); - while ((local_y_size > 1) && (final_y_size%local_y_size != 0)) + if (dimensions > 1) + { + local_y_size = (int)get_random_float( + 1, (int)remainder, d); + while ( + (local_y_size > 1) + && (final_y_size % local_y_size != 0)) local_y_size--; } } } - // Put all the threads in one dimension to speed up the test in quick mode. - if (quick_test) { + // Put all the threads in one dimension to speed up the + // test in quick mode. + if (quick_test) + { local_y_size = 1; local_z_size = 1; local_x_size = 1; - if (final_z_size > final_y_size && final_z_size > final_x_size) + if (final_z_size > final_y_size + && final_z_size > final_x_size) local_z_size = (cl_uint)max_workgroup_size; else if (final_y_size > final_x_size) local_y_size = (cl_uint)max_workgroup_size; @@ -704,40 +963,62 @@ test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue if (local_x_size > max_local_workgroup_size[0]) local_x_size = (int)max_local_workgroup_size[0]; - if (dimensions > 1 && local_y_size > max_local_workgroup_size[1]) + if (dimensions > 1 + && local_y_size > max_local_workgroup_size[1]) local_y_size = (int)max_local_workgroup_size[1]; - if (dimensions > 2 && local_z_size > max_local_workgroup_size[2]) + if (dimensions > 2 + && local_z_size > max_local_workgroup_size[2]) local_z_size = (int)max_local_workgroup_size[2]; // Cleanup the local dimensions - while ((local_x_size > 1) && (final_x_size%local_x_size != 0)) + while ((local_x_size > 1) + && (final_x_size % local_x_size != 0)) local_x_size--; - while ((local_y_size > 1) && (final_y_size%local_y_size != 0)) + while ((local_y_size > 1) + && (final_y_size % local_y_size != 0)) local_y_size--; - while ((local_z_size > 1) && (final_z_size%local_z_size != 0)) + while ((local_z_size > 1) + && (final_z_size % local_z_size != 0)) local_z_size--; - if ((previous_local_x_size == local_x_size) && (previous_local_y_size == local_y_size) && (previous_local_z_size == local_z_size)) + if ((previous_local_x_size == local_x_size) + && (previous_local_y_size == local_y_size) + && (previous_local_z_size == local_z_size)) continue; - if (explicit_local == 0) { + if (explicit_local == 0) + { local_x_size = 0; local_y_size = 0; local_z_size = 0; } - if (DEBUG) log_info("\t\tTesting local size %s.\n", print_dimensions(local_x_size, local_y_size, local_z_size, dimensions)); + if (DEBUG) + log_info( + "\t\tTesting local size %s.\n", + print_dimensions(local_x_size, local_y_size, + local_z_size, dimensions)); - if (explicit_local == 0) { - log_info("\tTesting global %s local [NULL]...\n", - print_dimensions(final_x_size, final_y_size, final_z_size, dimensions)); - } else { - log_info("\tTesting global %s local %s...\n", - print_dimensions(final_x_size, final_y_size, final_z_size, dimensions), - print_dimensions2(local_x_size, local_y_size, local_z_size, dimensions)); + if (explicit_local == 0) + { + log_info( + "\tTesting global %s local [NULL]...\n", + print_dimensions(final_x_size, final_y_size, + final_z_size, dimensions)); + } + else + { + log_info( + "\tTesting global %s local %s...\n", + print_dimensions(final_x_size, final_y_size, + final_z_size, dimensions), + print_dimensions2(local_x_size, local_y_size, + local_z_size, dimensions)); } - // Avoid running with very small local sizes on very large global sizes - cl_uint total_local_size = local_x_size * local_y_size * local_z_size; + // Avoid running with very small local sizes on very + // large global sizes + cl_uint total_local_size = + local_x_size * local_y_size * local_z_size; long total_global_size = final_x_size * final_y_size * final_z_size; if (total_local_size < max_workgroup_size) { if (((total_global_size > 16384 * 16384) @@ -751,12 +1032,16 @@ test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue } } - err = run_test(context, queue, kernel, array, memory_size, dimensions, - final_x_size, final_y_size, final_z_size, - local_x_size, local_y_size, local_z_size, explicit_local); + err = + run_test(context, queue, kernel, array, memory_size, + dimensions, final_x_size, final_y_size, + final_z_size, local_x_size, local_y_size, + local_z_size, explicit_local); - // If we failed to execute, then return so we don't crash. - if (err < 0) { + // If we failed to execute, then return so we don't + // crash. + if (err < 0) + { clReleaseMemObject(array); clReleaseKernel(kernel); clReleaseProgram(program); @@ -765,10 +1050,14 @@ test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue } // Otherwise, if we had errors add them up. - if (err) { - log_error("Test global %s local %s failed.\n", - print_dimensions(final_x_size, final_y_size, final_z_size, dimensions), - print_dimensions2(local_x_size, local_y_size, local_z_size, dimensions)); + if (err) + { + log_error( + "Test global %s local %s failed.\n", + print_dimensions(final_x_size, final_y_size, + final_z_size, dimensions), + print_dimensions2(local_x_size, local_y_size, + local_z_size, dimensions)); errors++; clReleaseMemObject(array); clReleaseKernel(kernel); @@ -783,30 +1072,23 @@ test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue previous_local_z_size = local_z_size; // Only test one config in quick mode. - if (quick_test) - break; + if (quick_test) break; } // local_test size } // sub_test // Increment the x_size - if (x_size == max_x_size) - break; + if (x_size == max_x_size) break; x_size *= size_increase_per_iteration; - if (x_size > max_x_size) - x_size = max_x_size; + if (x_size > max_x_size) x_size = max_x_size; } // x_size // Increment the y_size - if (y_size == max_y_size) - break; + if (y_size == max_y_size) break; y_size *= size_increase_per_iteration; - if (y_size > max_y_size) - y_size = max_y_size; + if (y_size > max_y_size) y_size = max_y_size; } // y_size // Increment the z_size - if (z_size == max_z_size) - break; + if (z_size == max_z_size) break; z_size *= size_increase_per_iteration; - if (z_size > max_z_size) - z_size = max_z_size; + if (z_size > max_z_size) z_size = max_z_size; } // z_size @@ -814,75 +1096,108 @@ test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue clReleaseMemObject(array); clReleaseKernel(kernel); clReleaseProgram(program); - if (errors) - log_error("%d total errors.\n", errors); + if (errors) log_error("%d total errors.\n", errors); return errors; - } #define QUICK 1 #define FULL 0 -int test_quick_1d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_quick_1d_explicit_local(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_thread_dimensions(deviceID, context, queue, 1, 1, 65536*512, QUICK, 4, 1); + return test_thread_dimensions( + deviceID, context, queue, 1, 1, + maxThreadDimension ? maxThreadDimension : 65536 * 512, QUICK, 4, 1); } -int test_quick_2d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_quick_2d_explicit_local(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_thread_dimensions(deviceID, context, queue, 2, 1, 65536/4, QUICK, 16, 1); + return test_thread_dimensions( + deviceID, context, queue, 2, 1, + maxThreadDimension ? maxThreadDimension : 65536 / 4, QUICK, 16, 1); } -int test_quick_3d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_quick_3d_explicit_local(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_thread_dimensions(deviceID, context, queue, 3, 1, 1024, QUICK, 32, 1); + return test_thread_dimensions( + deviceID, context, queue, 3, 1, + maxThreadDimension ? maxThreadDimension : 1024, QUICK, 32, 1); } -int test_quick_1d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_quick_1d_implicit_local(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_thread_dimensions(deviceID, context, queue, 1, 1, 65536*256, QUICK, 4, 0); + return test_thread_dimensions( + deviceID, context, queue, 1, 1, + maxThreadDimension ? maxThreadDimension : 65536 * 256, QUICK, 4, 0); } -int test_quick_2d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_quick_2d_implicit_local(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_thread_dimensions(deviceID, context, queue, 2, 1, 65536/4, QUICK, 16, 0); + return test_thread_dimensions( + deviceID, context, queue, 2, 1, + maxThreadDimension ? maxThreadDimension : 65536 / 4, QUICK, 16, 0); } -int test_quick_3d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_quick_3d_implicit_local(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_thread_dimensions(deviceID, context, queue, 3, 1, 1024, QUICK, 32, 0); + return test_thread_dimensions( + deviceID, context, queue, 3, 1, + maxThreadDimension ? maxThreadDimension : 1024, QUICK, 32, 0); } -int test_full_1d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_full_1d_explicit_local(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_thread_dimensions(deviceID, context, queue, 1, 1, 65536*512, FULL, 4, 1); + return test_thread_dimensions( + deviceID, context, queue, 1, 1, + maxThreadDimension ? maxThreadDimension : 65536 * 512, FULL, 4, 1); } -int test_full_2d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_full_2d_explicit_local(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_thread_dimensions(deviceID, context, queue, 2, 1, 65536/4, FULL, 16, 1); + return test_thread_dimensions( + deviceID, context, queue, 2, 1, + maxThreadDimension ? maxThreadDimension : 65536 / 4, FULL, 16, 1); } -int test_full_3d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_full_3d_explicit_local(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_thread_dimensions(deviceID, context, queue, 3, 1, 1024, FULL, 32, 1); + return test_thread_dimensions( + deviceID, context, queue, 3, 1, + maxThreadDimension ? maxThreadDimension : 1024, FULL, 32, 1); } -int test_full_1d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_full_1d_implicit_local(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_thread_dimensions(deviceID, context, queue, 1, 1, 65536*256, FULL, 4, 0); + return test_thread_dimensions( + deviceID, context, queue, 1, 1, + maxThreadDimension ? maxThreadDimension : 65536 * 256, FULL, 4, 0); } -int test_full_2d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_full_2d_implicit_local(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_thread_dimensions(deviceID, context, queue, 2, 1, 65536/4, FULL, 16, 0); + return test_thread_dimensions( + deviceID, context, queue, 2, 1, + maxThreadDimension ? maxThreadDimension : 65536 / 4, FULL, 16, 0); } -int test_full_3d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_full_3d_implicit_local(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_thread_dimensions(deviceID, context, queue, 3, 1, 1024, FULL, 32, 0); + return test_thread_dimensions( + deviceID, context, queue, 3, 1, + maxThreadDimension ? maxThreadDimension : 1024, FULL, 32, 0); } -