Thread Dimensions: Improve execution time. (#694)

Replaced a kernel used for memseting a large buffer with clEnqueueFillBuffer.

Fixes #692. Addresses #684.
This commit is contained in:
Jeremy Kemp
2020-03-24 13:21:56 +00:00
committed by GitHub
parent 46b6bae001
commit efaf024035

View File

@@ -139,12 +139,6 @@ static const char *thread_dimension_kernel_code_atomic_long =
" if (error)\n" " if (error)\n"
" atom_or(&dst[t_address-start_address], error);\n" " atom_or(&dst[t_address-start_address], error);\n"
"\n" "\n"
"}\n"
"\n"
"__kernel void clear_memory(__global uint *dst)\n\n"
"{\n"
" dst[get_global_id(0)] = 0;\n"
"\n"
"}\n"; "}\n";
static const char *thread_dimension_kernel_code_not_atomic_long = static const char *thread_dimension_kernel_code_not_atomic_long =
@@ -168,12 +162,6 @@ static const char *thread_dimension_kernel_code_not_atomic_long =
" if (error)\n" " if (error)\n"
" dst[t_address-start_address]|=error;\n" " dst[t_address-start_address]|=error;\n"
"\n" "\n"
"}\n"
"\n"
"__kernel void clear_memory(__global uint *dst)\n\n"
"{\n"
" dst[get_global_id(0)] = 0;\n"
"\n"
"}\n"; "}\n";
static const char *thread_dimension_kernel_code_atomic_not_long = static const char *thread_dimension_kernel_code_atomic_not_long =
@@ -199,12 +187,6 @@ static const char *thread_dimension_kernel_code_atomic_not_long =
" if (error)\n" " if (error)\n"
" atom_or(&dst[t_address-start_address], error);\n" " atom_or(&dst[t_address-start_address], error);\n"
"\n" "\n"
"}\n"
"\n"
"__kernel void clear_memory(__global uint *dst)\n\n"
"{\n"
" dst[get_global_id(0)] = 0;\n"
"\n"
"}\n"; "}\n";
static const char *thread_dimension_kernel_code_not_atomic_not_long = static const char *thread_dimension_kernel_code_not_atomic_not_long =
@@ -228,20 +210,8 @@ static const char *thread_dimension_kernel_code_not_atomic_not_long =
" if (error)\n" " if (error)\n"
" dst[t_address-start_address]|=error;\n" " dst[t_address-start_address]|=error;\n"
"\n" "\n"
"}\n"
"\n"
"__kernel void clear_memory(__global uint *dst)\n\n"
"{\n"
" dst[get_global_id(0)] = 0;\n"
"\n"
"}\n"; "}\n";
static size_t max_workgroup_size_for_clear_kernel;
cl_kernel clear_memory_kernel = 0;
char dim_str[128]; char dim_str[128];
char * char *
print_dimensions(size_t x, size_t y, size_t z, cl_uint dim) { print_dimensions(size_t x, size_t y, size_t z, cl_uint dim) {
@@ -307,31 +277,19 @@ int run_test(cl_context context, cl_command_queue queue, cl_kernel kernel, cl_me
while (end_valid_memory_address <= last_memory_address) while (end_valid_memory_address <= last_memory_address)
{ {
int err; int err;
// Clear the memory const int fill_pattern = 0x0;
// // Manually -- much slower on the GPU err = clEnqueueFillBuffer(queue,
// memset((void*)data, 0, memory_size); array,
// err = clWriteArray(context, array, 0, 0, memory_size, data, NULL); (void*)&fill_pattern,
// if (err != CL_SUCCESS) { sizeof(fill_pattern),
// log_error("Failed to write to data array: %d\n", err); 0,
// free(data); memory_size,
// return -4; 0,
// } NULL,
// In a kernel NULL);
err = clSetKernelArg(clear_memory_kernel, 0, sizeof(array), &array);
if (err != CL_SUCCESS) { if (err != CL_SUCCESS) {
print_error( err, "Failed to set args for clear_memory_kernel to clear the memory between runs"); print_error( err, "Failed to set fill buffer.");
return -4; return -3;
}
size_t global[3] = {1,0,0};
global[0] = (cl_uint)(memory_size/sizeof(cl_uint));
size_t local[3] = {1,0,0};
local[0] = max_workgroup_size_for_clear_kernel;
while( global[0] % local[0] ) //make sure that global[0] is evenly divided by local[0]. Will stop at 1 in worst case.
local[0]--;
err = clEnqueueNDRangeKernel(queue, clear_memory_kernel, 1, NULL, global, local, 0, NULL, NULL);
if (err != CL_SUCCESS) {
print_error( err, "Failed to execute clear_memory_kernel to clear the memory between runs");
return -4;
} }
cl_ulong start_valid_index = start_valid_memory_address/sizeof(cl_uint); cl_ulong start_valid_index = start_valid_memory_address/sizeof(cl_uint);
@@ -513,13 +471,6 @@ test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue
err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_local_workgroup_size), max_local_workgroup_size, NULL); err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_local_workgroup_size), max_local_workgroup_size, NULL);
test_error(err, "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES"); test_error(err, "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
clear_memory_kernel = clCreateKernel(program, "clear_memory", &err);
if (err)
{
log_error("clCreateKernel failed: %d\n", err);
return -1;
}
// Get the maximum sizes supported by this device // Get the maximum sizes supported by this device
size_t max_workgroup_size = 0; size_t max_workgroup_size = 0;
cl_ulong max_allocation = 0; cl_ulong max_allocation = 0;
@@ -529,9 +480,6 @@ test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue
err = get_maximums(kernel, context, err = get_maximums(kernel, context,
&max_workgroup_size, &max_allocation, &max_physical); &max_workgroup_size, &max_allocation, &max_physical);
err = get_maximums(clear_memory_kernel, context,
&max_workgroup_size_for_clear_kernel, &max_allocation, &max_physical);
// Make sure we don't try to allocate more than half the physical memory present. // Make sure we don't try to allocate more than half the physical memory present.
if (max_allocation > (max_physical/2)) { if (max_allocation > (max_physical/2)) {
log_info("Limiting max allocation to half of the maximum physical memory (%gMB of %gMB physical).\n", log_info("Limiting max allocation to half of the maximum physical memory (%gMB of %gMB physical).\n",
@@ -820,7 +768,6 @@ test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue
errors++; errors++;
clReleaseMemObject(array); clReleaseMemObject(array);
clReleaseKernel(kernel); clReleaseKernel(kernel);
clReleaseKernel(clear_memory_kernel);
clReleaseProgram(program); clReleaseProgram(program);
free_mtdata(d); free_mtdata(d);
return -1; return -1;
@@ -862,7 +809,6 @@ test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue
free_mtdata(d); free_mtdata(d);
clReleaseMemObject(array); clReleaseMemObject(array);
clReleaseKernel(kernel); clReleaseKernel(kernel);
clReleaseKernel(clear_memory_kernel);
clReleaseProgram(program); clReleaseProgram(program);
if (errors) if (errors)
log_error("%d total errors.\n", errors); log_error("%d total errors.\n", errors);