mirror of
https://github.com/KhronosGroup/OpenCL-CTS.git
synced 2026-03-26 00:39:03 +00:00
allocations: Make buffer kernel more efficient for multiple allocations (#2235)
- Fix malloc for 'access_string' and 'kernel_string'. - Fix typo in 'number_of_work_itmes'. Co-authored-by: Sreelakshmi Haridas Maruthur <sharidas@quicinc.com>
This commit is contained in:
committed by
GitHub
parent
044ec98f66
commit
7188c4b29b
@@ -26,13 +26,18 @@ const char *buffer_kernel_pattern = {
|
|||||||
"\tint tid = get_global_id(0);\n"
|
"\tint tid = get_global_id(0);\n"
|
||||||
"\tuint r = 0;\n"
|
"\tuint r = 0;\n"
|
||||||
"\t%s i;\n"
|
"\t%s i;\n"
|
||||||
"\tfor(i=(%s)tid*(%s)per_item; i<(%s)(1+tid)*(%s)per_item; i++) {\n"
|
|
||||||
"%s"
|
"%s"
|
||||||
"\t}\n"
|
|
||||||
"\tresult[tid] = r;\n"
|
"\tresult[tid] = r;\n"
|
||||||
"}\n"
|
"}\n"
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const char *accumulate_pattern = {
|
||||||
|
"\t%s end%d = min((%s)(1+tid)*(%s)per_item, array_sizes[%d]);\n"
|
||||||
|
"\tfor(i=(%s)tid*(%s)per_item; i<end%d; i++) {\n"
|
||||||
|
"\t\tr += buffer%d[i];\n"
|
||||||
|
"\t}\n"
|
||||||
|
};
|
||||||
|
|
||||||
const char *image_kernel_pattern = {
|
const char *image_kernel_pattern = {
|
||||||
"__kernel void sample_test(%s __global uint *result)\n"
|
"__kernel void sample_test(%s __global uint *result)\n"
|
||||||
"{\n"
|
"{\n"
|
||||||
@@ -159,7 +164,7 @@ int check_image(cl_command_queue queue, cl_mem mem)
|
|||||||
int execute_kernel(cl_context context, cl_command_queue *queue,
|
int execute_kernel(cl_context context, cl_command_queue *queue,
|
||||||
cl_device_id device_id, int test, cl_mem mems[],
|
cl_device_id device_id, int test, cl_mem mems[],
|
||||||
int number_of_mems_used, int verify_checksum,
|
int number_of_mems_used, int verify_checksum,
|
||||||
unsigned int number_of_work_itmes)
|
unsigned int number_of_work_items)
|
||||||
{
|
{
|
||||||
|
|
||||||
char *argument_string;
|
char *argument_string;
|
||||||
@@ -174,24 +179,38 @@ int execute_kernel(cl_context context, cl_command_queue *queue,
|
|||||||
cl_uint per_item;
|
cl_uint per_item;
|
||||||
cl_uint per_item_uint;
|
cl_uint per_item_uint;
|
||||||
cl_uint final_result;
|
cl_uint final_result;
|
||||||
std::vector<cl_uint> returned_results(number_of_work_itmes);
|
std::vector<cl_uint> returned_results(number_of_work_items);
|
||||||
clEventWrapper event;
|
clEventWrapper event;
|
||||||
cl_int event_status;
|
cl_int event_status;
|
||||||
|
|
||||||
// Allocate memory for the kernel source
|
// Allocate memory for the kernel source
|
||||||
|
char *used_pattern = nullptr;
|
||||||
|
if (test == BUFFER || test == BUFFER_NON_BLOCKING)
|
||||||
|
{
|
||||||
|
used_pattern = (char *)accumulate_pattern;
|
||||||
|
}
|
||||||
|
else if (test == IMAGE_READ || test == IMAGE_READ_NON_BLOCKING)
|
||||||
|
{
|
||||||
|
used_pattern = (char *)read_pattern;
|
||||||
|
}
|
||||||
|
else if (test == IMAGE_WRITE || test == IMAGE_WRITE_NON_BLOCKING)
|
||||||
|
{
|
||||||
|
used_pattern = (char *)write_pattern;
|
||||||
|
}
|
||||||
argument_string =
|
argument_string =
|
||||||
(char *)malloc(sizeof(char) * MAX_NUMBER_TO_ALLOCATE * 64);
|
(char *)malloc(sizeof(char) * MAX_NUMBER_TO_ALLOCATE * 64);
|
||||||
access_string = (char *)malloc(sizeof(char) * MAX_NUMBER_TO_ALLOCATE
|
access_string = (char *)malloc(sizeof(char) * MAX_NUMBER_TO_ALLOCATE
|
||||||
* (strlen(read_pattern) + 10));
|
* (strlen(used_pattern) + 10));
|
||||||
kernel_string = (char *)malloc(sizeof(char) * MAX_NUMBER_TO_ALLOCATE
|
kernel_string = (char *)malloc(sizeof(char) * MAX_NUMBER_TO_ALLOCATE
|
||||||
* (strlen(read_pattern) + 10 + 64)
|
* (strlen(used_pattern) + 10 + 64)
|
||||||
+ 1024);
|
+ 1024);
|
||||||
|
|
||||||
argument_string[0] = '\0';
|
argument_string[0] = '\0';
|
||||||
access_string[0] = '\0';
|
access_string[0] = '\0';
|
||||||
kernel_string[0] = '\0';
|
kernel_string[0] = '\0';
|
||||||
|
|
||||||
// Zero the results.
|
// Zero the results.
|
||||||
for (i = 0; i < number_of_work_itmes; i++) returned_results[i] = 0;
|
for (i = 0; i < number_of_work_items; i++) returned_results[i] = 0;
|
||||||
|
|
||||||
// detect if device supports ulong/int64
|
// detect if device supports ulong/int64
|
||||||
// detect whether profile of the device is embedded
|
// detect whether profile of the device is embedded
|
||||||
@@ -209,13 +228,6 @@ int execute_kernel(cl_context context, cl_command_queue *queue,
|
|||||||
// Build the kernel source
|
// Build the kernel source
|
||||||
if (test == BUFFER || test == BUFFER_NON_BLOCKING)
|
if (test == BUFFER || test == BUFFER_NON_BLOCKING)
|
||||||
{
|
{
|
||||||
for (i = 0; i < number_of_mems_used; i++)
|
|
||||||
{
|
|
||||||
sprintf(argument_string + strlen(argument_string),
|
|
||||||
" __global uint *buffer%d, ", i);
|
|
||||||
sprintf(access_string + strlen(access_string),
|
|
||||||
"\t\tif (i<array_sizes[%d]) r += buffer%d[i];\n", i, i);
|
|
||||||
}
|
|
||||||
char type[10];
|
char type[10];
|
||||||
if (support64)
|
if (support64)
|
||||||
{
|
{
|
||||||
@@ -225,8 +237,15 @@ int execute_kernel(cl_context context, cl_command_queue *queue,
|
|||||||
{
|
{
|
||||||
sprintf(type, "uint");
|
sprintf(type, "uint");
|
||||||
}
|
}
|
||||||
|
for (i = 0; i < number_of_mems_used; i++)
|
||||||
|
{
|
||||||
|
sprintf(argument_string + strlen(argument_string),
|
||||||
|
" __global uint *buffer%d, ", i);
|
||||||
|
sprintf(access_string + strlen(access_string), accumulate_pattern,
|
||||||
|
type, i, type, type, i, type, type, i, i);
|
||||||
|
}
|
||||||
sprintf(kernel_string, buffer_kernel_pattern, argument_string, type,
|
sprintf(kernel_string, buffer_kernel_pattern, argument_string, type,
|
||||||
type, type, type, type, type, access_string);
|
type, access_string);
|
||||||
}
|
}
|
||||||
else if (test == IMAGE_READ || test == IMAGE_READ_NON_BLOCKING)
|
else if (test == IMAGE_READ || test == IMAGE_READ_NON_BLOCKING)
|
||||||
{
|
{
|
||||||
@@ -282,14 +301,14 @@ int execute_kernel(cl_context context, cl_command_queue *queue,
|
|||||||
// Set the result
|
// Set the result
|
||||||
result_mem =
|
result_mem =
|
||||||
clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
|
clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
|
||||||
sizeof(cl_uint) * number_of_work_itmes,
|
sizeof(cl_uint) * number_of_work_items,
|
||||||
returned_results.data(), &error);
|
returned_results.data(), &error);
|
||||||
test_error(error, "clCreateBuffer failed");
|
test_error(error, "clCreateBuffer failed");
|
||||||
error = clSetKernelArg(kernel, i, sizeof(result_mem), &result_mem);
|
error = clSetKernelArg(kernel, i, sizeof(result_mem), &result_mem);
|
||||||
test_error(error, "clSetKernelArg failed");
|
test_error(error, "clSetKernelArg failed");
|
||||||
|
|
||||||
// Thread dimensions for execution
|
// Thread dimensions for execution
|
||||||
global_dims[0] = number_of_work_itmes;
|
global_dims[0] = number_of_work_items;
|
||||||
global_dims[1] = 1;
|
global_dims[1] = 1;
|
||||||
global_dims[2] = 1;
|
global_dims[2] = 1;
|
||||||
|
|
||||||
@@ -427,7 +446,7 @@ int execute_kernel(cl_context context, cl_command_queue *queue,
|
|||||||
// Verify the checksum.
|
// Verify the checksum.
|
||||||
// Read back the result
|
// Read back the result
|
||||||
error = clEnqueueReadBuffer(*queue, result_mem, CL_TRUE, 0,
|
error = clEnqueueReadBuffer(*queue, result_mem, CL_TRUE, 0,
|
||||||
sizeof(cl_uint) * number_of_work_itmes,
|
sizeof(cl_uint) * number_of_work_items,
|
||||||
returned_results.data(), 0, NULL, NULL);
|
returned_results.data(), 0, NULL, NULL);
|
||||||
test_error_abort(error, "clEnqueueReadBuffer failed");
|
test_error_abort(error, "clEnqueueReadBuffer failed");
|
||||||
final_result = 0;
|
final_result = 0;
|
||||||
@@ -436,7 +455,7 @@ int execute_kernel(cl_context context, cl_command_queue *queue,
|
|||||||
{
|
{
|
||||||
// For buffers or read images we are just looking at the sum of what
|
// For buffers or read images we are just looking at the sum of what
|
||||||
// each thread summed up
|
// each thread summed up
|
||||||
for (i = 0; i < number_of_work_itmes; i++)
|
for (i = 0; i < number_of_work_items; i++)
|
||||||
{
|
{
|
||||||
final_result += returned_results[i];
|
final_result += returned_results[i];
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -125,7 +125,7 @@ int doTest(cl_device_id device, cl_context context, cl_command_queue queue,
|
|||||||
int number_of_mems_used;
|
int number_of_mems_used;
|
||||||
cl_ulong max_individual_allocation_size = g_max_individual_allocation_size;
|
cl_ulong max_individual_allocation_size = g_max_individual_allocation_size;
|
||||||
cl_ulong global_mem_size = g_global_mem_size;
|
cl_ulong global_mem_size = g_global_mem_size;
|
||||||
unsigned int number_of_work_itmes = 8192 * 32;
|
unsigned int number_of_work_items = 8192 * 32;
|
||||||
const bool allocate_image =
|
const bool allocate_image =
|
||||||
(alloc_type != BUFFER) && (alloc_type != BUFFER_NON_BLOCKING);
|
(alloc_type != BUFFER) && (alloc_type != BUFFER_NON_BLOCKING);
|
||||||
|
|
||||||
@@ -183,7 +183,7 @@ int doTest(cl_device_id device, cl_context context, cl_command_queue queue,
|
|||||||
g_reduction_percentage);
|
g_reduction_percentage);
|
||||||
g_max_size = (size_t)((double)g_max_size
|
g_max_size = (size_t)((double)g_max_size
|
||||||
* (double)g_reduction_percentage / 100.0);
|
* (double)g_reduction_percentage / 100.0);
|
||||||
number_of_work_itmes = 8192 * 2;
|
number_of_work_items = 8192 * 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Round to nearest MB.
|
// Round to nearest MB.
|
||||||
@@ -220,7 +220,7 @@ int doTest(cl_device_id device, cl_context context, cl_command_queue queue,
|
|||||||
error =
|
error =
|
||||||
execute_kernel(context, &queue, device, alloc_type, mems,
|
execute_kernel(context, &queue, device, alloc_type, mems,
|
||||||
number_of_mems_used, g_write_allocations,
|
number_of_mems_used, g_write_allocations,
|
||||||
number_of_work_itmes);
|
number_of_work_items);
|
||||||
}
|
}
|
||||||
|
|
||||||
// If we failed to allocate more than 1/8th of the requested amount
|
// If we failed to allocate more than 1/8th of the requested amount
|
||||||
|
|||||||
Reference in New Issue
Block a user