From 7188c4b29b9fbe6797d9440bd1c7b92ab6a95d28 Mon Sep 17 00:00:00 2001 From: Sreelakshmi Haridas Maruthur Date: Tue, 11 Feb 2025 09:49:34 -0700 Subject: [PATCH] allocations: Make buffer kernel more efficient for multiple allocations (#2235) - Fix malloc for 'access_string' and 'kernel_string'. - Fix typo in 'number_of_work_itmes'. Co-authored-by: Sreelakshmi Haridas Maruthur --- .../allocations/allocation_execute.cpp | 57 ++++++++++++------- test_conformance/allocations/main.cpp | 6 +- 2 files changed, 41 insertions(+), 22 deletions(-) diff --git a/test_conformance/allocations/allocation_execute.cpp b/test_conformance/allocations/allocation_execute.cpp index f01dfd8f..3af7f85e 100644 --- a/test_conformance/allocations/allocation_execute.cpp +++ b/test_conformance/allocations/allocation_execute.cpp @@ -26,13 +26,18 @@ const char *buffer_kernel_pattern = { "\tint tid = get_global_id(0);\n" "\tuint r = 0;\n" "\t%s i;\n" - "\tfor(i=(%s)tid*(%s)per_item; i<(%s)(1+tid)*(%s)per_item; i++) {\n" "%s" - "\t}\n" "\tresult[tid] = r;\n" "}\n" }; +const char *accumulate_pattern = { + "\t%s end%d = min((%s)(1+tid)*(%s)per_item, array_sizes[%d]);\n" + "\tfor(i=(%s)tid*(%s)per_item; i returned_results(number_of_work_itmes); + std::vector returned_results(number_of_work_items); clEventWrapper event; cl_int event_status; // Allocate memory for the kernel source + char *used_pattern = nullptr; + if (test == BUFFER || test == BUFFER_NON_BLOCKING) + { + used_pattern = (char *)accumulate_pattern; + } + else if (test == IMAGE_READ || test == IMAGE_READ_NON_BLOCKING) + { + used_pattern = (char *)read_pattern; + } + else if (test == IMAGE_WRITE || test == IMAGE_WRITE_NON_BLOCKING) + { + used_pattern = (char *)write_pattern; + } argument_string = (char *)malloc(sizeof(char) * MAX_NUMBER_TO_ALLOCATE * 64); access_string = (char *)malloc(sizeof(char) * MAX_NUMBER_TO_ALLOCATE - * (strlen(read_pattern) + 10)); + * (strlen(used_pattern) + 10)); kernel_string = (char *)malloc(sizeof(char) * MAX_NUMBER_TO_ALLOCATE - * (strlen(read_pattern) + 10 + 64) + * (strlen(used_pattern) + 10 + 64) + 1024); + argument_string[0] = '\0'; access_string[0] = '\0'; kernel_string[0] = '\0'; // Zero the results. - for (i = 0; i < number_of_work_itmes; i++) returned_results[i] = 0; + for (i = 0; i < number_of_work_items; i++) returned_results[i] = 0; // detect if device supports ulong/int64 // detect whether profile of the device is embedded @@ -209,13 +228,6 @@ int execute_kernel(cl_context context, cl_command_queue *queue, // Build the kernel source if (test == BUFFER || test == BUFFER_NON_BLOCKING) { - for (i = 0; i < number_of_mems_used; i++) - { - sprintf(argument_string + strlen(argument_string), - " __global uint *buffer%d, ", i); - sprintf(access_string + strlen(access_string), - "\t\tif (i