mirror of
https://github.com/KhronosGroup/OpenCL-CTS.git
synced 2026-03-19 14:09:03 +00:00
The maintenance of the conformance tests is moving to Github. This commit contains all the changes that have been done in Gitlab since the first public release of the conformance tests. Signed-off-by: Kevin Petit <kevin.petit@arm.com>
994 lines
43 KiB
C
994 lines
43 KiB
C
//
|
|
// Copyright (c) 2017 The Khronos Group Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <math.h>
|
|
#include <string.h>
|
|
|
|
#if !defined(_WIN32)
|
|
#include <stdbool.h>
|
|
#endif
|
|
|
|
#include <sys/types.h>
|
|
#include <sys/stat.h>
|
|
|
|
#include "procs.h"
|
|
|
|
#include "../../test_common/harness/compat.h"
|
|
|
|
// NOTE(review): no use of ITERATIONS is visible in this part of the file.
#define ITERATIONS 4

// Compile-time switch for the extra "if (DEBUG) log_info(...)" tracing below.
#define DEBUG 0

// Cap on the total number of global work-items (x*y*z) for a single test size.
// The cap is only applied while limit_size is non-zero; sizes that reach it
// are skipped. The original note here tied the cap to the environment variable
// DO_NOT_LIMIT_THREAD_SIZE being unset, but test_thread_dimensions() resets
// limit_size to 0 on entry, so the cap is disabled there.
#define MAX_TOTAL_GLOBAL_THREADS_FOR_TEST (1 << 24)

// Non-zero enables the MAX_TOTAL_GLOBAL_THREADS_FOR_TEST cap.
int limit_size = 0;
|
|
|
|
static int
|
|
get_maximums(cl_kernel kernel, cl_context context,
|
|
size_t *max_workgroup_size_result,
|
|
size_t *max_width_result,
|
|
size_t *max_height_result,
|
|
cl_ulong *max_allcoation_result,
|
|
cl_ulong *max_physical_result) {
|
|
int err = 0;
|
|
cl_uint i;
|
|
cl_device_id *devices;
|
|
|
|
// Get all the devices in the device group
|
|
size_t num_devices_returned;
|
|
err = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &num_devices_returned);
|
|
if(err != CL_SUCCESS)
|
|
{
|
|
log_error("clGetContextInfo() failed (%d).\n", err);
|
|
return -10;
|
|
}
|
|
devices = (cl_device_id *)malloc(num_devices_returned);
|
|
err = clGetContextInfo(context, CL_CONTEXT_DEVICES, num_devices_returned, devices, NULL);
|
|
if(err != CL_SUCCESS)
|
|
{
|
|
log_error("clGetContextInfo() failed (%d).\n", err);
|
|
return -10;
|
|
}
|
|
num_devices_returned /= sizeof(cl_device_id);
|
|
if (num_devices_returned > 1) log_info("%d devices in device group.\n", (int)num_devices_returned);
|
|
if (num_devices_returned < 1) {
|
|
log_error("0 devices found for this kernel.\n");
|
|
return -1;
|
|
}
|
|
|
|
// Iterate over them and find the maximum local workgroup size and image size
|
|
size_t max_workgroup_size = 0;
|
|
size_t current_workgroup_size = 0;
|
|
size_t max_width = 0;
|
|
size_t current_width = 0;
|
|
size_t max_height = 0;
|
|
size_t current_height = 0;
|
|
cl_ulong max_allocation = 0;
|
|
cl_ulong current_allocation = 0;
|
|
cl_ulong max_physical = 0;
|
|
cl_ulong current_physical = 0;
|
|
|
|
for (i=0; i<num_devices_returned; i++) {
|
|
// Max workgroup size for this kernel on this device
|
|
err = clGetKernelWorkGroupInfo(kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(current_workgroup_size), ¤t_workgroup_size, NULL);
|
|
if(err != CL_SUCCESS)
|
|
{
|
|
log_error("clGetKernelWorkGroupInfo() failed (%d) for device %d.\n", err, i);
|
|
return -10;
|
|
}
|
|
if (max_workgroup_size == 0)
|
|
max_workgroup_size = current_workgroup_size;
|
|
else if (current_workgroup_size < max_workgroup_size)
|
|
max_workgroup_size = current_workgroup_size;
|
|
|
|
// Max image width for this device
|
|
err = clGetDeviceInfo(devices[i], CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(current_width), ¤t_width, NULL);
|
|
if(err != CL_SUCCESS)
|
|
{
|
|
log_error("clGetDeviceConfigInfo(CL_DEVICE_IMAGE2D_MAX_WIDTH) failed (%d) for device %d.\n", err, i);
|
|
return -10;
|
|
}
|
|
if (max_width == 0)
|
|
max_width = current_width;
|
|
else if (current_width < max_width)
|
|
max_width = current_width;
|
|
|
|
// Max image height for this device
|
|
err = clGetDeviceInfo(devices[i], CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(current_height), ¤t_height, NULL);
|
|
if(err != CL_SUCCESS)
|
|
{
|
|
log_error("clGetDeviceConfigInfo(CL_DEVICE_IMAGE2D_MAX_HEIGHT) failed (%d) for device %d.\n", err, i);
|
|
return -10;
|
|
}
|
|
if (max_height == 0)
|
|
max_height = current_height;
|
|
else if (current_height < max_height)
|
|
max_height = current_height;
|
|
|
|
// Get the maximum allocation size
|
|
err = clGetDeviceInfo(devices[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(current_allocation), ¤t_allocation, NULL);
|
|
if(err != CL_SUCCESS)
|
|
{
|
|
log_error("clGetDeviceConfigInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE) failed (%d) for device %d.\n", err, i);
|
|
return -10;
|
|
}
|
|
if (max_allocation == 0)
|
|
max_allocation = current_allocation;
|
|
else if (current_allocation < max_allocation)
|
|
max_allocation = current_allocation;
|
|
|
|
// Get the maximum physical size
|
|
err = clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(current_physical), ¤t_physical, NULL);
|
|
if(err != CL_SUCCESS)
|
|
{
|
|
log_error("clGetDeviceConfigInfo(CL_DEVICE_GLOBAL_MEM_SIZE) failed (%d) for device %d.\n", err, i);
|
|
return -10;
|
|
}
|
|
if (max_physical == 0)
|
|
max_physical = current_physical;
|
|
else if (current_physical < max_allocation)
|
|
max_physical = current_physical;
|
|
}
|
|
free(devices);
|
|
|
|
log_info("Device maximums: max local workgroup size:%d, max width: %d, max height: %d, max allocation size: %g MB, max physical memory %gMB\n",
|
|
(int)max_workgroup_size, (int)max_width, (int)max_height, (double)(max_allocation/1024.0/1024.0), (double)(max_physical/1024.0/1024.0));
|
|
*max_workgroup_size_result = max_workgroup_size;
|
|
*max_width_result = max_width;
|
|
*max_height_result = max_height;
|
|
*max_allcoation_result = max_allocation;
|
|
*max_physical_result = max_physical;
|
|
return 0;
|
|
}
|
|
|
|
// OpenCL C program used when 64-bit integers and global atomics are both
// available. It contains two kernels:
//   test_thread_dimension_atomic - each work-item computes its linear
//       address from its global ids and atomically increments the matching
//       dst element when the address lies in [start_address, end_address);
//       an id outside the expected final_*_size range ORs an error code
//       (64 / 128 / 256) into dst.
//   clear_memory - writes 0 to dst[get_global_id(0)].
static const char *thread_dimension_kernel_code_atomic_long =
    // Enable the 32-bit global atomics extensions used below.
    "\n#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
    "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
    // Kernel signature.
    "__kernel void test_thread_dimension_atomic(__global uint *dst, \n"
    " uint final_x_size, uint final_y_size, uint final_z_size,\n"
    " ulong start_address, ulong end_address)\n"
    "{\n uint error = 0;\n"
    // Flag global ids that fall outside the requested global size.
    " if (get_global_id(0) >= final_x_size)\n" " error = 64;\n"
    " if (get_global_id(1) >= final_y_size)\n" " error = 128;\n"
    " if (get_global_id(2) >= final_z_size)\n" " error = 256;\n"
    "\n"
    // Linear address of this work-item, computed in 64 bits.
    " unsigned long t_address = (unsigned long)get_global_id(2)*(unsigned long)final_y_size*(unsigned long)final_x_size + \n"
    " (unsigned long)get_global_id(1)*(unsigned long)final_x_size + (unsigned long)get_global_id(0);\n"
    // Only addresses inside the current window touch the buffer.
    " if ((t_address >= start_address) && (t_address < end_address))\n"
    " atom_add(&dst[t_address-start_address], 1u);\n"
    " if (error)\n" " atom_or(&dst[t_address-start_address], error);\n"
    "\n}\n\n"
    // Helper kernel that zeroes the buffer between runs.
    "__kernel void clear_memory(__global uint *dst)\n\n"
    "{\n dst[get_global_id(0)] = 0;\n\n}\n";
|
|
|
|
// OpenCL C program used when 64-bit integers are available but the global
// atomics extension is not. Same structure as the atomic variant, except
// that the buffer is updated with plain (non-atomic) ++ and |= operations:
//   test_thread_dimension_not_atomic - increments the dst element matching
//       the work-item's linear address when it lies in
//       [start_address, end_address); an id outside the expected
//       final_*_size range ORs an error code (64 / 128 / 256) into dst.
//   clear_memory - writes 0 to dst[get_global_id(0)].
static const char *thread_dimension_kernel_code_not_atomic_long =
    "\n"
    // Kernel signature.
    "__kernel void test_thread_dimension_not_atomic(__global uint *dst, \n"
    " uint final_x_size, uint final_y_size, uint final_z_size,\n"
    " ulong start_address, ulong end_address)\n"
    "{\n uint error = 0;\n"
    // Flag global ids that fall outside the requested global size.
    " if (get_global_id(0) >= final_x_size)\n" " error = 64;\n"
    " if (get_global_id(1) >= final_y_size)\n" " error = 128;\n"
    " if (get_global_id(2) >= final_z_size)\n" " error = 256;\n"
    "\n"
    // Linear address of this work-item, computed in 64 bits.
    " unsigned long t_address = (unsigned long)get_global_id(2)*(unsigned long)final_y_size*(unsigned long)final_x_size + \n"
    " (unsigned long)get_global_id(1)*(unsigned long)final_x_size + (unsigned long)get_global_id(0);\n"
    // Only addresses inside the current window touch the buffer.
    " if ((t_address >= start_address) && (t_address < end_address))\n"
    " dst[t_address-start_address]++;\n"
    " if (error)\n" " dst[t_address-start_address]|=error;\n"
    "\n}\n\n"
    // Helper kernel that zeroes the buffer between runs.
    "__kernel void clear_memory(__global uint *dst)\n\n"
    "{\n dst[get_global_id(0)] = 0;\n\n}\n";
|
|
|
|
// OpenCL C program used when global atomics are available but 64-bit
// integers are not: the window bounds and the linear address are 32-bit
// (uint / unsigned int) instead of ulong. It contains two kernels:
//   test_thread_dimension_atomic - each work-item computes its linear
//       address from its global ids and atomically increments the matching
//       dst element when the address lies in [start_address, end_address);
//       an id outside the expected final_*_size range ORs an error code
//       (64 / 128 / 256) into dst.
//   clear_memory - writes 0 to dst[get_global_id(0)].
static const char *thread_dimension_kernel_code_atomic_not_long =
    "\n"
    // Enable the 32-bit global atomics extensions used below.
    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
    "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
    "__kernel void test_thread_dimension_atomic(__global uint *dst, \n"
    " uint final_x_size, uint final_y_size, uint final_z_size,\n"
    " uint start_address, uint end_address)\n"
    "{\n"
    " uint error = 0;\n"
    // Flag global ids that fall outside the requested global size.
    " if (get_global_id(0) >= final_x_size)\n"
    " error = 64;\n"
    " if (get_global_id(1) >= final_y_size)\n"
    " error = 128;\n"
    " if (get_global_id(2) >= final_z_size)\n"
    " error = 256;\n"
    "\n"
    // Linear address of this work-item. The identifier must be spelled
    // final_x_size; a stray space inside it ("final_x_ size") makes the
    // kernel fail to build.
    " unsigned int t_address = (unsigned int)get_global_id(2)*(unsigned int)final_y_size*(unsigned int)final_x_size + \n"
    " (unsigned int)get_global_id(1)*(unsigned int)final_x_size + (unsigned int)get_global_id(0);\n"
    // Only addresses inside the current window touch the buffer.
    " if ((t_address >= start_address) && (t_address < end_address))\n"
    " atom_add(&dst[t_address-start_address], 1u);\n"
    " if (error)\n"
    " atom_or(&dst[t_address-start_address], error);\n"
    "\n"
    "}\n"
    "\n"
    // Helper kernel that zeroes the buffer between runs.
    "__kernel void clear_memory(__global uint *dst)\n\n"
    "{\n"
    " dst[get_global_id(0)] = 0;\n"
    "\n"
    "}\n";
|
|
|
|
// OpenCL C program used when neither global atomics nor 64-bit integers are
// available: the window bounds and the linear address are 32-bit and the
// buffer is updated with plain (non-atomic) ++ and |= operations.
//   test_thread_dimension_not_atomic - increments the dst element matching
//       the work-item's linear address when it lies in
//       [start_address, end_address); an id outside the expected
//       final_*_size range ORs an error code (64 / 128 / 256) into dst.
//   clear_memory - writes 0 to dst[get_global_id(0)].
static const char *thread_dimension_kernel_code_not_atomic_not_long =
    "\n"
    "__kernel void test_thread_dimension_not_atomic(__global uint *dst, \n"
    " uint final_x_size, uint final_y_size, uint final_z_size,\n"
    " uint start_address, uint end_address)\n"
    "{\n"
    " uint error = 0;\n"
    // Flag global ids that fall outside the requested global size.
    " if (get_global_id(0) >= final_x_size)\n"
    " error = 64;\n"
    " if (get_global_id(1) >= final_y_size)\n"
    " error = 128;\n"
    " if (get_global_id(2) >= final_z_size)\n"
    " error = 256;\n"
    "\n"
    // Linear address of this work-item. The identifier must be spelled
    // final_x_size; a stray space inside it ("final_x_ size") makes the
    // kernel fail to build.
    " unsigned int t_address = (unsigned int)get_global_id(2)*(unsigned int)final_y_size*(unsigned int)final_x_size + \n"
    " (unsigned int)get_global_id(1)*(unsigned int)final_x_size + (unsigned int)get_global_id(0);\n"
    // Only addresses inside the current window touch the buffer.
    " if ((t_address >= start_address) && (t_address < end_address))\n"
    " dst[t_address-start_address]++;\n"
    " if (error)\n"
    " dst[t_address-start_address]|=error;\n"
    "\n"
    "}\n"
    "\n"
    // Helper kernel that zeroes the buffer between runs.
    "__kernel void clear_memory(__global uint *dst)\n\n"
    "{\n"
    " dst[get_global_id(0)] = 0;\n"
    "\n"
    "}\n";
|
|
|
|
|
|
|
|
static size_t max_workgroup_size_for_clear_kernel;
|
|
cl_kernel clear_memory_kernel = 0;
|
|
|
|
|
|
char dim_str[128];
|
|
char *
|
|
print_dimensions(size_t x, size_t y, size_t z, cl_uint dim) {
|
|
// Not thread safe...
|
|
if (dim == 1) {
|
|
snprintf(dim_str, 128, "[%d]", (int)x);
|
|
} else if (dim == 2) {
|
|
snprintf(dim_str, 128, "[%d x %d]", (int)x, (int)y);
|
|
} else if (dim == 3) {
|
|
snprintf(dim_str, 128, "[%d x %d x %d]", (int)x, (int)y, (int)z);
|
|
} else {
|
|
snprintf(dim_str, 128, "INVALID DIM: %d", dim);
|
|
}
|
|
return dim_str;
|
|
}
|
|
|
|
char dim_str2[128];
|
|
char *
|
|
print_dimensions2(size_t x, size_t y, size_t z, cl_uint dim) {
|
|
// Not thread safe...
|
|
if (dim == 1) {
|
|
snprintf(dim_str2, 128, "[%d]", (int)x);
|
|
} else if (dim == 2) {
|
|
snprintf(dim_str2, 128, "[%d x %d]", (int)x, (int)y);
|
|
} else if (dim == 3) {
|
|
snprintf(dim_str2, 128, "[%d x %d x %d]", (int)x, (int)y, (int)z);
|
|
} else {
|
|
snprintf(dim_str2, 128, "INVALID DIM: %d", dim);
|
|
}
|
|
return dim_str2;
|
|
}
|
|
|
|
|
|
/*
|
|
This tests thread dimensions by executing a kernel across a range of dimensions.
|
|
Each kernel instance does an atomic write into a specific location in a buffer to
|
|
ensure that the correct dimensions are run. To handle large dimensions, the kernel
|
|
masks its execution region internally. This allows a small (128MB) buffer to be used
|
|
for very large executions by running the kernel multiple times.
|
|
*/
|
|
int run_test(cl_context context, cl_command_queue queue, cl_kernel kernel, cl_mem array, cl_uint memory_size, cl_uint dimensions,
|
|
cl_uint final_x_size, cl_uint final_y_size, cl_uint final_z_size,
|
|
cl_uint local_x_size, cl_uint local_y_size, cl_uint local_z_size,
|
|
int explict_local)
|
|
{
|
|
cl_uint errors = 0;
|
|
size_t global_size[3], local_size[3];
|
|
global_size[0] = final_x_size; local_size[0] = local_x_size;
|
|
global_size[1] = final_y_size; local_size[1] = local_y_size;
|
|
global_size[2] = final_z_size; local_size[2] = local_z_size;
|
|
|
|
cl_ulong start_valid_memory_address = 0;
|
|
cl_ulong end_valid_memory_address = memory_size;
|
|
cl_ulong last_memory_address = (cl_ulong)final_x_size*(cl_ulong)final_y_size*(cl_ulong)final_z_size*sizeof(cl_uint);
|
|
if (end_valid_memory_address > last_memory_address)
|
|
end_valid_memory_address = last_memory_address;
|
|
|
|
int number_of_iterations_required = (int)ceil((double)last_memory_address/(double)memory_size);
|
|
log_info("\t\tTest requires %gMB (%d test iterations using an allocation of %gMB).\n",
|
|
(double)last_memory_address/(1024.0*1024.0), number_of_iterations_required, (double)memory_size/(1024.0*1024.0));
|
|
//log_info("Last memory address: %llu, memory_size: %llu\n", last_memory_address, memory_size);
|
|
|
|
while (end_valid_memory_address <= last_memory_address)
|
|
{
|
|
int err;
|
|
// Clear the memory
|
|
// // Manually -- much slower on the GPU
|
|
// memset((void*)data, 0, memory_size);
|
|
// err = clWriteArray(context, array, 0, 0, memory_size, data, NULL);
|
|
// if (err != CL_SUCCESS) {
|
|
// log_error("Failed to write to data array: %d\n", err);
|
|
// free(data);
|
|
// return -4;
|
|
// }
|
|
// In a kernel
|
|
err = clSetKernelArg(clear_memory_kernel, 0, sizeof(array), &array);
|
|
if (err != CL_SUCCESS) {
|
|
print_error( err, "Failed to set args for clear_memory_kernel to clear the memory between runs");
|
|
return -4;
|
|
}
|
|
size_t global[3] = {1,0,0};
|
|
global[0] = (cl_uint)(memory_size/sizeof(cl_uint));
|
|
size_t local[3] = {1,0,0};
|
|
local[0] = max_workgroup_size_for_clear_kernel;
|
|
while( global[0] % local[0] ) //make sure that global[0] is evenly divided by local[0]. Will stop at 1 in worst case.
|
|
local[0]--;
|
|
err = clEnqueueNDRangeKernel(queue, clear_memory_kernel, 1, NULL, global, local, 0, NULL, NULL);
|
|
if (err != CL_SUCCESS) {
|
|
print_error( err, "Failed to execute clear_memory_kernel to clear the memory between runs");
|
|
return -4;
|
|
}
|
|
|
|
cl_ulong start_valid_index = start_valid_memory_address/sizeof(cl_uint);
|
|
cl_ulong end_valid_index = end_valid_memory_address/sizeof(cl_uint);
|
|
|
|
cl_uint start_valid_index_int = (cl_uint) start_valid_index;
|
|
cl_uint end_valid_index_int = (cl_uint) end_valid_index;
|
|
|
|
// Set the arguments
|
|
err = clSetKernelArg(kernel, 0, sizeof(array), &array);
|
|
err |= clSetKernelArg(kernel, 1, sizeof(final_x_size), &final_x_size);
|
|
err |= clSetKernelArg(kernel, 2, sizeof(final_y_size), &final_y_size);
|
|
err |= clSetKernelArg(kernel, 3, sizeof(final_z_size), &final_z_size);
|
|
if (gHasLong)
|
|
{
|
|
err |= clSetKernelArg(kernel, 4, sizeof(start_valid_index), &start_valid_index);
|
|
err |= clSetKernelArg(kernel, 5, sizeof(end_valid_index), &end_valid_index);
|
|
}
|
|
else
|
|
{
|
|
err |= clSetKernelArg(kernel, 4, sizeof(start_valid_index_int), &start_valid_index_int);
|
|
err |= clSetKernelArg(kernel, 5, sizeof(end_valid_index_int), &end_valid_index_int);
|
|
}
|
|
|
|
if (err != CL_SUCCESS) {
|
|
print_error( err, "Failed to set arguments.");
|
|
return -3;
|
|
}
|
|
|
|
|
|
// Execute the kernel
|
|
if (explict_local == 0) {
|
|
err = clEnqueueNDRangeKernel(queue, kernel, dimensions, NULL, global_size, NULL, 0, NULL, NULL);
|
|
if (DEBUG) log_info("\t\t\tExecuting kernel with global %s, NULL local, %d dim, start address %llu, end address %llu.\n",
|
|
print_dimensions(global_size[0], global_size[1], global_size[2], dimensions),
|
|
dimensions, start_valid_memory_address, end_valid_memory_address);
|
|
} else {
|
|
err = clEnqueueNDRangeKernel(queue, kernel, dimensions, NULL, global_size, local_size, 0, NULL, NULL);
|
|
if (DEBUG) log_info("\t\t\tExecuting kernel with global %s, local %s, %d dim, start address %llu, end address %llu.\n",
|
|
print_dimensions(global_size[0], global_size[1], global_size[2], dimensions), print_dimensions2(local_size[0], local_size[1], local_size[2], dimensions),
|
|
dimensions, start_valid_memory_address, end_valid_memory_address);
|
|
}
|
|
if (err == CL_OUT_OF_RESOURCES) {
|
|
log_info("WARNING: kernel reported CL_OUT_OF_RESOURCES, indicating the global dimensions are too large. Skipping this size.\n");
|
|
return 0;
|
|
}
|
|
if (err != CL_SUCCESS) {
|
|
print_error( err, "Failed to execute kernel\n");
|
|
return -3;
|
|
}
|
|
|
|
void* mapped = clEnqueueMapBuffer(queue, array, CL_TRUE, CL_MAP_READ, 0, memory_size, 0, NULL, NULL, &err );
|
|
if (err != CL_SUCCESS) {
|
|
print_error( err, "Failed to map results\n");
|
|
return -4;
|
|
}
|
|
cl_uint* data = (cl_uint*)mapped;
|
|
|
|
// Verify the data
|
|
cl_uint i;
|
|
cl_uint last_address = (cl_uint)(end_valid_memory_address - start_valid_memory_address)/(cl_uint)sizeof(cl_uint);
|
|
for (i=0; i<last_address; i++) {
|
|
if (i < last_address) {
|
|
if (data[i] != 1) {
|
|
errors++;
|
|
// log_info("%d expected 1 got %d\n", i, data[i]);
|
|
}
|
|
} else {
|
|
if (data[i] != 0) {
|
|
errors++;
|
|
log_info("%d expected 0 got %d\n", i, data[i]);
|
|
}
|
|
}
|
|
}
|
|
|
|
err = clEnqueueUnmapMemObject(queue, array, mapped, 0, NULL, NULL );
|
|
if (err != CL_SUCCESS) {
|
|
print_error( err, "Failed to unmap results\n");
|
|
return -4;
|
|
}
|
|
|
|
err = clFlush(queue);
|
|
if (err != CL_SUCCESS) {
|
|
print_error( err, "Failed to flush\n");
|
|
return -4;
|
|
}
|
|
|
|
// Increment the addresses
|
|
if (end_valid_memory_address == last_memory_address)
|
|
break;
|
|
start_valid_memory_address += memory_size;
|
|
end_valid_memory_address += memory_size;
|
|
if (end_valid_memory_address > last_memory_address)
|
|
end_valid_memory_address = last_memory_address;
|
|
}
|
|
|
|
if (errors)
|
|
log_error("%d errors.\n", errors);
|
|
return errors;
|
|
}
|
|
|
|
|
|
|
|
|
|
// Inclusive per-axis limits for the global sizes under test. They are
// assigned by test_thread_dimensions() and read by set_min().
static cl_uint max_x_size = 1, min_x_size = 1;
static cl_uint max_y_size = 1, min_y_size = 1;
static cl_uint max_z_size = 1, min_z_size = 1;

// Clamps *x, *y and *z in place to their axis limits: each value is first
// raised to the axis minimum and then capped at the axis maximum.
static void set_min(cl_uint *x, cl_uint *y, cl_uint *z) {
    // Pass 1: enforce the lower bounds.
    *x = (*x < min_x_size) ? min_x_size : *x;
    *y = (*y < min_y_size) ? min_y_size : *y;
    *z = (*z < min_z_size) ? min_z_size : *z;

    // Pass 2: enforce the upper bounds.
    *x = (*x > max_x_size) ? max_x_size : *x;
    *y = (*y > max_y_size) ? max_y_size : *y;
    *z = (*z > max_z_size) ? max_z_size : *z;
}
|
|
|
|
|
|
int
|
|
test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue queue, cl_uint dimensions, cl_uint min_dim, cl_uint max_dim, cl_uint quick_test, cl_uint size_increase_per_iteration, int explicit_local) {
|
|
cl_mem array;
|
|
cl_program program;
|
|
cl_kernel kernel;
|
|
int err;
|
|
cl_uint memory_size, max_memory_size;
|
|
size_t max_local_workgroup_size[3];
|
|
cl_uint device_max_dimensions;
|
|
int use_atomics = 1;
|
|
MTdata d;
|
|
|
|
// Unconditionally test larger sizes for CL 1.1
|
|
log_info("Testing large global dimensions.\n");
|
|
limit_size = 0;
|
|
|
|
/* Check if atomics are supported. */
|
|
if (!is_extension_available(device, "cl_khr_global_int32_base_atomics")) {
|
|
log_info("WARNING: Base atomics not supported (cl_khr_global_int32_base_atomics). Test will not be guaranteed to catch overlaping thread dimensions.\n");
|
|
use_atomics = 0;
|
|
}
|
|
|
|
if (quick_test)
|
|
log_info("WARNING: Running quick test. This will only test the base dimensions (power of two) and base-1 with all local threads fixed in one dim.\n");
|
|
|
|
// Verify that we can test this many dimensions
|
|
err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(device_max_dimensions), &device_max_dimensions, NULL);
|
|
test_error(err, "clGetDeviceInfo for CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed");
|
|
|
|
if (dimensions > device_max_dimensions) {
|
|
log_info("Can not test %d dimensions when device only supports %d.\n", dimensions, device_max_dimensions);
|
|
return 0;
|
|
}
|
|
|
|
log_info("Setting random seed to 0.\n");
|
|
|
|
if (gHasLong) {
|
|
if (use_atomics) {
|
|
err = create_single_kernel_helper( context, &program, &kernel, 1, &thread_dimension_kernel_code_atomic_long, "test_thread_dimension_atomic" );
|
|
} else {
|
|
err = create_single_kernel_helper( context, &program, &kernel, 1, &thread_dimension_kernel_code_not_atomic_long, "test_thread_dimension_not_atomic" );
|
|
}
|
|
} else {
|
|
if (use_atomics) {
|
|
err = create_single_kernel_helper( context, &program, &kernel, 1, &thread_dimension_kernel_code_atomic_not_long, "test_thread_dimension_atomic" );
|
|
} else {
|
|
err = create_single_kernel_helper( context, &program, &kernel, 1, &thread_dimension_kernel_code_not_atomic_not_long, "test_thread_dimension_not_atomic" );
|
|
}
|
|
}
|
|
test_error( err, "Unable to create testing kernel" );
|
|
|
|
err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_local_workgroup_size), max_local_workgroup_size, NULL);
|
|
test_error(err, "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
|
|
|
|
clear_memory_kernel = clCreateKernel(program, "clear_memory", &err);
|
|
if (err)
|
|
{
|
|
log_error("clCreateKernel failed: %d\n", err);
|
|
return -1;
|
|
}
|
|
|
|
// Get the maximum sizes supported by this device
|
|
size_t max_workgroup_size = 0;
|
|
size_t max_width = 0;
|
|
size_t max_height = 0;
|
|
cl_ulong max_allocation = 0;
|
|
cl_ulong max_physical = 0;
|
|
int found_size = 0;
|
|
|
|
err = get_maximums(kernel, context,
|
|
&max_workgroup_size, &max_width, &max_height, &max_allocation, &max_physical);
|
|
|
|
err = get_maximums(clear_memory_kernel, context,
|
|
&max_workgroup_size_for_clear_kernel, &max_width, &max_height, &max_allocation, &max_physical);
|
|
|
|
// Make sure we don't try to allocate more than half the physical memory present.
|
|
if (max_allocation > (max_physical/2)) {
|
|
log_info("Limiting max allocation to half of the maximum physical memory (%gMB of %gMB physical).\n",
|
|
(max_physical/2/(1024.0*1024.0)), (max_physical/(1024.0*1024.0)));
|
|
max_allocation = max_physical/2;
|
|
}
|
|
|
|
// Limit the maximum we'll allocate for this test to 512 to be reasonable.
|
|
if (max_allocation > 1024*1024*512) {
|
|
log_info("Limiting max allocation to 512MB from device maximum allocation of %gMB.\n", (max_allocation/1024.0/1024.0));
|
|
max_allocation = 1024*1024*512;
|
|
}
|
|
|
|
max_memory_size = (cl_uint)(max_allocation);
|
|
if (max_memory_size > 512*1024*1024)
|
|
max_memory_size = 512*1024*1024;
|
|
memory_size = max_memory_size;
|
|
|
|
log_info("Memory allocation size to use is %gMB, max workgroup size is %d.\n", max_memory_size/(1024.0*1024.0), (int)max_workgroup_size);
|
|
|
|
while (!found_size && memory_size >= max_memory_size/8) {
|
|
array = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), memory_size, NULL, &err);
|
|
if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE || err == CL_OUT_OF_HOST_MEMORY) {
|
|
memory_size -= max_memory_size/16;
|
|
continue;
|
|
}
|
|
if (err) {
|
|
print_error( err, "clCreateBuffer failed");
|
|
return -1;
|
|
}
|
|
found_size = 1;
|
|
}
|
|
|
|
if (!found_size) {
|
|
log_error("Failed to find a working size greater than 1/8th of the reported allocation size.\n");
|
|
return -1;
|
|
}
|
|
|
|
if (memory_size < max_memory_size) {
|
|
log_info("Note: failed to allocate %gMB, using %gMB instead.\n", max_memory_size/(1024.0*1024.0), memory_size/(1024.0*1024.0));
|
|
}
|
|
|
|
int errors = 0;
|
|
// Each dimension's size is multiplied by this amount on each iteration.
|
|
// uint size_increase_per_iteration = 4;
|
|
// 1 test at the specified size
|
|
// 2 tests with each dimensions +/- 1
|
|
// 2 tests with all dimensions +/- 1
|
|
// 2 random tests
|
|
cl_uint tests_per_size = 1 + 2*dimensions + 2 + 2;
|
|
|
|
// 1 test with 1 as the local threads in each dimensions
|
|
// 1 test with all the local threads in each dimension
|
|
// 2 random tests
|
|
cl_uint local_tests_per_size = 1 + dimensions + 2;
|
|
if (explicit_local == 0)
|
|
local_tests_per_size = 1;
|
|
|
|
max_x_size=1, min_x_size=1, max_y_size=1, min_y_size=1, max_z_size=1, min_z_size=1;
|
|
|
|
if (dimensions > 3) {
|
|
log_error("Invalid dimensions: %d\n", dimensions);
|
|
return -1;
|
|
}
|
|
max_x_size = max_dim;
|
|
min_x_size = min_dim;
|
|
if (dimensions > 1) {
|
|
max_y_size = max_dim;
|
|
min_y_size = min_dim;
|
|
}
|
|
if (dimensions > 2) {
|
|
max_z_size = max_dim;
|
|
min_z_size = min_dim;
|
|
}
|
|
|
|
log_info("Testing with dimensions up to %s.\n", print_dimensions(max_x_size, max_y_size, max_z_size, dimensions));
|
|
cl_uint x_size, y_size, z_size;
|
|
|
|
d = init_genrand( gRandomSeed );
|
|
z_size = min_z_size;
|
|
while (z_size <= max_z_size) {
|
|
y_size = min_y_size;
|
|
while (y_size <= max_y_size) {
|
|
x_size = min_x_size;
|
|
while (x_size <= max_x_size) {
|
|
|
|
log_info("Base test size %s:\n", print_dimensions(x_size, y_size, z_size, dimensions));
|
|
|
|
cl_uint sub_test;
|
|
cl_uint final_x_size, final_y_size, final_z_size;
|
|
for (sub_test = 0; sub_test < tests_per_size; sub_test++) {
|
|
final_x_size = x_size;
|
|
final_y_size = y_size;
|
|
final_z_size = z_size;
|
|
|
|
if (sub_test == 0) {
|
|
if (DEBUG) log_info("\tTesting with base dimensions %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions));
|
|
} else if (quick_test) {
|
|
// If we are in quick mode just do 1 run with x-1, y-1, and z-1.
|
|
if (sub_test > 1)
|
|
break;
|
|
final_x_size--;
|
|
final_y_size--;
|
|
final_z_size--;
|
|
set_min(&final_x_size, &final_y_size, &final_z_size);
|
|
if (DEBUG) log_info("\tTesting with all base dimensions - 1 %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions));
|
|
} else if (sub_test <= dimensions*2) {
|
|
int dim_to_change = (sub_test-1)%dimensions;
|
|
//log_info ("dim_to_change: %d (sub_test:%d) dimensions %d\n", dim_to_change,sub_test, dimensions);
|
|
int up_down = (sub_test > dimensions) ? 0 : 1;
|
|
|
|
if (dim_to_change == 0) {
|
|
final_x_size += (up_down) ? -1 : +1;
|
|
} else if (dim_to_change == 1) {
|
|
final_y_size += (up_down) ? -1 : +1;
|
|
} else if (dim_to_change == 2) {
|
|
final_z_size += (up_down) ? -1 : +1;
|
|
} else {
|
|
log_error("Invalid dim_to_change: %d\n", dim_to_change);
|
|
return -1;
|
|
}
|
|
set_min(&final_x_size, &final_y_size, &final_z_size);
|
|
if (DEBUG) log_info("\tTesting with one base dimension +/- 1 %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions));
|
|
} else if (sub_test == (dimensions*2+1)) {
|
|
if (dimensions == 1)
|
|
continue;
|
|
final_x_size--;
|
|
final_y_size--;
|
|
final_z_size--;
|
|
set_min(&final_x_size, &final_y_size, &final_z_size);
|
|
if (DEBUG) log_info("\tTesting with all base dimensions - 1 %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions));
|
|
} else if (sub_test == (dimensions*2+2)) {
|
|
if (dimensions == 1)
|
|
continue;
|
|
final_x_size++;
|
|
final_y_size++;
|
|
final_z_size++;
|
|
set_min(&final_x_size, &final_y_size, &final_z_size);
|
|
if (DEBUG) log_info("\tTesting with all base dimensions + 1 %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions));
|
|
} else {
|
|
final_x_size = (int)get_random_float(0, (x_size/size_increase_per_iteration), d)+x_size/size_increase_per_iteration;
|
|
final_y_size = (int)get_random_float(0, (y_size/size_increase_per_iteration), d)+y_size/size_increase_per_iteration;
|
|
final_z_size = (int)get_random_float(0, (z_size/size_increase_per_iteration), d)+z_size/size_increase_per_iteration;
|
|
set_min(&final_x_size, &final_y_size, &final_z_size);
|
|
if (DEBUG) log_info("\tTesting with random dimensions %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions));
|
|
}
|
|
|
|
if (limit_size && final_x_size*final_y_size*final_z_size >= MAX_TOTAL_GLOBAL_THREADS_FOR_TEST) {
|
|
log_info("Skipping size %s as it exceeds max test threads of %d.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions), MAX_TOTAL_GLOBAL_THREADS_FOR_TEST);
|
|
continue;
|
|
}
|
|
|
|
cl_uint local_test;
|
|
cl_uint local_x_size, local_y_size, local_z_size;
|
|
cl_uint previous_local_x_size=0, previous_local_y_size=0, previous_local_z_size=0;
|
|
for (local_test = 0; local_test < local_tests_per_size; local_test++) {
|
|
|
|
local_x_size = 1;
|
|
local_y_size = 1;
|
|
local_z_size = 1;
|
|
|
|
if (local_test == 0) {
|
|
} else if (local_test <= dimensions) {
|
|
int dim_to_change = (local_test-1)%dimensions;
|
|
if (dim_to_change == 0) {
|
|
local_x_size = (cl_uint)max_workgroup_size;
|
|
} else if (dim_to_change == 1) {
|
|
local_y_size = (cl_uint)max_workgroup_size;
|
|
} else if (dim_to_change == 2) {
|
|
local_z_size = (cl_uint)max_workgroup_size;
|
|
} else {
|
|
log_error("Invalid dim_to_change: %d\n", dim_to_change);
|
|
free_mtdata(d);
|
|
return -1;
|
|
}
|
|
} else {
|
|
local_x_size = (int)get_random_float(1, (int)max_workgroup_size, d);
|
|
while ((local_x_size > 1) && (final_x_size%local_x_size != 0))
|
|
local_x_size--;
|
|
int remainder = (int)floor((double)max_workgroup_size/local_x_size);
|
|
// Evenly prefer dimensions 2 and 1 first
|
|
if (local_test % 2) {
|
|
if (dimensions > 1) {
|
|
local_y_size = (int)get_random_float(1, (int)remainder, d);
|
|
while ((local_y_size > 1) && (final_y_size%local_y_size != 0))
|
|
local_y_size--;
|
|
remainder = (int)floor((double)remainder/local_y_size);
|
|
}
|
|
if (dimensions > 2) {
|
|
local_z_size = (int)get_random_float(1, (int)remainder, d);
|
|
while ((local_z_size > 1) && (final_z_size%local_z_size != 0))
|
|
local_z_size--;
|
|
}
|
|
} else {
|
|
if (dimensions > 2) {
|
|
local_z_size = (int)get_random_float(1, (int)remainder, d);
|
|
while ((local_z_size > 1) && (final_z_size%local_z_size != 0))
|
|
local_z_size--;
|
|
remainder = (int)floor((double)remainder/local_z_size);
|
|
}
|
|
if (dimensions > 1) {
|
|
local_y_size = (int)get_random_float(1, (int)remainder, d);
|
|
while ((local_y_size > 1) && (final_y_size%local_y_size != 0))
|
|
local_y_size--;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Put all the threads in one dimension to speed up the test in quick mode.
|
|
if (quick_test) {
|
|
local_y_size = 1;
|
|
local_z_size = 1;
|
|
local_x_size = 1;
|
|
if (final_z_size > final_y_size && final_z_size > final_x_size)
|
|
local_z_size = (cl_uint)max_workgroup_size;
|
|
else if (final_y_size > final_x_size)
|
|
local_y_size = (cl_uint)max_workgroup_size;
|
|
else
|
|
local_x_size = (cl_uint)max_workgroup_size;
|
|
}
|
|
|
|
if (local_x_size > max_local_workgroup_size[0])
|
|
local_x_size = (int)max_local_workgroup_size[0];
|
|
if (dimensions > 1 && local_y_size > max_local_workgroup_size[1])
|
|
local_y_size = (int)max_local_workgroup_size[1];
|
|
if (dimensions > 2 && local_z_size > max_local_workgroup_size[2])
|
|
local_z_size = (int)max_local_workgroup_size[2];
|
|
|
|
// Cleanup the local dimensions
|
|
while ((local_x_size > 1) && (final_x_size%local_x_size != 0))
|
|
local_x_size--;
|
|
while ((local_y_size > 1) && (final_y_size%local_y_size != 0))
|
|
local_y_size--;
|
|
while ((local_z_size > 1) && (final_z_size%local_z_size != 0))
|
|
local_z_size--;
|
|
if ((previous_local_x_size == local_x_size) && (previous_local_y_size == local_y_size) && (previous_local_z_size == local_z_size))
|
|
continue;
|
|
|
|
if (explicit_local == 0) {
|
|
local_x_size = 0;
|
|
local_y_size = 0;
|
|
local_z_size = 0;
|
|
}
|
|
|
|
if (DEBUG) log_info("\t\tTesting local size %s.\n", print_dimensions(local_x_size, local_y_size, local_z_size, dimensions));
|
|
|
|
if (explicit_local == 0) {
|
|
log_info("\tTesting global %s local [NULL]...\n",
|
|
print_dimensions(final_x_size, final_y_size, final_z_size, dimensions));
|
|
} else {
|
|
log_info("\tTesting global %s local %s...\n",
|
|
print_dimensions(final_x_size, final_y_size, final_z_size, dimensions),
|
|
print_dimensions2(local_x_size, local_y_size, local_z_size, dimensions));
|
|
}
|
|
|
|
// Avoid running with very small local sizes on very large global sizes
|
|
cl_uint total_local_size = local_x_size * local_y_size * local_z_size;
|
|
long total_global_size = final_x_size * final_y_size * final_z_size;
|
|
if (total_local_size < max_workgroup_size) {
|
|
if (total_global_size > 16384*16384) {
|
|
if (total_local_size < 64) {
|
|
log_info("Skipping test as local_size is small and it will take a long time.\n");
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
|
|
err = run_test(context, queue, kernel, array, memory_size, dimensions,
|
|
final_x_size, final_y_size, final_z_size,
|
|
local_x_size, local_y_size, local_z_size, explicit_local);
|
|
|
|
// If we failed to execute, then return so we don't crash.
|
|
if (err < 0) {
|
|
clReleaseMemObject(array);
|
|
clReleaseKernel(kernel);
|
|
clReleaseProgram(program);
|
|
free_mtdata(d);
|
|
return -1;
|
|
}
|
|
|
|
// Otherwise, if we had errors add them up.
|
|
if (err) {
|
|
log_error("Test global %s local %s failed.\n",
|
|
print_dimensions(final_x_size, final_y_size, final_z_size, dimensions),
|
|
print_dimensions2(local_x_size, local_y_size, local_z_size, dimensions));
|
|
errors++;
|
|
clReleaseMemObject(array);
|
|
clReleaseKernel(kernel);
|
|
clReleaseKernel(clear_memory_kernel);
|
|
clReleaseProgram(program);
|
|
free_mtdata(d);
|
|
return -1;
|
|
}
|
|
|
|
|
|
previous_local_x_size = local_x_size;
|
|
previous_local_y_size = local_y_size;
|
|
previous_local_z_size = local_z_size;
|
|
|
|
// Only test one config in quick mode.
|
|
if (quick_test)
|
|
break;
|
|
} // local_test size
|
|
} // sub_test
|
|
// Increment the x_size
|
|
if (x_size == max_x_size)
|
|
break;
|
|
x_size *= size_increase_per_iteration;
|
|
if (x_size > max_x_size)
|
|
x_size = max_x_size;
|
|
} // x_size
|
|
// Increment the y_size
|
|
if (y_size == max_y_size)
|
|
break;
|
|
y_size *= size_increase_per_iteration;
|
|
if (y_size > max_y_size)
|
|
y_size = max_y_size;
|
|
} // y_size
|
|
// Increment the z_size
|
|
if (z_size == max_z_size)
|
|
break;
|
|
z_size *= size_increase_per_iteration;
|
|
if (z_size > max_z_size)
|
|
z_size = max_z_size;
|
|
} // z_size
|
|
|
|
|
|
free_mtdata(d);
|
|
clReleaseMemObject(array);
|
|
clReleaseKernel(kernel);
|
|
clReleaseKernel(clear_memory_kernel);
|
|
clReleaseProgram(program);
|
|
if (errors)
|
|
log_error("%d total errors.\n", errors);
|
|
return errors;
|
|
|
|
}
|
|
|
|
// Mode selector handed to test_thread_dimensions() by the test_quick_* entry
// points in this file. Presumably this is the value of that function's
// quick_test flag (non-zero selects quick mode) -- confirm against its
// signature.
#define QUICK 1
|
|
// Mode selector handed to test_thread_dimensions() by the test_full_* entry
// points in this file. Presumably this is the value of that function's
// quick_test flag (zero selects the full sweep) -- confirm against its
// signature.
#define FULL 0
|
|
|
|
int
|
|
test_quick_thread_dimensions_1d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) {
|
|
return test_thread_dimensions(deviceID, context, queue,
|
|
1, 1, 65536*512, QUICK, 4, 1);
|
|
}
|
|
|
|
|
|
int
|
|
test_quick_thread_dimensions_2d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) {
|
|
return test_thread_dimensions(deviceID, context, queue,
|
|
2, 1, 65536/4, QUICK, 16, 1);
|
|
}
|
|
|
|
int
|
|
test_quick_thread_dimensions_3d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) {
|
|
return test_thread_dimensions(deviceID, context, queue,
|
|
3, 1, 1024, QUICK, 32, 1);
|
|
}
|
|
|
|
|
|
int
|
|
test_quick_thread_dimensions_1d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) {
|
|
return test_thread_dimensions(deviceID, context, queue,
|
|
1, 1, 65536*256, QUICK, 4, 0);
|
|
}
|
|
|
|
|
|
int
|
|
test_quick_thread_dimensions_2d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) {
|
|
return test_thread_dimensions(deviceID, context, queue,
|
|
2, 1, 65536/4, QUICK, 16, 0);
|
|
}
|
|
|
|
int
|
|
test_quick_thread_dimensions_3d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) {
|
|
return test_thread_dimensions(deviceID, context, queue,
|
|
3, 1, 1024, QUICK, 32, 0);
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int
|
|
test_full_thread_dimensions_1d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) {
|
|
return test_thread_dimensions(deviceID, context, queue,
|
|
1, 1, 65536*512, FULL, 4, 1);
|
|
}
|
|
|
|
|
|
int
|
|
test_full_thread_dimensions_2d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) {
|
|
return test_thread_dimensions(deviceID, context, queue,
|
|
2, 1, 65536/4, FULL, 16, 1);
|
|
}
|
|
|
|
int
|
|
test_full_thread_dimensions_3d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) {
|
|
return test_thread_dimensions(deviceID, context, queue,
|
|
3, 1, 1024, FULL, 32, 1);
|
|
}
|
|
|
|
|
|
int
|
|
test_full_thread_dimensions_1d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) {
|
|
return test_thread_dimensions(deviceID, context, queue,
|
|
1, 1, 65536*256, FULL, 4, 0);
|
|
}
|
|
|
|
|
|
int
|
|
test_full_thread_dimensions_2d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) {
|
|
return test_thread_dimensions(deviceID, context, queue,
|
|
2, 1, 65536/4, FULL, 16, 0);
|
|
}
|
|
|
|
int
|
|
test_full_thread_dimensions_3d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) {
|
|
return test_thread_dimensions(deviceID, context, queue,
|
|
3, 1, 1024, FULL, 32, 0);
|
|
}
|
|
|