mirror of
https://github.com/KhronosGroup/OpenCL-CTS.git
synced 2026-03-19 06:09:01 +00:00
The code base uses a mix of 'device' and 'deviceID'. I suggest we standardise on 'device' which is shorter and slightly more prevalent. Contributes to #2181 Signed-off-by: Kevin Petit <kevin.petit@arm.com>
163 lines
6.2 KiB
C++
163 lines
6.2 KiB
C++
//
|
|
// Copyright (c) 2017 The Khronos Group Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
#include "common.h"
|
|
|
|
const char *byte_manipulation_kernels[] = {
|
|
// Each device will write it's id into the bytes that it "owns", ownership is based on round robin (global_id % num_id)
|
|
// num_id is equal to number of SVM devices in the system plus one (for the host code).
|
|
// id is the index (id) of the device that this kernel is executing on.
|
|
// For example, if there are 2 SVM devices and the host; the buffer should look like this after each device and the host write their id's:
|
|
// 0, 1, 2, 0, 1, 2, 0, 1, 2...
|
|
"__kernel void write_owned_locations(__global char* a, uint num_id, uint id)\n"
|
|
"{\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" int owner = i % num_id;\n"
|
|
" if(id == owner) \n"
|
|
" a[i] = id;\n" // modify location if it belongs to this device, write id
|
|
"}\n"
|
|
|
|
// Verify that a device can see the byte sized updates from the other devices, sum up the device id's and see if they match expected value.
|
|
// Note: this must be called with a reduced NDRange so that neighbor acesses don't go past end of buffer.
|
|
// For example if there are two SVM devices and the host (3 total devices) the buffer should look like this:
|
|
// 0,1,2,0,1,2...
|
|
// and the expected sum at each point is 0+1+2 = 3.
|
|
"__kernel void sum_neighbor_locations(__global char* a, uint num_devices, volatile __global uint* error_count)\n"
|
|
"{\n"
|
|
" size_t i = get_global_id(0);\n"
|
|
" uint expected_sum = (num_devices * (num_devices - 1))/2;\n"
|
|
" uint sum = 0;\n"
|
|
" for(uint j=0; j<num_devices; j++) {\n"
|
|
" sum += a[i + j];\n" // add my neighbors to the right
|
|
" }\n"
|
|
" if(sum != expected_sum)\n"
|
|
" atomic_inc(error_count);\n"
|
|
"}\n"
|
|
};
|
|
|
|
|
|
REGISTER_TEST(svm_byte_granularity)
|
|
{
|
|
clContextWrapper contextWrapper;
|
|
clProgramWrapper program;
|
|
clKernelWrapper k1, k2;
|
|
clCommandQueueWrapper queues[MAXQ];
|
|
|
|
cl_uint num_devices = 0;
|
|
cl_int err = CL_SUCCESS;
|
|
|
|
err = create_cl_objects(device, &byte_manipulation_kernels[0],
|
|
&contextWrapper, &program, &queues[0], &num_devices,
|
|
CL_DEVICE_SVM_FINE_GRAIN_BUFFER);
|
|
context = contextWrapper;
|
|
if (err == 1)
|
|
return 0; // no devices capable of requested SVM level, so don't execute
|
|
// but count test as passing.
|
|
if (err < 0) return -1; // fail test.
|
|
|
|
cl_uint num_devices_plus_host = num_devices + 1;
|
|
|
|
k1 = clCreateKernel(program, "write_owned_locations", &err);
|
|
test_error(err, "clCreateKernel failed");
|
|
k2 = clCreateKernel(program, "sum_neighbor_locations", &err);
|
|
test_error(err, "clCreateKernel failed");
|
|
|
|
|
|
cl_char *pA = (cl_char *)clSVMAlloc(
|
|
context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER,
|
|
sizeof(cl_char) * num_elements, 0);
|
|
|
|
cl_uint **error_counts = (cl_uint **)malloc(sizeof(void *) * num_devices);
|
|
|
|
for (cl_uint i = 0; i < num_devices; i++)
|
|
{
|
|
error_counts[i] = (cl_uint *)clSVMAlloc(
|
|
context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER,
|
|
sizeof(cl_uint), 0);
|
|
*error_counts[i] = 0;
|
|
}
|
|
for (int i = 0; i < num_elements; i++) pA[i] = -1;
|
|
|
|
err |= clSetKernelArgSVMPointer(k1, 0, pA);
|
|
err |= clSetKernelArg(k1, 1, sizeof(cl_uint), &num_devices_plus_host);
|
|
test_error(err, "clSetKernelArg failed");
|
|
|
|
// get all the devices going simultaneously
|
|
size_t element_num = num_elements;
|
|
for (cl_uint d = 0; d < num_devices; d++) // device ids starting at 1.
|
|
{
|
|
err = clSetKernelArg(k1, 2, sizeof(cl_uint), &d);
|
|
test_error(err, "clSetKernelArg failed");
|
|
err = clEnqueueNDRangeKernel(queues[d], k1, 1, NULL, &element_num, NULL,
|
|
0, NULL, NULL);
|
|
test_error(err, "clEnqueueNDRangeKernel failed");
|
|
}
|
|
|
|
for (cl_uint d = 0; d < num_devices; d++) clFlush(queues[d]);
|
|
|
|
cl_uint host_id =
|
|
num_devices; // host code will take the id above the devices.
|
|
for (int i = (int)num_devices; i < num_elements; i += num_devices_plus_host)
|
|
pA[i] = host_id;
|
|
|
|
for (cl_uint id = 0; id < num_devices; id++) clFinish(queues[id]);
|
|
|
|
// now check that each device can see the byte writes made by the other
|
|
// devices.
|
|
|
|
err |= clSetKernelArgSVMPointer(k2, 0, pA);
|
|
err |= clSetKernelArg(k2, 1, sizeof(cl_uint), &num_devices_plus_host);
|
|
test_error(err, "clSetKernelArg failed");
|
|
|
|
// adjusted so k2 doesn't read past end of buffer
|
|
size_t adjusted_num_elements = num_elements - num_devices;
|
|
for (cl_uint id = 0; id < num_devices; id++)
|
|
{
|
|
err = clSetKernelArgSVMPointer(k2, 2, error_counts[id]);
|
|
test_error(err, "clSetKernelArg failed");
|
|
|
|
err =
|
|
clEnqueueNDRangeKernel(queues[id], k2, 1, NULL,
|
|
&adjusted_num_elements, NULL, 0, NULL, NULL);
|
|
test_error(err, "clEnqueueNDRangeKernel failed");
|
|
}
|
|
|
|
for (cl_uint id = 0; id < num_devices; id++) clFinish(queues[id]);
|
|
|
|
bool failed = false;
|
|
|
|
// see if any of the devices found errors
|
|
for (cl_uint i = 0; i < num_devices; i++)
|
|
{
|
|
if (*error_counts[i] > 0) failed = true;
|
|
}
|
|
cl_uint expected =
|
|
(num_devices_plus_host * (num_devices_plus_host - 1)) / 2;
|
|
// check that host can see the byte writes made by the devices.
|
|
for (cl_uint i = 0; i < num_elements - num_devices_plus_host; i++)
|
|
{
|
|
int sum = 0;
|
|
for (cl_uint j = 0; j < num_devices_plus_host; j++) sum += pA[i + j];
|
|
if (sum != expected) failed = true;
|
|
}
|
|
|
|
clSVMFree(context, pA);
|
|
for (cl_uint i = 0; i < num_devices; i++)
|
|
clSVMFree(context, error_counts[i]);
|
|
|
|
if (failed) return -1;
|
|
return 0;
|
|
}
|