mirror of
https://github.com/KhronosGroup/OpenCL-CTS.git
synced 2026-03-21 23:09:01 +00:00
Initial open source release of OpenCL 2.2 CTS.
This commit is contained in:
12
test_conformance/clcpp/synchronization/CMakeLists.txt
Normal file
12
test_conformance/clcpp/synchronization/CMakeLists.txt
Normal file
@@ -0,0 +1,12 @@
|
||||
# Name of this test module; it also prefixes the source-list variable below.
set(MODULE_NAME CPP_SYNCHRONIZATION)

# Test entry point plus the shared harness sources built into the module.
set(${MODULE_NAME}_SOURCES
    main.cpp
    ../../../test_common/harness/errorHelpers.c
    ../../../test_common/harness/testHarness.c
    ../../../test_common/harness/kernelHelpers.c
    ../../../test_common/harness/msvc9.c
    ../../../test_common/harness/parseParameters.cpp
)

# Shared build rules; presumably consume MODULE_NAME and ${MODULE_NAME}_SOURCES.
include(../../CMakeCommon.txt)
|
||||
32
test_conformance/clcpp/synchronization/main.cpp
Normal file
32
test_conformance/clcpp/synchronization/main.cpp
Normal file
@@ -0,0 +1,32 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#include "../common.hpp"
|
||||
|
||||
#include "test_work_group_barrier.hpp"
|
||||
#include "test_sub_group_barrier.hpp"
|
||||
#include "named_barrier/test_spec_example.hpp"
|
||||
#include "named_barrier/test_named_barrier.hpp"
|
||||
|
||||
int main(int argc, const char *argv[])
|
||||
{
|
||||
// Get list to all test functions
|
||||
std::vector<basefn> testfn_list = autotest::test_suite::get_test_functions();
|
||||
// Get names of all test functions
|
||||
std::vector<std::string> testfn_names = autotest::test_suite::get_test_names();
|
||||
// Create a vector of pointers to the names test functions
|
||||
std::vector<const char *> testfn_names_c_str = autotest::get_strings_ptrs(testfn_names);
|
||||
return runTestHarness(argc, argv, testfn_list.size(), testfn_list.data(), testfn_names_c_str.data(), false, false, 0);
|
||||
}
|
||||
171
test_conformance/clcpp/synchronization/named_barrier/common.hpp
Normal file
171
test_conformance/clcpp/synchronization/named_barrier/common.hpp
Normal file
@@ -0,0 +1,171 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#ifndef TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_NAMED_BARRIER_COMMON_HPP
|
||||
#define TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_NAMED_BARRIER_COMMON_HPP
|
||||
|
||||
#include <vector>
|
||||
|
||||
// Common for all OpenCL C++ tests
|
||||
#include "../../common.hpp"
|
||||
#include "../../funcs_test_utils.hpp"
|
||||
|
||||
// Runs one named-barrier test object (TEST_CLASS) through
// run_work_group_named_barrier_barrier_test() and ORs its result into `error`.
// The expansion relies on `device`, `context`, `queue`, `num_elements`,
// `last_error` and `error` being declared at the point of use, and it already
// ends with a semicolon, so call sites do not add one.
#define RUN_WG_NAMED_BARRIER_TEST_MACRO(TEST_CLASS)                 \
    last_error = run_work_group_named_barrier_barrier_test(         \
        device, context, queue, num_elements, TEST_CLASS            \
    );                                                              \
    CHECK_ERROR(last_error)                                         \
    error |= last_error;
|
||||
|
||||
namespace named_barrier {
|
||||
|
||||
// Common interface of the work_group_named_barrier tests. A concrete test
// provides its name, the program source and the expected output values; the
// remaining members have defaults that a test may replace.
struct work_group_named_barrier_test_base : public detail::base_func_type<cl_uint>
{
    // Returns test name
    virtual std::string str() = 0;

    // Returns OpenCL program source.
    // It's assumed that this program has only one kernel.
    virtual std::string generate_program() = 0;

    // Returns the value that is expected to be in output_buffer[i]
    virtual cl_uint operator()(size_t i, size_t work_group_size, size_t max_sub_group_size) = 0;

    // Kernel execution.
    // This covers the typical case: the kernel is executed once and has only
    // one argument, which is the output buffer.
    //
    // NOTE(review): `queue` is taken by const reference here; a derived test
    // that declares it by value hides this function instead of overriding it.
    virtual cl_int execute(const cl_kernel kernel,
                           const cl_mem output_buffer,
                           const cl_command_queue& queue,
                           const size_t work_size,
                           const size_t work_group_size)
    {
        cl_int err = clSetKernelArg(kernel, 0, sizeof(output_buffer), &output_buffer);
        RETURN_ON_CL_ERROR(err, "clSetKernelArg")

        // One-dimensional range, no global offset, no event dependencies
        err = clEnqueueNDRangeKernel(
            queue, kernel, 1,
            NULL, &work_size, &work_group_size,
            0, NULL, NULL
        );
        RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel")
        return err;
    }

    // Calculates maximal work-group size (one dim): the smaller of the
    // requested default and what the device reports for this kernel.
    // The status of the query is stored in `error`; callers are expected to
    // check it before using the returned size.
    virtual size_t get_max_local_size(const cl_kernel kernel,
                                      const cl_device_id device,
                                      const size_t work_group_size, // default work-group size
                                      cl_int& error)
    {
        size_t max_wg_size;
        error = clGetKernelWorkGroupInfo(
            kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_wg_size, NULL
        );
        // Same macro as every other CL call in these tests, so that the name
        // of the failing function is passed along with the error.
        RETURN_ON_CL_ERROR(error, "clGetKernelWorkGroupInfo")
        return (std::min)(work_group_size, max_wg_size);
    }

    // Tells the runner whether the global size has to be rounded up to a
    // whole number of work-groups (uniform work-groups). Off by default.
    virtual bool enforce_uniform()
    {
        return false;
    }
};
|
||||
|
||||
template <class work_group_named_barrier_test>
|
||||
int run_work_group_named_barrier_barrier_test(cl_device_id device, cl_context context, cl_command_queue queue,
|
||||
size_t count, work_group_named_barrier_test test)
|
||||
{
|
||||
cl_mem buffers[1];
|
||||
cl_program program;
|
||||
cl_kernel kernel;
|
||||
size_t work_group_size;
|
||||
size_t work_size[1];
|
||||
cl_int err;
|
||||
|
||||
std::string code_str = test.generate_program();
|
||||
std::string kernel_name = test.get_kernel_name();
|
||||
// -----------------------------------------------------------------------------------
|
||||
// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
|
||||
// -----------------------------------------------------------------------------------
|
||||
// Only OpenCL C++ to SPIR-V compilation
|
||||
#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
|
||||
err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name);
|
||||
return err;
|
||||
// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
|
||||
#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
|
||||
err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name, "-cl-std=CL2.0", false);
|
||||
RETURN_ON_ERROR(err)
|
||||
#else
|
||||
err = create_opencl_kernel(context, &program, &kernel, code_str, kernel_name);
|
||||
RETURN_ON_ERROR(err)
|
||||
#endif
|
||||
|
||||
// Find the max possible wg size for among all the kernels
|
||||
work_group_size = test.get_max_local_size(kernel, device, 256, err);
|
||||
RETURN_ON_ERROR(err);
|
||||
if(work_group_size == 0)
|
||||
{
|
||||
log_info("SKIPPED: Can't produce local size with enough sub-groups. Skipping tests.\n");
|
||||
return CL_SUCCESS;
|
||||
}
|
||||
|
||||
work_size[0] = count;
|
||||
// uniform work-group
|
||||
if(test.enforce_uniform())
|
||||
{
|
||||
size_t wg_number = static_cast<size_t>(
|
||||
std::ceil(static_cast<double>(work_size[0]) / work_group_size)
|
||||
);
|
||||
work_size[0] = wg_number * work_group_size;
|
||||
}
|
||||
|
||||
// host output vector
|
||||
std::vector<cl_uint> output = generate_output<cl_uint>(work_size[0], 9999);
|
||||
|
||||
// device output buffer
|
||||
buffers[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), sizeof(cl_uint) * output.size(), NULL, &err);
|
||||
RETURN_ON_CL_ERROR(err, "clCreateBuffer")
|
||||
|
||||
// Execute test kernels
|
||||
err = test.execute(kernel, buffers[0], queue, work_size[0], work_group_size);
|
||||
RETURN_ON_ERROR(err)
|
||||
|
||||
err = clEnqueueReadBuffer(
|
||||
queue, buffers[0], CL_TRUE, 0, sizeof(cl_uint) * output.size(),
|
||||
static_cast<void *>(output.data()), 0, NULL, NULL
|
||||
);
|
||||
RETURN_ON_CL_ERROR(err, "clEnqueueReadBuffer")
|
||||
|
||||
// Check output values
|
||||
for(size_t i = 0; i < output.size(); i++)
|
||||
{
|
||||
cl_uint v = test(i, work_group_size, i);
|
||||
if(!(are_equal(v, output[i], ::detail::make_value<cl_uint>(0), test)))
|
||||
{
|
||||
RETURN_ON_ERROR_MSG(-1,
|
||||
"test_%s(%s) failed. Expected: %s, got: %s", test.str().c_str(), type_name<cl_uint>().c_str(),
|
||||
format_value(v).c_str(), format_value(output[i]).c_str()
|
||||
);
|
||||
}
|
||||
}
|
||||
log_info("test_%s(%s) passed\n", test.str().c_str(), type_name<cl_uint>().c_str());
|
||||
|
||||
clReleaseMemObject(buffers[0]);
|
||||
clReleaseKernel(kernel);
|
||||
clReleaseProgram(program);
|
||||
return err;
|
||||
}
|
||||
|
||||
} // namespace named_barrier
|
||||
|
||||
#endif // TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_NAMED_BARRIER_COMMON_HPP
|
||||
@@ -0,0 +1,490 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#ifndef TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_NAMED_BARRIER_TEST_NAMED_BARRIER_HPP
|
||||
#define TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_NAMED_BARRIER_TEST_NAMED_BARRIER_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
namespace named_barrier {
|
||||
|
||||
struct local_fence_named_barrier_test : public work_group_named_barrier_test_base
|
||||
{
|
||||
std::string str()
|
||||
{
|
||||
return "local_fence";
|
||||
}
|
||||
|
||||
// Return value that is expected to be in output_buffer[i]
|
||||
cl_uint operator()(size_t i, size_t work_group_size, size_t max_sub_group_size)
|
||||
{
|
||||
return static_cast<cl_uint>(i);
|
||||
}
|
||||
|
||||
// At the end every work-item writes its global id to ouput[work-item-global-id].
|
||||
std::string generate_program()
|
||||
{
|
||||
// -----------------------------------------------------------------------------------
|
||||
// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
|
||||
// -----------------------------------------------------------------------------------
|
||||
#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
|
||||
return
|
||||
"__kernel void " + this->get_kernel_name() + "(global uint *output, "
|
||||
"local uint * lmem)\n"
|
||||
"{\n"
|
||||
" size_t gid = get_global_id(0);\n"
|
||||
" output[gid] = gid;\n"
|
||||
"}\n";
|
||||
|
||||
#else
|
||||
return
|
||||
"#define cl_khr_subgroup_named_barrier\n"
|
||||
"#include <opencl_memory>\n"
|
||||
"#include <opencl_work_item>\n"
|
||||
"#include <opencl_synchronization>\n"
|
||||
"using namespace cl;\n"
|
||||
"__kernel void " + this->get_kernel_name() + "(global_ptr<uint[]> output, "
|
||||
"local_ptr<uint[]> lmem)\n"
|
||||
"{\n\n"
|
||||
" local<work_group_named_barrier> a(1);\n"
|
||||
" local<work_group_named_barrier> b(2);\n"
|
||||
" size_t gid = get_global_id(0);\n"
|
||||
" size_t lid = get_local_id(0);\n"
|
||||
" size_t value;\n"
|
||||
" if(get_num_sub_groups() == 1)\n"
|
||||
" {\n"
|
||||
" size_t other_lid = (lid + 1) % get_enqueued_local_size(0);\n"
|
||||
" size_t other_gid = (gid - lid) + other_lid;\n"
|
||||
" lmem[other_lid] = other_gid;\n"
|
||||
" a.wait(mem_fence::local);\n"
|
||||
" value = lmem[lid];" // lmem[lid] shoule be equal to gid
|
||||
" }\n"
|
||||
" else if(get_num_sub_groups() == 2)\n"
|
||||
" {\n"
|
||||
" size_t other_lid = (lid + get_max_sub_group_size()) % get_enqueued_local_size(0);\n"
|
||||
" size_t other_gid = (gid - lid) + other_lid;\n"
|
||||
" lmem[other_lid] = other_gid;\n"
|
||||
" b.wait(mem_fence::local);\n"
|
||||
" value = lmem[lid];" // lmem[lid] shoule be equal to gid
|
||||
" }\n"
|
||||
" else if(get_num_sub_groups() > 2)\n"
|
||||
" {\n"
|
||||
" if(get_sub_group_id() < 2)\n"
|
||||
" {\n"
|
||||
" const size_t two_first_subgroups = 2 * get_max_sub_group_size();"
|
||||
// local and global id of some work-item outside of work-item subgroup,
|
||||
// but within subgroups 0 and 1.
|
||||
" size_t other_lid = (lid + get_max_sub_group_size()) % two_first_subgroups;\n"
|
||||
" size_t other_gid = (gid - lid) + other_lid;\n"
|
||||
" lmem[other_lid] = other_gid;\n"
|
||||
" b.wait(mem_fence::local);\n" // subgroup 0 and 1 are sync (local)
|
||||
" value = lmem[lid];" // lmem[lid] shoule be equal to gid
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" value = gid;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" output[gid] = value;\n"
|
||||
"}\n";
|
||||
#endif
|
||||
}
|
||||
|
||||
size_t get_max_local_size(const cl_kernel kernel,
|
||||
const cl_device_id device,
|
||||
const size_t work_group_size, // default work-group size
|
||||
cl_int& error)
|
||||
{
|
||||
// Set size of the local memory, we need to to this to correctly calculate
|
||||
// max possible work-group size.
|
||||
size_t wg_size;
|
||||
for(wg_size = work_group_size; wg_size > 1; wg_size /= 2)
|
||||
{
|
||||
error = clSetKernelArg(kernel, 1, wg_size * sizeof(cl_uint), NULL);
|
||||
RETURN_ON_CL_ERROR(error, "clSetKernelArg")
|
||||
|
||||
size_t max_wg_size;
|
||||
error = clGetKernelWorkGroupInfo(
|
||||
kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_wg_size, NULL
|
||||
);
|
||||
RETURN_ON_CL_ERROR(error, "clGetKernelWorkGroupInfo")
|
||||
if(max_wg_size >= wg_size) break;
|
||||
}
|
||||
return wg_size;
|
||||
}
|
||||
|
||||
cl_int execute(const cl_kernel kernel,
|
||||
const cl_mem output_buffer,
|
||||
const cl_command_queue queue,
|
||||
const size_t work_size,
|
||||
const size_t work_group_size)
|
||||
{
|
||||
cl_int err;
|
||||
// Get context from queue
|
||||
cl_context context;
|
||||
err = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &context, NULL);
|
||||
RETURN_ON_CL_ERROR(err, "clGetCommandQueueInfo")
|
||||
|
||||
err = clSetKernelArg(kernel, 0, sizeof(output_buffer), &output_buffer);
|
||||
err |= clSetKernelArg(kernel, 1, work_group_size * sizeof(cl_uint), NULL);
|
||||
RETURN_ON_CL_ERROR(err, "clSetKernelArg")
|
||||
|
||||
err = clEnqueueNDRangeKernel(
|
||||
queue, kernel, 1,
|
||||
NULL, &work_size, &work_group_size,
|
||||
0, NULL, NULL
|
||||
);
|
||||
RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel")
|
||||
|
||||
err = clFinish(queue);
|
||||
return err;
|
||||
}
|
||||
};
|
||||
|
||||
struct global_fence_named_barrier_test : public work_group_named_barrier_test_base
|
||||
{
|
||||
std::string str()
|
||||
{
|
||||
return "global_fence";
|
||||
}
|
||||
|
||||
// Return value that is expected to be in output_buffer[i]
|
||||
cl_uint operator()(size_t i, size_t work_group_size, size_t max_sub_group_size)
|
||||
{
|
||||
return static_cast<cl_uint>(i % work_group_size);
|
||||
}
|
||||
|
||||
// At the end every work-item writes its local id to ouput[work-item-global-id].
|
||||
std::string generate_program()
|
||||
{
|
||||
// -----------------------------------------------------------------------------------
|
||||
// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
|
||||
// -----------------------------------------------------------------------------------
|
||||
#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
|
||||
return
|
||||
"__kernel void " + this->get_kernel_name() + "(global uint * output, "
|
||||
"global uint * temp)\n"
|
||||
"{\n"
|
||||
"size_t gid = get_global_id(0);\n"
|
||||
"output[gid] = get_local_id(0);\n"
|
||||
"}\n";
|
||||
|
||||
#else
|
||||
return
|
||||
"#define cl_khr_subgroup_named_barrier\n"
|
||||
"#include <opencl_memory>\n"
|
||||
"#include <opencl_work_item>\n"
|
||||
"#include <opencl_synchronization>\n"
|
||||
"using namespace cl;\n"
|
||||
"__kernel void " + this->get_kernel_name() + "(global_ptr<uint[]> output, "
|
||||
"global_ptr<uint[]> temp)\n"
|
||||
"{\n\n"
|
||||
" local<work_group_named_barrier> a(1);\n"
|
||||
" local<work_group_named_barrier> b(2);\n"
|
||||
" size_t gid = get_global_id(0);\n"
|
||||
" size_t lid = get_local_id(0);\n"
|
||||
" size_t value;\n"
|
||||
" if(get_num_sub_groups() == 1)\n"
|
||||
" {\n"
|
||||
" size_t other_lid = (lid + 1) % get_enqueued_local_size(0);\n"
|
||||
" size_t other_gid = (gid - lid) + other_lid;\n"
|
||||
" temp[other_gid] = other_lid + 1;\n"
|
||||
" a.wait(mem_fence::global);\n"
|
||||
" size_t other_lid_same_subgroup = (lid + 2) % get_sub_group_size();\n"
|
||||
" size_t other_gid_same_subgroup = (gid - lid) + other_lid_same_subgroup;\n"
|
||||
" temp[other_gid_same_subgroup] = temp[other_gid_same_subgroup] - 1;\n"
|
||||
" a.wait(mem_fence::global, memory_scope_sub_group);\n"
|
||||
" value = temp[gid];" // temp[gid] shoule be equal to lid
|
||||
" }\n"
|
||||
" else if(get_num_sub_groups() == 2)\n"
|
||||
" {\n"
|
||||
" size_t other_lid = (lid + get_max_sub_group_size()) % get_enqueued_local_size(0);\n"
|
||||
" size_t other_gid = (gid - lid) + other_lid;\n"
|
||||
" temp[other_gid] = other_lid + 1;\n"
|
||||
" b.wait(mem_fence::global);\n" // both subgroups wait, both are sync
|
||||
" size_t other_lid_same_subgroup = "
|
||||
"((lid + 1) % get_sub_group_size()) + (get_sub_group_id() * get_sub_group_size());\n"
|
||||
" size_t other_gid_same_subgroup = (gid - lid) + other_lid_same_subgroup;\n"
|
||||
" temp[other_gid_same_subgroup] = temp[other_gid_same_subgroup] - 1;\n"
|
||||
" b.wait(mem_fence::global, memory_scope_sub_group);\n" // both subgroups wait, sync only within subgroup
|
||||
" value = temp[gid];" // temp[gid] shoule be equal to lid
|
||||
" }\n"
|
||||
" else if(get_num_sub_groups() > 2)\n"
|
||||
" {\n"
|
||||
" if(get_sub_group_id() < 2)\n"
|
||||
" {\n"
|
||||
" const size_t two_first_subgroups = 2 * get_max_sub_group_size();"
|
||||
// local and global id of some work-item outside of work-item subgroup,
|
||||
// but within subgroups 0 and 1.
|
||||
" size_t other_lid = (lid + get_max_sub_group_size()) % two_first_subgroups;\n"
|
||||
" size_t other_gid = (gid - lid) + other_lid;\n"
|
||||
" temp[other_gid] = other_lid + 1;\n"
|
||||
" b.wait(mem_fence::global);\n" // both subgroups wait, both are sync
|
||||
// local and global id of some other work-item within work-item subgroup
|
||||
" size_t other_lid_same_subgroup = "
|
||||
"((lid + 1) % get_sub_group_size()) + (get_sub_group_id() * get_sub_group_size());\n"
|
||||
" size_t other_gid_same_subgroup = (gid - lid) + other_lid_same_subgroup;\n"
|
||||
" temp[other_gid_same_subgroup] = temp[other_gid_same_subgroup] - 1;\n"
|
||||
" b.wait(mem_fence::global, memory_scope_sub_group);\n" // both subgroups wait, sync only within subgroup
|
||||
" value = temp[gid];" // temp[gid] shoule be equal to lid
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" value = lid;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" output[gid] = value;\n"
|
||||
"}\n";
|
||||
#endif
|
||||
}
|
||||
|
||||
size_t get_max_local_size(const cl_kernel kernel,
|
||||
const cl_device_id device,
|
||||
const size_t work_group_size, // default work-group size
|
||||
cl_int& error)
|
||||
{
|
||||
size_t max_wg_size;
|
||||
error = clGetKernelWorkGroupInfo(
|
||||
kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_wg_size, NULL
|
||||
);
|
||||
RETURN_ON_CL_ERROR(error, "clGetKernelWorkGroupInfo")
|
||||
return (std::min)(max_wg_size, work_group_size);
|
||||
}
|
||||
|
||||
cl_int execute(const cl_kernel kernel,
|
||||
const cl_mem output_buffer,
|
||||
const cl_command_queue queue,
|
||||
const size_t work_size,
|
||||
const size_t work_group_size)
|
||||
{
|
||||
cl_int err;
|
||||
// Get context from queue
|
||||
cl_context context;
|
||||
err = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &context, NULL);
|
||||
RETURN_ON_CL_ERROR(err, "clGetCommandQueueInfo")
|
||||
|
||||
// create temp buffer
|
||||
auto temp_buffer = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), sizeof(cl_uint) * work_size, NULL, &err);
|
||||
RETURN_ON_CL_ERROR(err, "clCreateBuffer")
|
||||
|
||||
err = clSetKernelArg(kernel, 0, sizeof(output_buffer), &output_buffer);
|
||||
err |= clSetKernelArg(kernel, 1, sizeof(temp_buffer), &temp_buffer);
|
||||
RETURN_ON_CL_ERROR(err, "clSetKernelArg")
|
||||
|
||||
err = clEnqueueNDRangeKernel(
|
||||
queue, kernel, 1,
|
||||
NULL, &work_size, &work_group_size,
|
||||
0, NULL, NULL
|
||||
);
|
||||
RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel")
|
||||
|
||||
err = clFinish(queue);
|
||||
err |= clReleaseMemObject(temp_buffer);
|
||||
|
||||
return err;
|
||||
}
|
||||
};
|
||||
|
||||
struct global_local_fence_named_barrier_test : public work_group_named_barrier_test_base
|
||||
{
|
||||
std::string str()
|
||||
{
|
||||
return "global_local_fence";
|
||||
}
|
||||
|
||||
// Return value that is expected to be in output_buffer[i]
|
||||
cl_uint operator()(size_t i, size_t work_group_size, size_t max_sub_group_size)
|
||||
{
|
||||
return static_cast<cl_uint>(i % work_group_size);
|
||||
}
|
||||
|
||||
// At the end every work-item writes its local id to ouput[work-item-global-id].
|
||||
std::string generate_program()
|
||||
{
|
||||
// -----------------------------------------------------------------------------------
|
||||
// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
|
||||
// -----------------------------------------------------------------------------------
|
||||
#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
|
||||
return
|
||||
"__kernel void " + this->get_kernel_name() + "(global uint * output, "
|
||||
"global uint * temp,"
|
||||
"local uint * lmem)\n"
|
||||
"{\n"
|
||||
"size_t gid = get_global_id(0);\n"
|
||||
"output[gid] = get_local_id(0);\n"
|
||||
"}\n";
|
||||
|
||||
#else
|
||||
return
|
||||
"#define cl_khr_subgroup_named_barrier\n"
|
||||
"#include <opencl_memory>\n"
|
||||
"#include <opencl_work_item>\n"
|
||||
"#include <opencl_synchronization>\n"
|
||||
"using namespace cl;\n"
|
||||
"__kernel void " + this->get_kernel_name() + "(global_ptr<uint[]> output, "
|
||||
"global_ptr<uint[]> temp,"
|
||||
"local_ptr<uint[]> lmem)\n"
|
||||
"{\n\n"
|
||||
" local<work_group_named_barrier> a(1);\n"
|
||||
" local<work_group_named_barrier> b(2);\n"
|
||||
" size_t gid = get_global_id(0);\n"
|
||||
" size_t lid = get_local_id(0);\n"
|
||||
" size_t value = 0;\n"
|
||||
" if(get_num_sub_groups() == 1)\n"
|
||||
" {\n"
|
||||
" size_t other_lid = (lid + 1) % get_enqueued_local_size(0);\n"
|
||||
" size_t other_gid = (gid - lid) + other_lid;\n"
|
||||
" lmem[other_lid] = other_gid;\n"
|
||||
" temp[other_gid] = other_lid;\n"
|
||||
" a.wait(mem_fence::local | mem_fence::global);\n"
|
||||
" if(lmem[lid] == gid) value = temp[gid];\n"
|
||||
" }\n"
|
||||
" else if(get_num_sub_groups() == 2)\n"
|
||||
" {\n"
|
||||
" size_t other_lid = (lid + get_max_sub_group_size()) % get_enqueued_local_size(0);\n"
|
||||
" size_t other_gid = (gid - lid) + other_lid;\n"
|
||||
" lmem[other_lid] = other_gid;\n"
|
||||
" temp[other_gid] = other_lid;\n"
|
||||
" b.wait(mem_fence::local | mem_fence::global);\n"
|
||||
" if(lmem[lid] == gid) value = temp[gid];\n"
|
||||
" }\n"
|
||||
" else if(get_num_sub_groups() > 2)\n"
|
||||
" {\n"
|
||||
" if(get_sub_group_id() < 2)\n"
|
||||
" {\n"
|
||||
" const size_t two_first_subgroups = 2 * get_max_sub_group_size();"
|
||||
// local and global id of some work-item outside of work-item subgroup,
|
||||
// but within subgroups 0 and 1.
|
||||
" size_t other_lid = (lid + get_max_sub_group_size()) % two_first_subgroups;\n"
|
||||
" size_t other_gid = (gid - lid) + other_lid;\n"
|
||||
" lmem[other_lid] = other_gid;\n"
|
||||
" temp[other_gid] = other_lid;\n"
|
||||
" b.wait(mem_fence::local | mem_fence::global);\n"
|
||||
" if(lmem[lid] == gid) value = temp[gid];\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" value = lid;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" output[gid] = value;\n"
|
||||
"}\n";
|
||||
#endif
|
||||
}
|
||||
|
||||
size_t get_max_local_size(const cl_kernel kernel,
|
||||
const cl_device_id device,
|
||||
const size_t work_group_size, // default work-group size
|
||||
cl_int& error)
|
||||
{
|
||||
// Set size of the local memory, we need to to this to correctly calculate
|
||||
// max possible work-group size.
|
||||
size_t wg_size;
|
||||
for(wg_size = work_group_size; wg_size > 1; wg_size /= 2)
|
||||
{
|
||||
error = clSetKernelArg(kernel, 2, wg_size * sizeof(cl_uint), NULL);
|
||||
RETURN_ON_CL_ERROR(error, "clSetKernelArg")
|
||||
|
||||
size_t max_wg_size;
|
||||
error = clGetKernelWorkGroupInfo(
|
||||
kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_wg_size, NULL
|
||||
);
|
||||
RETURN_ON_CL_ERROR(error, "clGetKernelWorkGroupInfo")
|
||||
if(max_wg_size >= wg_size) break;
|
||||
}
|
||||
return wg_size;
|
||||
}
|
||||
|
||||
cl_int execute(const cl_kernel kernel,
|
||||
const cl_mem output_buffer,
|
||||
const cl_command_queue queue,
|
||||
const size_t work_size,
|
||||
const size_t work_group_size)
|
||||
{
|
||||
cl_int err;
|
||||
// Get context from queue
|
||||
cl_context context;
|
||||
err = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &context, NULL);
|
||||
RETURN_ON_CL_ERROR(err, "clGetCommandQueueInfo")
|
||||
|
||||
// create temp buffer
|
||||
auto temp_buffer = clCreateBuffer(
|
||||
context, (cl_mem_flags)(CL_MEM_READ_WRITE),
|
||||
sizeof(cl_uint) * work_size, NULL, &err
|
||||
);
|
||||
RETURN_ON_CL_ERROR(err, "clCreateBuffer")
|
||||
|
||||
err = clSetKernelArg(kernel, 0, sizeof(output_buffer), &output_buffer);
|
||||
err |= clSetKernelArg(kernel, 1, sizeof(temp_buffer), &temp_buffer);
|
||||
err |= clSetKernelArg(kernel, 2, work_group_size * sizeof(cl_uint), NULL);
|
||||
RETURN_ON_CL_ERROR(err, "clSetKernelArg")
|
||||
|
||||
err = clEnqueueNDRangeKernel(
|
||||
queue, kernel, 1,
|
||||
NULL, &work_size, &work_group_size,
|
||||
0, NULL, NULL
|
||||
);
|
||||
RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel")
|
||||
|
||||
err = clFinish(queue);
|
||||
err |= clReleaseMemObject(temp_buffer);
|
||||
|
||||
return err;
|
||||
}
|
||||
};
|
||||
|
||||
// ------------------------------------------------------------------------------
|
||||
// -------------------------- RUN TESTS -----------------------------------------
|
||||
// ------------------------------------------------------------------------------
|
||||
// Runs the local, global and global+local fence named-barrier tests.
// Outside of the development configurations the test is skipped when the
// device lacks cl_khr_subgroup_named_barrier, and it fails when the device
// reports fewer than 8 named barriers. Returns CL_SUCCESS or -1.
AUTO_TEST_CASE(test_work_group_named_barrier)
(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
{
    // Both names are used by RUN_WG_NAMED_BARRIER_TEST_MACRO
    int error = CL_SUCCESS;
    int last_error = CL_SUCCESS;

#if !(defined(DEVELOPMENT) && (defined(USE_OPENCLC_KERNELS) || defined(ONLY_SPIRV_COMPILATION)))
    const bool has_named_barriers = is_extension_available(device, "cl_khr_subgroup_named_barrier");
    if(!has_named_barriers)
    {
        log_info("SKIPPED: Extension `cl_khr_subgroup_named_barrier` is not supported. Skipping tests.\n");
        return CL_SUCCESS;
    }

    // An implementation shall support at least 8 named barriers per work-group. The exact
    // maximum number can be queried using clGetDeviceInfo with CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR
    // from the OpenCL 2.2 Extension Specification.
    cl_uint max_named_barriers;
    error = clGetDeviceInfo(
        device, CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR,
        sizeof(max_named_barriers), &max_named_barriers, NULL
    );
    RETURN_ON_CL_ERROR(error, "clGetDeviceInfo")

    if(!(max_named_barriers >= 8))
    {
        RETURN_ON_ERROR_MSG(-1, "Maximum number of named barriers must be at least 8.");
    }
#endif

    RUN_WG_NAMED_BARRIER_TEST_MACRO(local_fence_named_barrier_test())
    RUN_WG_NAMED_BARRIER_TEST_MACRO(global_fence_named_barrier_test())
    RUN_WG_NAMED_BARRIER_TEST_MACRO(global_local_fence_named_barrier_test())

    // Any accumulated failure is reported as -1
    return (error == CL_SUCCESS) ? error : -1;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif // TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_NAMED_BARRIER_TEST_NAMED_BARRIER_HPP
|
||||
@@ -0,0 +1,323 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#ifndef TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_NAMED_BARRIER_TEST_SPEC_EXAMPLE_HPP
|
||||
#define TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_NAMED_BARRIER_TEST_SPEC_EXAMPLE_HPP
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
namespace named_barrier {
|
||||
|
||||
// ------------------------------------------------------------------------------
|
||||
// ----------------------- SPECIFICATION EXAMPLE TEST----------------------------
|
||||
// ------------------------------------------------------------------------------
|
||||
// This test is based on the example in OpenCL C++ 1.0 specification (OpenCL C++
|
||||
// Standard Library > Synchronization Functions > Named barriers > wait).
|
||||
struct spec_example_work_group_named_barrier_test : public work_group_named_barrier_test_base
|
||||
{
|
||||
std::string str()
|
||||
{
|
||||
return "spec_example";
|
||||
}
|
||||
|
||||
// Return value that is expected to be in output_buffer[i]
|
||||
cl_uint operator()(size_t i, size_t work_group_size, size_t mas_sub_group_size)
|
||||
{
|
||||
return static_cast<cl_uint>(i);
|
||||
}
|
||||
|
||||
// At the end every work-item writes its global id to output[work-item-global-id].
|
||||
std::string generate_program()
|
||||
{
|
||||
// -----------------------------------------------------------------------------------
|
||||
// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
|
||||
// -----------------------------------------------------------------------------------
|
||||
#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
|
||||
return
|
||||
// In OpenCL C kernel we imitate subgroups by partitioning work-group (based on
|
||||
// local ids of work-items), work_group_named_barrier.wait(..) calls are replaced
|
||||
// with work_group_barriers.
|
||||
"__kernel void " + this->get_kernel_name() + "(global uint *output, "
|
||||
"global uint * temp, "
|
||||
"local uint * lmem)\n"
|
||||
"{\n"
|
||||
"size_t gid = get_global_id(0);\n"
|
||||
"size_t lid = get_local_id(0);\n"
|
||||
|
||||
// We divide work-group into ranges:
|
||||
// [0 - e_wg)[ew_g; q_wg)[q_wg; 3 * ew_g)[3 * ew_g; h_wg)[h_wg; get_local_size(0) - 1]
|
||||
// to simulate 8 subgroups
|
||||
"size_t h_wg = get_local_size(0) / 2;\n" // half of work-group
|
||||
"size_t q_wg = get_local_size(0) / 4;\n" // quarter
|
||||
"size_t e_wg = get_local_size(0) / 8;\n" // one-eighth
|
||||
|
||||
"if(lid < h_wg) lmem[lid] = gid;\n" // [0; h_wg)
|
||||
"else temp[gid] = gid;\n" // [h_wg; get_local_size(0) - 1)
|
||||
"work_group_barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
|
||||
"size_t other_lid = (lid + q_wg) % h_wg;\n"
|
||||
"size_t value = 0;\n"
|
||||
"if(lmem[other_lid] == ((gid - lid) + other_lid)){\n"
|
||||
" value = gid;\n"
|
||||
"}\n"
|
||||
"work_group_barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
|
||||
"if(lid < q_wg){\n" // [0; q_wg)
|
||||
" if(lid < e_wg) lmem[lid + e_wg] = gid;\n" // [0; e_wg)
|
||||
" else lmem[lid - e_wg] = gid;\n" // [e_wg; q_wg)
|
||||
"}\n"
|
||||
"else if(lid < h_wg) {\n" // [q_wg; h_wg)
|
||||
" if(lid < (3 * e_wg)) lmem[lid + e_wg] = gid;\n" // [q_ww; q_wg + e_wg)
|
||||
" else lmem[lid - e_wg] = gid;\n" // [q_wg + e_wg; h_wg)
|
||||
"}\n"
|
||||
"work_group_barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
|
||||
"if(lid < q_wg){\n" // [0; q_wg)
|
||||
" output[gid + q_wg] = lmem[lid];\n"
|
||||
"}\n"
|
||||
"else if(lid < h_wg) {\n" // [q_wg; h_wg)
|
||||
" output[gid - q_wg] = lmem[lid];\n"
|
||||
"}\n"
|
||||
"work_group_barrier(CLK_GLOBAL_MEM_FENCE);\n"
|
||||
|
||||
"if(lid < q_wg){\n" // [0; q_wg)
|
||||
" if(lid < e_wg) temp[gid] = output[gid + (3 * e_wg)];\n" // [0; e_wg)
|
||||
" else temp[gid] = output[gid + e_wg];\n" // [e_wg; q_wg)
|
||||
"}\n"
|
||||
"else if(lid < h_wg) {\n" // [q_wg; h_wg)
|
||||
" if(lid < (3 * e_wg)) temp[gid] = output[gid - e_wg];\n" // [q_ww; q_wg + e_wg)
|
||||
" else temp[gid] = output[gid - (3 * e_wg)];\n" // [q_wg + e_wg; h_wg)
|
||||
"}\n"
|
||||
"work_group_barrier(CLK_GLOBAL_MEM_FENCE);\n"
|
||||
|
||||
"output[gid] = temp[gid];\n"
|
||||
"}\n";
|
||||
|
||||
#else
|
||||
return
|
||||
"#define cl_khr_subgroup_named_barrier\n"
|
||||
"#include <opencl_memory>\n"
|
||||
"#include <opencl_work_item>\n"
|
||||
"#include <opencl_synchronization>\n"
|
||||
"using namespace cl;\n"
|
||||
|
||||
"void b_function(work_group_named_barrier &b, size_t value, local_ptr<uint[]> lmem)\n"
|
||||
"{\n\n"
|
||||
"size_t lid = get_local_id(0);\n"
|
||||
// Work-items from the 1st subgroup writes to local memory that will be
|
||||
// later read byt the 0th subgroup, and the other way around - 0th subgroup
|
||||
// writes what 1st subgroup will later read.
|
||||
// b.wait(mem_fence::local) should provide sync between those two subgroups.
|
||||
"if(get_sub_group_id() < 1) lmem[lid + get_max_sub_group_size()] = value;\n"
|
||||
"else lmem[lid - get_max_sub_group_size()] = value;\n"
|
||||
"b.wait(mem_fence::local);\n\n" // sync writes to lmem for 2 subgroups (ids: 0, 1)
|
||||
"}\n"
|
||||
|
||||
"__kernel void " + this->get_kernel_name() + "(global_ptr<uint[]> output, "
|
||||
"global_ptr<uint[]> temp, "
|
||||
"local_ptr<uint[]> lmem)\n"
|
||||
"{\n\n"
|
||||
"local<work_group_named_barrier> a(4);\n"
|
||||
"local<work_group_named_barrier> b(2);\n"
|
||||
"local<work_group_named_barrier> c(2);\n"
|
||||
|
||||
"size_t gid = get_global_id(0);\n"
|
||||
"size_t lid = get_local_id(0);\n"
|
||||
"if(get_sub_group_id() < 4)"
|
||||
"{\n"
|
||||
" lmem[lid] = gid;\n"
|
||||
" a.wait(mem_fence::local);\n" // sync writes to lmem for 4 subgroups (ids: 0, 1, 2, 3)
|
||||
// Now all four subgroups should see changes in lmem.
|
||||
" size_t other_lid = (lid + (2 * get_max_sub_group_size())) % (4 * get_max_sub_group_size());\n"
|
||||
" size_t value = 0;\n"
|
||||
" if(lmem[other_lid] == ((gid - lid) + other_lid)){\n"
|
||||
" value = gid;\n"
|
||||
" }\n"
|
||||
" a.wait(mem_fence::local);\n" // sync reads from lmem for 4 subgroups (ids: 0, 1, 2, 3)
|
||||
|
||||
" if(get_sub_group_id() < 2)" // ids: 0, 1
|
||||
" {\n"
|
||||
" b_function(b, value, lmem);\n"
|
||||
" }\n"
|
||||
" else" // ids: 2, 3
|
||||
" {\n"
|
||||
// Work-items from the 2nd subgroup writes to local memory that will be
|
||||
// later read byt the 3rd subgroup, and the other way around - 3rd subgroup
|
||||
// writes what 2nd subgroup will later read.
|
||||
// c.wait(mem_fence::local) should provide sync between those two subgroups.
|
||||
" if(get_sub_group_id() < 3) lmem[lid + get_max_sub_group_size()] = value ;\n"
|
||||
" else lmem[lid - get_max_sub_group_size()] = value;\n"
|
||||
" c.wait(mem_fence::local);\n" // sync writes to lmem for 2 subgroups (3, 4)
|
||||
" }\n"
|
||||
|
||||
// Now (0, 1) are in sync (local mem), and (3, 4) are in sync (local mem).
|
||||
// However, subgroups (0, 1) are not in sync with (3, 4).
|
||||
" if(get_sub_group_id() < 4) {\n" // ids: 0, 1, 2, 3
|
||||
" if(get_sub_group_id() < 2) output[gid + (2 * get_max_sub_group_size())] = lmem[lid];\n"
|
||||
" else output[gid - (2 * get_max_sub_group_size())] = lmem[lid];\n"
|
||||
" a.wait(mem_fence::global);\n" // sync writes to global memory (output)
|
||||
// for 4 subgroups (0, 1, 2, 3)
|
||||
" }\n"
|
||||
"}\n"
|
||||
"else {\n" // subgroups with id > 4
|
||||
" temp[gid] = gid;\n"
|
||||
"}\n"
|
||||
|
||||
// Now (0, 1, 2, 3) are in sync (global mem)
|
||||
"if(get_sub_group_id() < 2) {\n"
|
||||
" if(get_sub_group_id() < 1) temp[gid] = output[gid + (3 * get_max_sub_group_size())];\n"
|
||||
" else temp[gid] = output[gid + (get_max_sub_group_size())];\n"
|
||||
"}\n"
|
||||
"else if(get_sub_group_id() < 4) {\n"
|
||||
" if(get_sub_group_id() < 3) temp[gid] = output[gid - (get_max_sub_group_size())];\n"
|
||||
" else temp[gid] = output[gid - (3 * get_max_sub_group_size())];\n"
|
||||
"}\n"
|
||||
|
||||
// Synchronize the entire work-group (in terms of accesses to global memory)
|
||||
"work_group_barrier(mem_fence::global);\n"
|
||||
"output[gid] = temp[gid];\n\n"
|
||||
"}\n";
|
||||
#endif
|
||||
}
|
||||
|
||||
size_t get_max_local_size(const cl_kernel kernel,
|
||||
const cl_device_id device,
|
||||
const size_t work_group_size, // default work-group size
|
||||
cl_int& error)
|
||||
{
|
||||
// Set size of the local memory, we need to to this to correctly calculate
|
||||
// max possible work-group size.
|
||||
size_t wg_size;
|
||||
for(wg_size = work_group_size; wg_size > 1; wg_size /= 2)
|
||||
{
|
||||
error = clSetKernelArg(kernel, 2, ((wg_size / 2) + 1) * sizeof(cl_uint), NULL);
|
||||
RETURN_ON_CL_ERROR(error, "clSetKernelArg")
|
||||
|
||||
size_t max_wg_size;
|
||||
error = clGetKernelWorkGroupInfo(
|
||||
kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_wg_size, NULL
|
||||
);
|
||||
RETURN_ON_ERROR(error)
|
||||
if(max_wg_size >= wg_size) break;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
|
||||
// -----------------------------------------------------------------------------------
|
||||
#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
|
||||
// make sure wg_size is a multiple of 8
|
||||
if(wg_size % 8 > 0) wg_size -= (wg_size % 8);
|
||||
return wg_size;
|
||||
#else
|
||||
// make sure that wg_size will produce at least min_num_sub_groups
|
||||
// subgroups in each work-group
|
||||
size_t local_size[3] = { 1, 1, 1 };
|
||||
size_t min_num_sub_groups = 8;
|
||||
error = clGetKernelSubGroupInfo(kernel, device, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT,
|
||||
sizeof(size_t), &min_num_sub_groups,
|
||||
sizeof(size_t) * 3, &local_size, NULL);
|
||||
RETURN_ON_CL_ERROR(error, "clGetKernelSubGroupInfo")
|
||||
if (local_size[0] == 0 || local_size[1] != 1 || local_size[2] != 1)
|
||||
{
|
||||
if(min_num_sub_groups == 1)
|
||||
{
|
||||
RETURN_ON_ERROR_MSG(-1, "Can't produce local size with one subgroup")
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
local_size[0] = (std::min)(wg_size, local_size[0]);
|
||||
|
||||
// double-check
|
||||
size_t sub_group_count_for_ndrange;
|
||||
error = clGetKernelSubGroupInfo(kernel, device, CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE,
|
||||
sizeof(size_t) * 3, local_size,
|
||||
sizeof(size_t), &sub_group_count_for_ndrange, NULL);
|
||||
RETURN_ON_CL_ERROR(error, "clGetKernelSubGroupInfo")
|
||||
if (sub_group_count_for_ndrange < min_num_sub_groups)
|
||||
{
|
||||
RETURN_ON_ERROR_MSG(-1,
|
||||
"CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE did not return correct value (expected >=%lu, got %lu)",
|
||||
min_num_sub_groups, sub_group_count_for_ndrange
|
||||
)
|
||||
}
|
||||
|
||||
return local_size[0];
|
||||
#endif
|
||||
}
|
||||
|
||||
// Runs the test kernel once and waits for it to finish.
//
// kernel          - kernel created from generate_program().
// output_buffer   - buffer bound to kernel argument 0.
// queue           - command queue to enqueue on; its context is used to
//                   create the scratch buffer bound to argument 1.
// work_size       - global work size (1-D).
// work_group_size - local work size (1-D); also the number of cl_uint
//                   elements of local memory bound to argument 2.
//
// Returns CL_SUCCESS or the first OpenCL error encountered. The scratch
// buffer is released on every path once it has been created.
cl_int execute(const cl_kernel kernel,
               const cl_mem output_buffer,
               const cl_command_queue queue,
               const size_t work_size,
               const size_t work_group_size)
{
    cl_int err;

    // Get context from queue
    cl_context context;
    err = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &context, NULL);
    RETURN_ON_CL_ERROR(err, "clGetCommandQueueInfo")

    // Scratch buffer the kernel uses as `temp`.
    cl_mem temp_buffer = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), sizeof(cl_uint) * work_size, NULL, &err);
    RETURN_ON_CL_ERROR(err, "clCreateBuffer")

    // Stop at the first failing argument so that the reported code is a real
    // OpenCL error rather than a bitwise mix of several.
    err = clSetKernelArg(kernel, 0, sizeof(output_buffer), &output_buffer);
    if(err == CL_SUCCESS) err = clSetKernelArg(kernel, 1, sizeof(temp_buffer), &temp_buffer);
    if(err == CL_SUCCESS) err = clSetKernelArg(kernel, 2, work_group_size * sizeof(cl_uint), NULL);
    if(err != CL_SUCCESS) clReleaseMemObject(temp_buffer); // do not leak on the early return below
    RETURN_ON_CL_ERROR(err, "clSetKernelArg")

    err = clEnqueueNDRangeKernel(
        queue, kernel, 1,
        NULL, &work_size, &work_group_size,
        0, NULL, NULL
    );
    if(err != CL_SUCCESS) clReleaseMemObject(temp_buffer); // do not leak on the early return below
    RETURN_ON_CL_ERROR(err, "clEnqueueNDRangeKernel")

    // Wait for the kernel, then always release the scratch buffer.
    err = clFinish(queue);
    const cl_int release_err = clReleaseMemObject(temp_buffer);

    return (err != CL_SUCCESS) ? err : release_err;
}
|
||||
};
|
||||
|
||||
// ------------------------------------------------------------------------------
|
||||
// -------------------------- RUN TESTS -----------------------------------------
|
||||
// ------------------------------------------------------------------------------
|
||||
// Test entry point for the specification-example named barrier test.
// Outside of the development-only build modes the test is skipped (and
// reported as passing) when cl_khr_subgroup_named_barrier is unavailable.
AUTO_TEST_CASE(test_work_group_named_barrier_spec_example)
(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
{
#if !(defined(DEVELOPMENT) && (defined(USE_OPENCLC_KERNELS) || defined(ONLY_SPIRV_COMPILATION)))
    if(!is_extension_available(device, "cl_khr_subgroup_named_barrier"))
    {
        log_info("SKIPPED: Extension `cl_khr_subgroup_named_barrier` is not supported. Skipping tests.\n");
        return CL_SUCCESS;
    }
#endif

    // Both variables are kept under these exact names for
    // RUN_WG_NAMED_BARRIER_TEST_MACRO (defined outside this file), which
    // presumably reads/updates them - confirm against the macro.
    int error = CL_SUCCESS;
    int last_error = CL_SUCCESS;

    RUN_WG_NAMED_BARRIER_TEST_MACRO(spec_example_work_group_named_barrier_test())

    // Any failure is reported uniformly as -1.
    return (error != CL_SUCCESS) ? -1 : error;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif // TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_NAMED_BARRIER_TEST_SPEC_EXAMPLE_HPP
|
||||
@@ -0,0 +1,342 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#ifndef TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_TEST_SUB_GROUP_BARRIER_HPP
|
||||
#define TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_TEST_SUB_GROUP_BARRIER_HPP
|
||||
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <random>
|
||||
|
||||
// Common for all OpenCL C++ tests
|
||||
#include "../common.hpp"
|
||||
|
||||
|
||||
namespace test_sub_group_barrier {
|
||||
|
||||
// Memory scope that the tested sub-group barrier fences.
enum class barrier_type
{
    local,  // barrier with a local-memory fence
    global  // barrier with a global-memory fence
};
|
||||
|
||||
struct test_options
|
||||
{
|
||||
barrier_type barrier;
|
||||
size_t max_count;
|
||||
size_t num_tests;
|
||||
};
|
||||
|
||||
// Device-side helpers shared by every kernel variant generated in this file.
// They map a work-item to a neighbour inside its own sub-group.
const std::string source_common(R"(
    // Circular shift of sub-group local ids
    size_t get_shifted_local_id(int sub_group_local_id_delta)
    {
        const int sub_group_size = (int)get_sub_group_size();
        return (get_local_id(0) - get_sub_group_local_id()) +
            (((int)get_sub_group_local_id() + sub_group_local_id_delta) % sub_group_size + sub_group_size) % sub_group_size;
    }

    // Get global ids from shifted local ids
    size_t get_shifted_global_id(int sub_group_local_id_delta)
    {
        return get_group_id(0) * get_enqueued_local_size(0) + get_shifted_local_id(sub_group_local_id_delta);
    }
)");
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
|
||||
// -----------------------------------------------------------------------------------
|
||||
#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
|
||||
std::string generate_source(test_options options)
|
||||
{
|
||||
std::stringstream s;
|
||||
s << R"(
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
)";
|
||||
s << source_common;
|
||||
if (options.barrier == barrier_type::global)
|
||||
{
|
||||
s << R"(
|
||||
kernel void test(const int iter_lo, const int iter_hi, global long *output)
|
||||
{
|
||||
const size_t gid = get_shifted_global_id(0);
|
||||
|
||||
output[gid] = gid;
|
||||
sub_group_barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
|
||||
for (int i = iter_lo; i < iter_hi; i++)
|
||||
{
|
||||
const size_t other_gid = get_shifted_global_id(i);
|
||||
|
||||
output[other_gid] += other_gid;
|
||||
sub_group_barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
|
||||
output[gid] += gid;
|
||||
sub_group_barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
}
|
||||
}
|
||||
)";
|
||||
}
|
||||
else if (options.barrier == barrier_type::local)
|
||||
{
|
||||
s << R"(
|
||||
kernel void test(const int iter_lo, const int iter_hi, global long *output, local long *values)
|
||||
{
|
||||
const size_t gid = get_shifted_global_id(0);
|
||||
const size_t lid = get_shifted_local_id(0);
|
||||
|
||||
values[lid] = gid;
|
||||
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (int i = iter_lo; i < iter_hi; i++)
|
||||
{
|
||||
const size_t other_lid = get_shifted_local_id(i);
|
||||
const size_t other_gid = get_shifted_global_id(i);
|
||||
|
||||
values[other_lid] += other_gid;
|
||||
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
values[lid] += gid;
|
||||
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
output[gid] = values[lid];
|
||||
}
|
||||
)";
|
||||
}
|
||||
|
||||
return s.str();
|
||||
}
|
||||
#else
|
||||
std::string generate_source(test_options options)
|
||||
{
|
||||
std::stringstream s;
|
||||
s << R"(
|
||||
#include <opencl_memory>
|
||||
#include <opencl_work_item>
|
||||
#include <opencl_synchronization>
|
||||
|
||||
using namespace cl;
|
||||
|
||||
)";
|
||||
s << source_common;
|
||||
|
||||
if (options.barrier == barrier_type::global)
|
||||
{
|
||||
s << R"(
|
||||
kernel void test(const int iter_lo, const int iter_hi, global_ptr<long[]> output)
|
||||
{
|
||||
const size_t gid = get_shifted_global_id(0);
|
||||
|
||||
output[gid] = gid;
|
||||
sub_group_barrier(mem_fence::global);
|
||||
|
||||
for (int i = iter_lo; i < iter_hi; i++)
|
||||
{
|
||||
const size_t other_gid = get_shifted_global_id(i);
|
||||
|
||||
output[other_gid] += other_gid;
|
||||
sub_group_barrier(mem_fence::global);
|
||||
|
||||
output[gid] += gid;
|
||||
sub_group_barrier(mem_fence::global);
|
||||
}
|
||||
}
|
||||
)";
|
||||
}
|
||||
else if (options.barrier == barrier_type::local)
|
||||
{
|
||||
s << R"(
|
||||
kernel void test(const int iter_lo, const int iter_hi, global_ptr<long[]> output, local_ptr<long[]> values)
|
||||
{
|
||||
const size_t gid = get_shifted_global_id(0);
|
||||
const size_t lid = get_shifted_local_id(0);
|
||||
|
||||
values[lid] = gid;
|
||||
sub_group_barrier(mem_fence::local);
|
||||
|
||||
for (int i = iter_lo; i < iter_hi; i++)
|
||||
{
|
||||
const size_t other_lid = get_shifted_local_id(i);
|
||||
const size_t other_gid = get_shifted_global_id(i);
|
||||
|
||||
values[other_lid] += other_gid;
|
||||
sub_group_barrier(mem_fence::local);
|
||||
|
||||
values[lid] += gid;
|
||||
sub_group_barrier(mem_fence::local);
|
||||
}
|
||||
|
||||
output[gid] = values[lid];
|
||||
}
|
||||
)";
|
||||
}
|
||||
|
||||
return s.str();
|
||||
}
|
||||
#endif
|
||||
|
||||
// Builds the sub-group barrier kernel selected by `options.barrier`, launches
// it `options.num_tests` times with random global/local sizes and iteration
// ranges, and validates every output element.
//
// device, context, queue - OpenCL objects to run on.
// options                - barrier kind, maximum global size and number of
//                          randomized launches.
//
// Returns CL_SUCCESS when all launches produce the expected values (or when
// the test is skipped), otherwise an error code. The kernel, the program and
// the per-launch output buffer are released on every exit path after they
// have been created.
int test(cl_device_id device, cl_context context, cl_command_queue queue, test_options options)
{
    int error = CL_SUCCESS;

#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
    if (!is_extension_available(device, "cl_khr_subgroups"))
    {
        log_info("SKIPPED: Extension `cl_khr_subgroups` is not supported. Skipping tests.\n");
        return CL_SUCCESS;
    }
#endif

    cl_program program;
    cl_kernel kernel;

    std::string kernel_name = "test";
    std::string source = generate_source(options);

// -----------------------------------------------------------------------------------
// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
// -----------------------------------------------------------------------------------
// Only OpenCL C++ to SPIR-V compilation
#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
    error = create_opencl_kernel(
        context, &program, &kernel,
        source, kernel_name
    );
    RETURN_ON_ERROR(error)
    return error;
// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
    error = create_opencl_kernel(
        context, &program, &kernel,
        source, kernel_name, "-cl-std=CL2.0", false
    );
    RETURN_ON_ERROR(error)
// Normal run
#else
    error = create_opencl_kernel(
        context, &program, &kernel,
        source, kernel_name
    );
    RETURN_ON_ERROR(error)
#endif

    // Releases the kernel and the program however this function is left;
    // the RETURN_ON_* macros below return early.
    struct kernel_objects_guard
    {
        cl_program program;
        cl_kernel kernel;
        ~kernel_objects_guard()
        {
            clReleaseKernel(kernel);
            clReleaseProgram(program);
        }
    } kernel_objects = { program, kernel };

    // Owns the output buffer of one launch; release() hands back the status
    // of the explicit release performed on the success path.
    struct buffer_guard
    {
        cl_mem buffer;
        ~buffer_guard()
        {
            if (buffer != NULL) clReleaseMemObject(buffer);
        }
        cl_int release()
        {
            const cl_mem released = buffer;
            buffer = NULL;
            return clReleaseMemObject(released);
        }
    };

    size_t max_work_group_size;
    error = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_work_group_size), &max_work_group_size, NULL);
    RETURN_ON_CL_ERROR(error, "clGetKernelWorkGroupInfo")

    if (options.barrier == barrier_type::local)
    {
        cl_ulong kernel_local_mem_size;
        error = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(kernel_local_mem_size), &kernel_local_mem_size, NULL);
        RETURN_ON_CL_ERROR(error, "clGetKernelWorkGroupInfo")

        cl_ulong device_local_mem_size;
        error = clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(device_local_mem_size), &device_local_mem_size, NULL);
        RETURN_ON_CL_ERROR(error, "clGetDeviceInfo")

        // One cl_long of local memory is needed per work-item. The comparison
        // is done in cl_ulong (size_t may be a narrower or different type)
        // and the subtraction is guarded against underflow.
        const cl_ulong free_local_mem = (device_local_mem_size > kernel_local_mem_size)
            ? (device_local_mem_size - kernel_local_mem_size)
            : 0;
        const cl_ulong max_local_elements = free_local_mem / sizeof(cl_long);
        if (max_local_elements < max_work_group_size)
        {
            max_work_group_size = static_cast<size_t>(max_local_elements);
        }
    }

    // uniform_int_distribution(a, b) requires a <= b.
    if (max_work_group_size == 0)
    {
        RETURN_ON_ERROR_MSG(-1, "No valid work-group size available for the kernel")
    }
    if (options.max_count == 0)
    {
        RETURN_ON_ERROR_MSG(-1, "Maximum global size must be at least 1")
    }

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<size_t> global_size_dis(1, options.max_count);
    std::uniform_int_distribution<size_t> local_size_dis(1, max_work_group_size);
    std::uniform_int_distribution<int> iter_dis(0, 20);

    for (size_t iteration = 0; iteration < options.num_tests; iteration++)
    {
        const size_t global_size = global_size_dis(gen);
        const size_t local_size = local_size_dis(gen);
        const size_t count = global_size;

        // The kernel loops over [iter_lo, iter_hi); iter_lo <= 0 <= iter_hi.
        const int iter_lo = -iter_dis(gen);
        const int iter_hi = +iter_dis(gen);

        cl_mem output_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_long) * count, NULL, &error);
        RETURN_ON_CL_ERROR(error, "clCreateBuffer")
        buffer_guard output_guard = { output_buffer };

        error = clSetKernelArg(kernel, 0, sizeof(iter_lo), &iter_lo);
        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
        error = clSetKernelArg(kernel, 1, sizeof(iter_hi), &iter_hi);
        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
        error = clSetKernelArg(kernel, 2, sizeof(output_buffer), &output_buffer);
        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
        if (options.barrier == barrier_type::local)
        {
            error = clSetKernelArg(kernel, 3, sizeof(cl_long) * local_size, NULL);
            RETURN_ON_CL_ERROR(error, "clSetKernelArg")
        }

        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
        RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")

        // Blocking read of the kernel's results.
        std::vector<cl_long> output(count);
        error = clEnqueueReadBuffer(
            queue, output_buffer, CL_TRUE,
            0, sizeof(cl_long) * count,
            static_cast<void *>(output.data()),
            0, NULL, NULL
        );
        RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")

        error = output_guard.release();
        RETURN_ON_CL_ERROR(error, "clReleaseMemObject")

        // Each element starts at its global id and receives two further
        // additions of that id per loop iteration.
        const cl_long iterations = static_cast<cl_long>(iter_hi) - static_cast<cl_long>(iter_lo);
        for (size_t gid = 0; gid < count; gid++)
        {
            // Compare in cl_long: `long` is only 32 bits wide on some platforms.
            const cl_long value = output[gid];
            const cl_long expected = static_cast<cl_long>(gid) + 2 * static_cast<cl_long>(gid) * iterations;

            if (value != expected)
            {
                RETURN_ON_ERROR_MSG(-1,
                    "Element %lu has incorrect value. Expected: %lld, got: %lld",
                    static_cast<unsigned long>(gid),
                    static_cast<long long>(expected),
                    static_cast<long long>(value)
                );
            }
        }
    }

    return error;
}
|
||||
|
||||
// Sub-group barrier test with a global-memory fence: 1000 randomized
// launches, global size limited by num_elements.
AUTO_TEST_CASE(test_sub_group_barrier_global)
(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
{
    test_options global_fence_options;
    global_fence_options.max_count = num_elements;
    global_fence_options.num_tests = 1000;
    global_fence_options.barrier = barrier_type::global;

    const int result = test(device, context, queue, global_fence_options);
    return result;
}
|
||||
|
||||
// Sub-group barrier test with a local-memory fence: 1000 randomized
// launches, global size limited by num_elements.
AUTO_TEST_CASE(test_sub_group_barrier_local)
(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
{
    test_options local_fence_options;
    local_fence_options.max_count = num_elements;
    local_fence_options.num_tests = 1000;
    local_fence_options.barrier = barrier_type::local;

    const int result = test(device, context, queue, local_fence_options);
    return result;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif // TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_TEST_SUB_GROUP_BARRIER_HPP
|
||||
@@ -0,0 +1,330 @@
|
||||
//
|
||||
// Copyright (c) 2017 The Khronos Group Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
#ifndef TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_TEST_WORK_GROUP_BARRIER_HPP
|
||||
#define TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_TEST_WORK_GROUP_BARRIER_HPP
|
||||
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <random>
|
||||
|
||||
// Common for all OpenCL C++ tests
|
||||
#include "../common.hpp"
|
||||
|
||||
|
||||
namespace test_work_group_barrier {
|
||||
|
||||
// Memory scope that the tested work-group barrier fences.
enum class barrier_type
{
    local,  // barrier with a local-memory fence
    global  // barrier with a global-memory fence
};
|
||||
|
||||
struct test_options
|
||||
{
|
||||
barrier_type barrier;
|
||||
size_t max_count;
|
||||
size_t num_tests;
|
||||
};
|
||||
|
||||
// Device-side helpers shared by every kernel variant generated in this file.
// They map a work-item to a neighbour inside its own work-group.
const std::string source_common(R"(
    // Circular shift of local ids
    size_t get_shifted_local_id(int local_id_delta)
    {
        const int local_size = (int)get_local_size(0);
        return (((int)get_local_id(0) + local_id_delta) % local_size + local_size) % local_size;
    }

    // Get global ids from shifted local ids
    size_t get_shifted_global_id(int local_id_delta)
    {
        return get_group_id(0) * get_enqueued_local_size(0) + get_shifted_local_id(local_id_delta);
    }
)");
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
|
||||
// -----------------------------------------------------------------------------------
|
||||
#if defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
|
||||
std::string generate_source(test_options options)
|
||||
{
|
||||
std::stringstream s;
|
||||
s << source_common;
|
||||
if (options.barrier == barrier_type::global)
|
||||
{
|
||||
s << R"(
|
||||
kernel void test(const int iter_lo, const int iter_hi, global long *output)
|
||||
{
|
||||
const size_t gid = get_shifted_global_id(0);
|
||||
|
||||
output[gid] = gid;
|
||||
work_group_barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
|
||||
for (int i = iter_lo; i < iter_hi; i++)
|
||||
{
|
||||
const size_t other_gid = get_shifted_global_id(i);
|
||||
|
||||
output[other_gid] += other_gid;
|
||||
work_group_barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
|
||||
output[gid] += gid;
|
||||
work_group_barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
}
|
||||
}
|
||||
)";
|
||||
}
|
||||
else if (options.barrier == barrier_type::local)
|
||||
{
|
||||
s << R"(
|
||||
kernel void test(const int iter_lo, const int iter_hi, global long *output, local long *values)
|
||||
{
|
||||
const size_t gid = get_shifted_global_id(0);
|
||||
const size_t lid = get_shifted_local_id(0);
|
||||
|
||||
values[lid] = gid;
|
||||
work_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (int i = iter_lo; i < iter_hi; i++)
|
||||
{
|
||||
const size_t other_lid = get_shifted_local_id(i);
|
||||
const size_t other_gid = get_shifted_global_id(i);
|
||||
|
||||
values[other_lid] += other_gid;
|
||||
work_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
values[lid] += gid;
|
||||
work_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
output[gid] = values[lid];
|
||||
}
|
||||
)";
|
||||
}
|
||||
|
||||
return s.str();
|
||||
}
|
||||
#else
|
||||
std::string generate_source(test_options options)
|
||||
{
|
||||
std::stringstream s;
|
||||
s << R"(
|
||||
#include <opencl_memory>
|
||||
#include <opencl_work_item>
|
||||
#include <opencl_synchronization>
|
||||
|
||||
using namespace cl;
|
||||
|
||||
)";
|
||||
s << source_common;
|
||||
|
||||
if (options.barrier == barrier_type::global)
|
||||
{
|
||||
s << R"(
|
||||
kernel void test(const int iter_lo, const int iter_hi, global_ptr<long[]> output)
|
||||
{
|
||||
const size_t gid = get_shifted_global_id(0);
|
||||
|
||||
output[gid] = gid;
|
||||
work_group_barrier(mem_fence::global);
|
||||
|
||||
for (int i = iter_lo; i < iter_hi; i++)
|
||||
{
|
||||
const size_t other_gid = get_shifted_global_id(i);
|
||||
|
||||
output[other_gid] += other_gid;
|
||||
work_group_barrier(mem_fence::global);
|
||||
|
||||
output[gid] += gid;
|
||||
work_group_barrier(mem_fence::global);
|
||||
}
|
||||
}
|
||||
)";
|
||||
}
|
||||
else if (options.barrier == barrier_type::local)
|
||||
{
|
||||
s << R"(
|
||||
kernel void test(const int iter_lo, const int iter_hi, global_ptr<long[]> output, local_ptr<long[]> values)
|
||||
{
|
||||
const size_t gid = get_shifted_global_id(0);
|
||||
const size_t lid = get_shifted_local_id(0);
|
||||
|
||||
values[lid] = gid;
|
||||
work_group_barrier(mem_fence::local);
|
||||
|
||||
for (int i = iter_lo; i < iter_hi; i++)
|
||||
{
|
||||
const size_t other_lid = get_shifted_local_id(i);
|
||||
const size_t other_gid = get_shifted_global_id(i);
|
||||
|
||||
values[other_lid] += other_gid;
|
||||
work_group_barrier(mem_fence::local);
|
||||
|
||||
values[lid] += gid;
|
||||
work_group_barrier(mem_fence::local);
|
||||
}
|
||||
|
||||
output[gid] = values[lid];
|
||||
}
|
||||
)";
|
||||
}
|
||||
|
||||
return s.str();
|
||||
}
|
||||
#endif
|
||||
|
||||
// Builds the work-group barrier kernel selected by `options.barrier`,
// launches it `options.num_tests` times with random global/local sizes and
// iteration ranges, and validates every output element.
//
// device, context, queue - OpenCL objects to run on.
// options                - barrier kind, maximum global size and number of
//                          randomized launches.
//
// Returns CL_SUCCESS when all launches produce the expected values,
// otherwise an error code. The kernel, the program and the per-launch output
// buffer are released on every exit path after they have been created.
int test(cl_device_id device, cl_context context, cl_command_queue queue, test_options options)
{
    int error = CL_SUCCESS;

    cl_program program;
    cl_kernel kernel;

    std::string kernel_name = "test";
    std::string source = generate_source(options);

// -----------------------------------------------------------------------------------
// ------------- ONLY FOR OPENCL 22 CONFORMANCE TEST 22 DEVELOPMENT ------------------
// -----------------------------------------------------------------------------------
// Only OpenCL C++ to SPIR-V compilation
#if defined(DEVELOPMENT) && defined(ONLY_SPIRV_COMPILATION)
    error = create_opencl_kernel(
        context, &program, &kernel,
        source, kernel_name
    );
    RETURN_ON_ERROR(error)
    return error;
// Use OpenCL C kernels instead of OpenCL C++ kernels (test C++ host code)
#elif defined(DEVELOPMENT) && defined(USE_OPENCLC_KERNELS)
    error = create_opencl_kernel(
        context, &program, &kernel,
        source, kernel_name, "-cl-std=CL2.0", false
    );
    RETURN_ON_ERROR(error)
// Normal run
#else
    error = create_opencl_kernel(
        context, &program, &kernel,
        source, kernel_name
    );
    RETURN_ON_ERROR(error)
#endif

    // Releases the kernel and the program however this function is left;
    // the RETURN_ON_* macros below return early.
    struct kernel_objects_guard
    {
        cl_program program;
        cl_kernel kernel;
        ~kernel_objects_guard()
        {
            clReleaseKernel(kernel);
            clReleaseProgram(program);
        }
    } kernel_objects = { program, kernel };

    // Owns the output buffer of one launch; release() hands back the status
    // of the explicit release performed on the success path.
    struct buffer_guard
    {
        cl_mem buffer;
        ~buffer_guard()
        {
            if (buffer != NULL) clReleaseMemObject(buffer);
        }
        cl_int release()
        {
            const cl_mem released = buffer;
            buffer = NULL;
            return clReleaseMemObject(released);
        }
    };

    size_t max_work_group_size;
    error = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_work_group_size), &max_work_group_size, NULL);
    RETURN_ON_CL_ERROR(error, "clGetKernelWorkGroupInfo")

    if (options.barrier == barrier_type::local)
    {
        cl_ulong kernel_local_mem_size;
        error = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(kernel_local_mem_size), &kernel_local_mem_size, NULL);
        RETURN_ON_CL_ERROR(error, "clGetKernelWorkGroupInfo")

        cl_ulong device_local_mem_size;
        error = clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(device_local_mem_size), &device_local_mem_size, NULL);
        RETURN_ON_CL_ERROR(error, "clGetDeviceInfo")

        // One cl_long of local memory is needed per work-item. The comparison
        // is done in cl_ulong (size_t may be a narrower or different type)
        // and the subtraction is guarded against underflow.
        const cl_ulong free_local_mem = (device_local_mem_size > kernel_local_mem_size)
            ? (device_local_mem_size - kernel_local_mem_size)
            : 0;
        const cl_ulong max_local_elements = free_local_mem / sizeof(cl_long);
        if (max_local_elements < max_work_group_size)
        {
            max_work_group_size = static_cast<size_t>(max_local_elements);
        }
    }

    // uniform_int_distribution(a, b) requires a <= b.
    if (max_work_group_size == 0)
    {
        RETURN_ON_ERROR_MSG(-1, "No valid work-group size available for the kernel")
    }
    if (options.max_count == 0)
    {
        RETURN_ON_ERROR_MSG(-1, "Maximum global size must be at least 1")
    }

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<size_t> global_size_dis(1, options.max_count);
    std::uniform_int_distribution<size_t> local_size_dis(1, max_work_group_size);
    std::uniform_int_distribution<int> iter_dis(0, 20);

    for (size_t iteration = 0; iteration < options.num_tests; iteration++)
    {
        const size_t global_size = global_size_dis(gen);
        const size_t local_size = local_size_dis(gen);
        const size_t count = global_size;

        // The kernel loops over [iter_lo, iter_hi); iter_lo <= 0 <= iter_hi.
        const int iter_lo = -iter_dis(gen);
        const int iter_hi = +iter_dis(gen);

        cl_mem output_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_long) * count, NULL, &error);
        RETURN_ON_CL_ERROR(error, "clCreateBuffer")
        buffer_guard output_guard = { output_buffer };

        error = clSetKernelArg(kernel, 0, sizeof(iter_lo), &iter_lo);
        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
        error = clSetKernelArg(kernel, 1, sizeof(iter_hi), &iter_hi);
        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
        error = clSetKernelArg(kernel, 2, sizeof(output_buffer), &output_buffer);
        RETURN_ON_CL_ERROR(error, "clSetKernelArg")
        if (options.barrier == barrier_type::local)
        {
            error = clSetKernelArg(kernel, 3, sizeof(cl_long) * local_size, NULL);
            RETURN_ON_CL_ERROR(error, "clSetKernelArg")
        }

        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
        RETURN_ON_CL_ERROR(error, "clEnqueueNDRangeKernel")

        // Blocking read of the kernel's results.
        std::vector<cl_long> output(count);
        error = clEnqueueReadBuffer(
            queue, output_buffer, CL_TRUE,
            0, sizeof(cl_long) * count,
            static_cast<void *>(output.data()),
            0, NULL, NULL
        );
        RETURN_ON_CL_ERROR(error, "clEnqueueReadBuffer")

        error = output_guard.release();
        RETURN_ON_CL_ERROR(error, "clReleaseMemObject")

        // Each element starts at its global id and receives two further
        // additions of that id per loop iteration.
        const cl_long iterations = static_cast<cl_long>(iter_hi) - static_cast<cl_long>(iter_lo);
        for (size_t gid = 0; gid < count; gid++)
        {
            // Compare in cl_long: `long` is only 32 bits wide on some platforms.
            const cl_long value = output[gid];
            const cl_long expected = static_cast<cl_long>(gid) + 2 * static_cast<cl_long>(gid) * iterations;

            if (value != expected)
            {
                RETURN_ON_ERROR_MSG(-1,
                    "Element %lu has incorrect value. Expected: %lld, got: %lld",
                    static_cast<unsigned long>(gid),
                    static_cast<long long>(expected),
                    static_cast<long long>(value)
                );
            }
        }
    }

    return error;
}
|
||||
|
||||
// Test case: runs the shared work-group barrier test with barrier_type::global.
// num_elements becomes the upper bound for the randomly chosen global size.
AUTO_TEST_CASE(test_work_group_barrier_global)
(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
{
    test_options global_opts;

    // Largest global work size a single randomized run may pick.
    global_opts.max_count = num_elements;
    // Number of randomized runs executed by test().
    global_opts.num_tests = 1000;
    // Select the global-barrier variant of the kernel.
    global_opts.barrier = barrier_type::global;

    return test(device, context, queue, global_opts);
}
|
||||
|
||||
// Test case: runs the shared work-group barrier test with barrier_type::local.
// For this variant test() also passes a local-memory kernel argument and caps
// the work-group size by the device's available local memory.
AUTO_TEST_CASE(test_work_group_barrier_local)
(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
{
    test_options local_opts;

    // Largest global work size a single randomized run may pick.
    local_opts.max_count = num_elements;
    // Number of randomized runs executed by test().
    local_opts.num_tests = 1000;
    // Select the local-barrier variant of the kernel.
    local_opts.barrier = barrier_type::local;

    return test(device, context, queue, local_opts);
}
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif // TEST_CONFORMANCE_CLCPP_SYNCHRONIZATION_TEST_WORK_GROUP_BARRIER_HPP
|
||||
Reference in New Issue
Block a user