From 6b36f645b835f8054422cad3f050d18935be11be Mon Sep 17 00:00:00 2001 From: Zakaria Taha <45341452+zakaria6868@users.noreply.github.com> Date: Thu, 18 Mar 2021 16:27:59 +0200 Subject: [PATCH] Add tests to proposed new builtin async_copy functions with a bug fix. (#725) * Add tests to proposed new builtin async_copy functions with a bug fix. * Revert "Add tests to proposed new builtin async_copy functions with a bug fix." This reverts commit 7d0f16d014d228c327daf27464b27e02267f9aef. * Add tests to proposed new builtin async_copy functions. * Added is_extension_available to check if an extension is available. * Added is extension available for test_async_copy_fence. * fix build issues on windows. * include algorithms.h for async copy 2D/3D. * adding algorithms header. * Fix numLines - 1 in maxTotalPlanesIn/Out. * fix formatting violations. * fixed formatting issue. --- test_conformance/basic/CMakeLists.txt | 3 + test_conformance/basic/main.cpp | 14 +- test_conformance/basic/procs.h | 36 + test_conformance/basic/test_async_copy2D.cpp | 449 ++++++++++ test_conformance/basic/test_async_copy3D.cpp | 546 ++++++++++++ .../basic/test_async_copy_fence.cpp | 812 ++++++++++++++++++ 6 files changed, 1858 insertions(+), 2 deletions(-) create mode 100644 test_conformance/basic/test_async_copy2D.cpp create mode 100644 test_conformance/basic/test_async_copy3D.cpp create mode 100644 test_conformance/basic/test_async_copy_fence.cpp diff --git a/test_conformance/basic/CMakeLists.txt b/test_conformance/basic/CMakeLists.txt index 27178246..c5c4b5f0 100644 --- a/test_conformance/basic/CMakeLists.txt +++ b/test_conformance/basic/CMakeLists.txt @@ -37,6 +37,9 @@ set(${MODULE_NAME}_SOURCES test_work_item_functions.cpp test_astype.cpp test_async_copy.cpp + test_async_copy2D.cpp + test_async_copy3D.cpp + test_async_copy_fence.cpp test_sizeof.cpp test_vector_creation.cpp test_vector_swizzle.cpp diff --git a/test_conformance/basic/main.cpp b/test_conformance/basic/main.cpp index d1a35fae..86c3cec3 100644 --- a/test_conformance/basic/main.cpp +++ b/test_conformance/basic/main.cpp @@ -113,14 +113,24 @@ test_definition test_list[] = { ADD_TEST(async_copy_local_to_global), ADD_TEST(async_strided_copy_global_to_local), ADD_TEST(async_strided_copy_local_to_global), + ADD_TEST(async_copy_global_to_local2D), + ADD_TEST(async_copy_local_to_global2D), + ADD_TEST(async_copy_global_to_local3D), + ADD_TEST(async_copy_local_to_global3D), + ADD_TEST(async_work_group_copy_fence_import_after_export_aliased_local), + ADD_TEST(async_work_group_copy_fence_import_after_export_aliased_global), + ADD_TEST( + async_work_group_copy_fence_import_after_export_aliased_global_and_local), + ADD_TEST(async_work_group_copy_fence_export_after_import_aliased_local), + ADD_TEST(async_work_group_copy_fence_export_after_import_aliased_global), + ADD_TEST( + async_work_group_copy_fence_export_after_import_aliased_global_and_local), ADD_TEST(prefetch), - ADD_TEST(kernel_call_kernel_function), ADD_TEST(host_numeric_constants), ADD_TEST(kernel_numeric_constants), ADD_TEST(kernel_limit_constants), ADD_TEST(kernel_preprocessor_macros), - ADD_TEST(parameter_types), ADD_TEST(vector_creation), ADD_TEST(vector_swizzle), diff --git a/test_conformance/basic/procs.h b/test_conformance/basic/procs.h index bdb7d6a4..4a01a8cb 100644 --- a/test_conformance/basic/procs.h +++ b/test_conformance/basic/procs.h @@ -115,6 +115,42 @@ extern int test_async_copy_global_to_local(cl_device_id deviceID, cl_contex extern int test_async_copy_local_to_global(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); extern int test_async_strided_copy_global_to_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); extern int test_async_strided_copy_local_to_global(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); +extern int test_async_copy_global_to_local2D(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_async_copy_local_to_global2D(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_async_copy_global_to_local3D(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_async_copy_local_to_global3D(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_async_work_group_copy_fence_import_after_export_aliased_local( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements); +extern int test_async_work_group_copy_fence_import_after_export_aliased_global( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements); +extern int +test_async_work_group_copy_fence_import_after_export_aliased_global_and_local( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements); +extern int test_async_work_group_copy_fence_export_after_import_aliased_local( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements); +extern int test_async_work_group_copy_fence_export_after_import_aliased_global( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements); +extern int +test_async_work_group_copy_fence_export_after_import_aliased_global_and_local( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements); extern int test_prefetch(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); extern int test_host_numeric_constants(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); diff --git a/test_conformance/basic/test_async_copy2D.cpp b/test_conformance/basic/test_async_copy2D.cpp new file mode 100644 index 00000000..2b534497 --- /dev/null +++ b/test_conformance/basic/test_async_copy2D.cpp @@ -0,0 +1,449 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#include "../../test_common/harness/compat.h" + +#include +#include +#include +#include +#include +#include + +#include "../../test_common/harness/conversions.h" +#include "procs.h" + +static const char *async_global_to_local_kernel2D = + "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n" + "%s\n" // optional pragma string + "__kernel void test_fn( const __global %s *src, __global %s *dst, __local " + "%s *localBuffer, int numElementsPerLine, int lineCopiesPerWorkgroup, int " + "lineCopiesPerWorkItem, int srcStride, int dstStride )\n" + "{\n" + " int i, j;\n" + // Zero the local storage first + " for(i=0; i max_global_mem_size / 2) + max_alloc_size = max_global_mem_size / 2; + + unsigned int num_of_compute_devices; + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(num_of_compute_devices), + &num_of_compute_devices, NULL); + test_error(error, + "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS failed."); + + char programSource[4096]; + programSource[0] = 0; + char *programPtr; + + sprintf(programSource, kernelCode, + vecType == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" + : "", + vecNameString, vecNameString, vecNameString, vecNameString, + get_explicit_type_name(vecType), vecNameString, vecNameString); + // log_info("program: %s\n", programSource); + programPtr = programSource; + + error = create_single_kernel_helper(context, &program, &kernel, 1, + (const char **)&programPtr, "test_fn"); + test_error(error, "Unable to create testing kernel"); + + size_t max_workgroup_size; + error = clGetKernelWorkGroupInfo( + kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size), + &max_workgroup_size, NULL); + test_error( + error, + "clGetKernelWorkGroupInfo failed for CL_KERNEL_WORK_GROUP_SIZE."); + + size_t max_local_workgroup_size[3]; + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES, + sizeof(max_local_workgroup_size), + max_local_workgroup_size, NULL); + test_error(error, + "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES"); + + // Pick the minimum of the device and the kernel + if (max_workgroup_size > max_local_workgroup_size[0]) + max_workgroup_size = max_local_workgroup_size[0]; + + size_t numElementsPerLine = 10; + size_t lineCopiesPerWorkItem = 13; + elementSize = + get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize); + size_t localStorageSpacePerWorkitem = lineCopiesPerWorkItem * elementSize + * (numElementsPerLine + (localIsDst ? dstStride : srcStride)); + size_t maxLocalWorkgroupSize = + (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem); + + // Calculation can return 0 on embedded devices due to 1KB local mem limit + if (maxLocalWorkgroupSize == 0) + { + maxLocalWorkgroupSize = 1; + } + + size_t localWorkgroupSize = maxLocalWorkgroupSize; + if (maxLocalWorkgroupSize > max_workgroup_size) + localWorkgroupSize = max_workgroup_size; + + size_t maxTotalLinesIn = (max_alloc_size / elementSize + srcStride) + / (numElementsPerLine + srcStride); + size_t maxTotalLinesOut = (max_alloc_size / elementSize + dstStride) + / (numElementsPerLine + dstStride); + size_t maxTotalLines = (std::min)(maxTotalLinesIn, maxTotalLinesOut); + size_t maxLocalWorkgroups = + maxTotalLines / (localWorkgroupSize * lineCopiesPerWorkItem); + + size_t localBufferSize = localWorkgroupSize * localStorageSpacePerWorkitem + - (localIsDst ? dstStride : srcStride); + size_t numberOfLocalWorkgroups = (std::min)(1111, (int)maxLocalWorkgroups); + size_t totalLines = + numberOfLocalWorkgroups * localWorkgroupSize * lineCopiesPerWorkItem; + size_t inBufferSize = elementSize + * (totalLines * numElementsPerLine + (totalLines - 1) * srcStride); + size_t outBufferSize = elementSize + * (totalLines * numElementsPerLine + (totalLines - 1) * dstStride); + size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize; + + inBuffer = (void *)malloc(inBufferSize); + outBuffer = (void *)malloc(outBufferSize); + outBufferCopy = (void *)malloc(outBufferSize); + + cl_int lineCopiesPerWorkItemInt, numElementsPerLineInt, + lineCopiesPerWorkgroup; + lineCopiesPerWorkItemInt = (int)lineCopiesPerWorkItem; + numElementsPerLineInt = (int)numElementsPerLine; + lineCopiesPerWorkgroup = (int)(lineCopiesPerWorkItem * localWorkgroupSize); + + log_info( + "Global: %d, local %d, local buffer %db, global in buffer %db, " + "global out buffer %db, each work group will copy %d lines and each " + "work item item will copy %d lines.\n", + (int)globalWorkgroupSize, (int)localWorkgroupSize, (int)localBufferSize, + (int)inBufferSize, (int)outBufferSize, lineCopiesPerWorkgroup, + lineCopiesPerWorkItemInt); + + threads[0] = globalWorkgroupSize; + localThreads[0] = localWorkgroupSize; + + d = init_genrand(gRandomSeed); + generate_random_data( + vecType, inBufferSize / get_explicit_type_size(vecType), d, inBuffer); + generate_random_data( + vecType, outBufferSize / get_explicit_type_size(vecType), d, outBuffer); + free_mtdata(d); + d = NULL; + memcpy(outBufferCopy, outBuffer, outBufferSize); + + streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, inBufferSize, + inBuffer, &error); + test_error(error, "Unable to create input buffer"); + streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, outBufferSize, + outBuffer, &error); + test_error(error, "Unable to create output buffer"); + + error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]); + test_error(error, "Unable to set kernel argument"); + error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]); + test_error(error, "Unable to set kernel argument"); + error = clSetKernelArg(kernel, 2, localBufferSize, NULL); + test_error(error, "Unable to set kernel argument"); + error = clSetKernelArg(kernel, 3, sizeof(numElementsPerLineInt), + &numElementsPerLineInt); + test_error(error, "Unable to set kernel argument"); + error = clSetKernelArg(kernel, 4, sizeof(lineCopiesPerWorkgroup), + &lineCopiesPerWorkgroup); + test_error(error, "Unable to set kernel argument"); + error = clSetKernelArg(kernel, 5, sizeof(lineCopiesPerWorkItemInt), + &lineCopiesPerWorkItemInt); + test_error(error, "Unable to set kernel argument"); + error = clSetKernelArg(kernel, 6, sizeof(srcStride), &srcStride); + test_error(error, "Unable to set kernel argument"); + error = clSetKernelArg(kernel, 7, sizeof(dstStride), &dstStride); + test_error(error, "Unable to set kernel argument"); + + // Enqueue + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, + localThreads, 0, NULL, NULL); + test_error(error, "Unable to queue kernel"); + + // Read + error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, outBufferSize, + outBuffer, 0, NULL, NULL); + test_error(error, "Unable to read results"); + + // Verify + int failuresPrinted = 0; + // Verify + size_t typeSize = get_explicit_type_size(vecType) * vecSize; + for (int i = 0; + i < (int)globalWorkgroupSize * lineCopiesPerWorkItem * elementSize; + i += elementSize) + { + for (int j = 0; j < (int)numElementsPerLine * elementSize; + j += elementSize) + { + int inIdx = i * (numElementsPerLine + srcStride) + j; + int outIdx = i * (numElementsPerLine + dstStride) + j; + if (memcmp(((char *)inBuffer) + inIdx, ((char *)outBuffer) + outIdx, + typeSize) + != 0) + { + unsigned char *inchar = (unsigned char *)inBuffer + inIdx; + unsigned char *outchar = (unsigned char *)outBuffer + outIdx; + char values[4096]; + values[0] = 0; + + if (failuresPrinted == 0) + { + // Print first failure message + log_error("ERROR: Results of copy did not validate!\n"); + } + sprintf(values + strlen(values), "%d -> [", inIdx); + for (int k = 0; k < (int)elementSize; k++) + sprintf(values + strlen(values), "%2x ", inchar[k]); + sprintf(values + strlen(values), "] != ["); + for (int k = 0; k < (int)elementSize; k++) + sprintf(values + strlen(values), "%2x ", outchar[k]); + sprintf(values + strlen(values), "]"); + log_error("%s\n", values); + failuresPrinted++; + } + + if (failuresPrinted > 5) + { + log_error("Not printing further failures...\n"); + return -1; + } + } + if (i < (int)(globalWorkgroupSize * lineCopiesPerWorkItem - 1) + * elementSize) + { + int outIdx = i * (numElementsPerLine + dstStride) + + numElementsPerLine * elementSize; + if (memcmp(((char *)outBuffer) + outIdx, + ((char *)outBufferCopy) + outIdx, + dstStride * elementSize) + != 0) + { + if (failuresPrinted == 0) + { + // Print first failure message + log_error("ERROR: Results of copy did not validate!\n"); + } + log_error( + "2D copy corrupted data in output buffer in the stride " + "offset of line %d\n", + i); + failuresPrinted++; + } + if (failuresPrinted > 5) + { + log_error("Not printing further failures...\n"); + return -1; + } + } + } + + free(inBuffer); + free(outBuffer); + free(outBufferCopy); + + return failuresPrinted ? -1 : 0; +} + +int test_copy2D_all_types(cl_device_id deviceID, cl_context context, + cl_command_queue queue, const char *kernelCode, + bool localIsDst) +{ + ExplicitType vecType[] = { + kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, + kULong, kFloat, kDouble, kNumExplicitTypes + }; + unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 }; + unsigned int smallTypesStrideSizes[] = { 0, 10, 100 }; + unsigned int size, typeIndex, srcStride, dstStride; + + int errors = 0; + + for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++) + { + if (vecType[typeIndex] == kDouble + && !is_extension_available(deviceID, "cl_khr_fp64")) + continue; + + if ((vecType[typeIndex] == kLong || vecType[typeIndex] == kULong) + && !gHasLong) + continue; + + for (size = 0; vecSizes[size] != 0; size++) + { + if (get_explicit_type_size(vecType[typeIndex]) * vecSizes[size] + <= 2) // small type + { + for (srcStride = 0; srcStride < sizeof(smallTypesStrideSizes) + / sizeof(smallTypesStrideSizes[0]); + srcStride++) + { + for (dstStride = 0; + dstStride < sizeof(smallTypesStrideSizes) + / sizeof(smallTypesStrideSizes[0]); + dstStride++) + { + if (test_copy2D(deviceID, context, queue, kernelCode, + vecType[typeIndex], vecSizes[size], + smallTypesStrideSizes[srcStride], + smallTypesStrideSizes[dstStride], + localIsDst)) + { + errors++; + } + } + } + } + // not a small type, check only zero stride + else if (test_copy2D(deviceID, context, queue, kernelCode, + vecType[typeIndex], vecSizes[size], 0, 0, + localIsDst)) + { + errors++; + } + } + } + if (errors) return -1; + return 0; +} + +int test_async_copy_global_to_local2D(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) +{ + return test_copy2D_all_types(deviceID, context, queue, + async_global_to_local_kernel2D, true); +} + +int test_async_copy_local_to_global2D(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) +{ + return test_copy2D_all_types(deviceID, context, queue, + async_local_to_global_kernel2D, false); +} diff --git a/test_conformance/basic/test_async_copy3D.cpp b/test_conformance/basic/test_async_copy3D.cpp new file mode 100644 index 00000000..af10191f --- /dev/null +++ b/test_conformance/basic/test_async_copy3D.cpp @@ -0,0 +1,546 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#include "../../test_common/harness/compat.h" + +#include +#include +#include +#include +#include +#include + +#include "../../test_common/harness/conversions.h" +#include "procs.h" + +static const char *async_global_to_local_kernel3D = + "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n" + "%s\n" // optional pragma string + "__kernel void test_fn( const __global %s *src, __global %s *dst, __local " + "%s *localBuffer, int numElementsPerLine, int numLines, int " + "planesCopiesPerWorkgroup, int planesCopiesPerWorkItem, int srcLineStride, " + "int dstLineStride, int srcPlaneStride, int dstPlaneStride )\n" + "{\n" + " int i, j, k;\n" + // Zero the local storage first + " for(i=0; i max_global_mem_size / 2) + max_alloc_size = max_global_mem_size / 2; + + unsigned int num_of_compute_devices; + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(num_of_compute_devices), + &num_of_compute_devices, NULL); + test_error(error, + "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS failed."); + + char programSource[4096]; + programSource[0] = 0; + char *programPtr; + + sprintf(programSource, kernelCode, + vecType == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" + : "", + vecNameString, vecNameString, vecNameString, vecNameString, + get_explicit_type_name(vecType), vecNameString, vecNameString); + // log_info("program: %s\n", programSource); + programPtr = programSource; + + error = create_single_kernel_helper(context, &program, &kernel, 1, + (const char **)&programPtr, "test_fn"); + test_error(error, "Unable to create testing kernel"); + + size_t max_workgroup_size; + error = clGetKernelWorkGroupInfo( + kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size), + &max_workgroup_size, NULL); + test_error( + error, + "clGetKernelWorkGroupInfo failed for CL_KERNEL_WORK_GROUP_SIZE."); + + size_t max_local_workgroup_size[3]; + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES, + sizeof(max_local_workgroup_size), + max_local_workgroup_size, NULL); + test_error(error, + "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES"); + + // Pick the minimum of the device and the kernel + if (max_workgroup_size > max_local_workgroup_size[0]) + max_workgroup_size = max_local_workgroup_size[0]; + + size_t numElementsPerLine = 10; + size_t numLines = 13; + size_t planesCopiesPerWorkItem = 2; + elementSize = + get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize); + size_t localStorageSpacePerWorkitem = elementSize + * (planesCopiesPerWorkItem + * (numLines * numElementsPerLine + + numLines * (localIsDst ? dstLineStride : srcLineStride) + + (localIsDst ? dstPlaneStride : srcPlaneStride))); + size_t maxLocalWorkgroupSize = + (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem); + + // Calculation can return 0 on embedded devices due to 1KB local mem limit + if (maxLocalWorkgroupSize == 0) + { + maxLocalWorkgroupSize = 1; + } + + size_t localWorkgroupSize = maxLocalWorkgroupSize; + if (maxLocalWorkgroupSize > max_workgroup_size) + localWorkgroupSize = max_workgroup_size; + + size_t maxTotalPlanesIn = ((max_alloc_size / elementSize) + srcPlaneStride) + / ((numLines * numElementsPerLine + numLines * srcLineStride) + + srcPlaneStride); + size_t maxTotalPlanesOut = ((max_alloc_size / elementSize) + dstPlaneStride) + / ((numLines * numElementsPerLine + numLines * dstLineStride) + + dstPlaneStride); + size_t maxTotalPlanes = (std::min)(maxTotalPlanesIn, maxTotalPlanesOut); + size_t maxLocalWorkgroups = + maxTotalPlanes / (localWorkgroupSize * planesCopiesPerWorkItem); + + size_t localBufferSize = localWorkgroupSize * localStorageSpacePerWorkitem + - (localIsDst ? dstPlaneStride : srcPlaneStride); + size_t numberOfLocalWorkgroups = (std::min)(1111, (int)maxLocalWorkgroups); + size_t totalPlanes = + numberOfLocalWorkgroups * localWorkgroupSize * planesCopiesPerWorkItem; + size_t inBufferSize = elementSize + * (totalPlanes + * (numLines * numElementsPerLine + numLines * srcLineStride) + + (totalPlanes - 1) * srcPlaneStride); + size_t outBufferSize = elementSize + * (totalPlanes + * (numLines * numElementsPerLine + numLines * dstLineStride) + + (totalPlanes - 1) * dstPlaneStride); + size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize; + + inBuffer = (void *)malloc(inBufferSize); + outBuffer = (void *)malloc(outBufferSize); + outBufferCopy = (void *)malloc(outBufferSize); + + cl_int planesCopiesPerWorkItemInt, numElementsPerLineInt, numLinesInt, + planesCopiesPerWorkgroup; + planesCopiesPerWorkItemInt = (int)planesCopiesPerWorkItem; + numElementsPerLineInt = (int)numElementsPerLine; + numLinesInt = (int)numLines; + planesCopiesPerWorkgroup = + (int)(planesCopiesPerWorkItem * localWorkgroupSize); + + log_info("Global: %d, local %d, local buffer %db, global in buffer %db, " + "global out buffer %db, each work group will copy %d planes and " + "each work item item will copy %d planes.\n", + (int)globalWorkgroupSize, (int)localWorkgroupSize, + (int)localBufferSize, (int)inBufferSize, (int)outBufferSize, + planesCopiesPerWorkgroup, planesCopiesPerWorkItemInt); + + threads[0] = globalWorkgroupSize; + localThreads[0] = localWorkgroupSize; + + d = init_genrand(gRandomSeed); + generate_random_data( + vecType, inBufferSize / get_explicit_type_size(vecType), d, inBuffer); + generate_random_data( + vecType, outBufferSize / get_explicit_type_size(vecType), d, outBuffer); + free_mtdata(d); + d = NULL; + memcpy(outBufferCopy, outBuffer, outBufferSize); + + streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, inBufferSize, + inBuffer, &error); + test_error(error, "Unable to create input buffer"); + streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, outBufferSize, + outBuffer, &error); + test_error(error, "Unable to create output buffer"); + + error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]); + test_error(error, "Unable to set kernel argument"); + error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]); + test_error(error, "Unable to set kernel argument"); + error = clSetKernelArg(kernel, 2, localBufferSize, NULL); + test_error(error, "Unable to set kernel argument"); + error = clSetKernelArg(kernel, 3, sizeof(numElementsPerLineInt), + &numElementsPerLineInt); + test_error(error, "Unable to set kernel argument"); + error = clSetKernelArg(kernel, 4, sizeof(numLinesInt), &numLinesInt); + test_error(error, "Unable to set kernel argument"); + error = clSetKernelArg(kernel, 5, sizeof(planesCopiesPerWorkgroup), + &planesCopiesPerWorkgroup); + test_error(error, "Unable to set kernel argument"); + error = clSetKernelArg(kernel, 6, sizeof(planesCopiesPerWorkItemInt), + &planesCopiesPerWorkItemInt); + test_error(error, "Unable to set kernel argument"); + error = clSetKernelArg(kernel, 7, sizeof(srcLineStride), &srcLineStride); + test_error(error, "Unable to set kernel argument"); + error = clSetKernelArg(kernel, 8, sizeof(dstLineStride), &dstLineStride); + test_error(error, "Unable to set kernel argument"); + error = clSetKernelArg(kernel, 9, sizeof(srcPlaneStride), &srcPlaneStride); + test_error(error, "Unable to set kernel argument"); + error = clSetKernelArg(kernel, 10, sizeof(dstPlaneStride), &dstPlaneStride); + test_error(error, "Unable to set kernel argument"); + + // Enqueue + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, + localThreads, 0, NULL, NULL); + test_error(error, "Unable to queue kernel"); + + // Read + error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, outBufferSize, + outBuffer, 0, NULL, NULL); + test_error(error, "Unable to read results"); + + // Verify + int failuresPrinted = 0; + // Verify + size_t typeSize = get_explicit_type_size(vecType) * vecSize; + for (int i = 0; + i < (int)globalWorkgroupSize * planesCopiesPerWorkItem * elementSize; + i += elementSize) + { + for (int j = 0; j < (int)numLines * elementSize; j += elementSize) + { + for (int k = 0; k < (int)numElementsPerLine * elementSize; + k += elementSize) + { + int inIdx = i + * (numLines * numElementsPerLine + + numLines * srcLineStride + srcPlaneStride) + + j * (numElementsPerLine + srcLineStride) + k; + int outIdx = i + * (numLines * numElementsPerLine + + numLines * dstLineStride + dstPlaneStride) + + j * (numElementsPerLine + dstLineStride) + k; + if (memcmp(((char *)inBuffer) + inIdx, + ((char *)outBuffer) + outIdx, typeSize) + != 0) + { + unsigned char *inchar = (unsigned char *)inBuffer + inIdx; + unsigned char *outchar = + (unsigned char *)outBuffer + outIdx; + char values[4096]; + values[0] = 0; + + if (failuresPrinted == 0) + { + // Print first failure message + log_error("ERROR: Results of copy did not validate!"); + } + sprintf(values + strlen(values), "%d -> [", inIdx); + for (int l = 0; l < (int)elementSize; l++) + sprintf(values + strlen(values), "%2x ", inchar[l]); + sprintf(values + strlen(values), "] != ["); + for (int l = 0; l < (int)elementSize; l++) + sprintf(values + strlen(values), "%2x ", outchar[l]); + sprintf(values + strlen(values), "]"); + log_error("%s\n", values); + failuresPrinted++; + } + + if (failuresPrinted > 5) + { + log_error("Not printing further failures...\n"); + return -1; + } + } + if (j < (int)numLines * elementSize) + { + int outIdx = i + * (numLines * numElementsPerLine + + numLines * dstLineStride + dstPlaneStride) + + j * (numElementsPerLine + dstLineStride) + + numElementsPerLine * elementSize; + if (memcmp(((char *)outBuffer) + outIdx, + ((char *)outBufferCopy) + outIdx, + dstLineStride * elementSize) + != 0) + { + if (failuresPrinted == 0) + { + // Print first failure message + log_error("ERROR: Results of copy did not validate!\n"); + } + log_error( + "3D copy corrupted data in output buffer in the line " + "stride offset of plane %d line %d\n", + i, j); + failuresPrinted++; + } + if (failuresPrinted > 5) + { + log_error("Not printing further failures...\n"); + return -1; + } + } + } + if (i < (int)(globalWorkgroupSize * planesCopiesPerWorkItem - 1) + * elementSize) + { + int outIdx = i + * (numLines * numElementsPerLine + numLines * dstLineStride + + dstPlaneStride) + + (numLines * elementSize) * (numElementsPerLine) + + (numLines * elementSize) * (dstLineStride); + if (memcmp(((char *)outBuffer) + outIdx, + ((char *)outBufferCopy) + outIdx, + dstPlaneStride * elementSize) + != 0) + { + if (failuresPrinted == 0) + { + // Print first failure message + log_error("ERROR: Results of copy did not validate!\n"); + } + log_error("3D copy corrupted data in output buffer in the " + "plane stride " + "offset of plane %d\n", + i); + failuresPrinted++; + } + if (failuresPrinted > 5) + { + log_error("Not printing further failures...\n"); + return -1; + } + } + } + + free(inBuffer); + free(outBuffer); + free(outBufferCopy); + + return failuresPrinted ? -1 : 0; +} + +int test_copy3D_all_types(cl_device_id deviceID, cl_context context, + cl_command_queue queue, const char *kernelCode, + bool localIsDst) +{ + ExplicitType vecType[] = { + kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, + kULong, kFloat, kDouble, kNumExplicitTypes + }; + unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 }; + unsigned int smallTypesStrideSizes[] = { 0, 10, 100 }; + unsigned int size, typeIndex, srcLineStride, dstLineStride, srcPlaneStride, + dstPlaneStride; + + int errors = 0; + + for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++) + { + if (vecType[typeIndex] == kDouble + && !is_extension_available(deviceID, "cl_khr_fp64")) + continue; + + if ((vecType[typeIndex] == kLong || vecType[typeIndex] == kULong) + && !gHasLong) + continue; + + for (size = 0; vecSizes[size] != 0; size++) + { + if (get_explicit_type_size(vecType[typeIndex]) * vecSizes[size] + <= 2) // small type + { + for (srcLineStride = 0; + srcLineStride < sizeof(smallTypesStrideSizes) + / sizeof(smallTypesStrideSizes[0]); + srcLineStride++) + { + for (dstLineStride = 0; + dstLineStride < sizeof(smallTypesStrideSizes) + / sizeof(smallTypesStrideSizes[0]); + dstLineStride++) + { + for (srcPlaneStride = 0; + srcPlaneStride < sizeof(smallTypesStrideSizes) + / sizeof(smallTypesStrideSizes[0]); + srcPlaneStride++) + { + for (dstPlaneStride = 0; + dstPlaneStride < sizeof(smallTypesStrideSizes) + / sizeof(smallTypesStrideSizes[0]); + dstPlaneStride++) + { + if (test_copy3D( + deviceID, context, queue, kernelCode, + vecType[typeIndex], vecSizes[size], + smallTypesStrideSizes[srcLineStride], + smallTypesStrideSizes[dstLineStride], + smallTypesStrideSizes[srcPlaneStride], + smallTypesStrideSizes[dstPlaneStride], + localIsDst)) + { + errors++; + } + } + } + } + } + } + // not a small type, check only zero stride + else if (test_copy3D(deviceID, context, queue, kernelCode, + vecType[typeIndex], vecSizes[size], 0, 0, 0, 0, + localIsDst)) + { + errors++; + } + } + } + if (errors) return -1; + return 0; +} + +int test_async_copy_global_to_local3D(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) +{ + return test_copy3D_all_types(deviceID, context, queue, + async_global_to_local_kernel3D, true); +} + +int test_async_copy_local_to_global3D(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) +{ + return test_copy3D_all_types(deviceID, context, queue, + async_local_to_global_kernel3D, false); +} diff --git a/test_conformance/basic/test_async_copy_fence.cpp b/test_conformance/basic/test_async_copy_fence.cpp new file mode 100644 index 00000000..74f6e407 --- /dev/null +++ b/test_conformance/basic/test_async_copy_fence.cpp @@ -0,0 +1,812 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#include "../../test_common/harness/compat.h" + +#include +#include +#include +#include +#include + +#include "../../test_common/harness/conversions.h" +#include "procs.h" + +static const char *import_after_export_aliased_local_kernel = + "#pragma OPENCL EXTENSION cl_khr_async_work_group_copy_fence : enable\n" + "%s\n" // optional pragma string + "__kernel void test_fn( const __global %s *exportSrc, __global %s " + "*exportDst,\n" + " const __global %s *importSrc, __global %s " + "*importDst,\n" + " __local %s *localBuffer, /* there isn't another " + "__local %s local buffer since export src and import dst are aliased*/\n" + " int exportSrcLocalSize, int " + "exportCopiesPerWorkItem,\n" + " int importSrcLocalSize, int " + "importCopiesPerWorkItem )\n" + "{\n" + " int i;\n" + " int localImportOffset = exportSrcLocalSize - importSrcLocalSize;\n" + // Zero the local storage first + " for(i=0; i max_local_workgroup_size[0]) + max_workgroup_size = max_local_workgroup_size[0]; + + size_t transaction1NumberOfCopiesPerWorkitem = 13; + size_t transaction2NumberOfCopiesPerWorkitem = 2; + elementSize = + get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize); + size_t localStorageSpacePerWorkitem = + transaction1NumberOfCopiesPerWorkitem * elementSize + + (aliased_local_mem + ? 0 + : transaction2NumberOfCopiesPerWorkitem * elementSize); + size_t maxLocalWorkgroupSize = + (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem); + + // Calculation can return 0 on embedded devices due to 1KB local mem limit + if (maxLocalWorkgroupSize == 0) + { + maxLocalWorkgroupSize = 1; + } + + size_t localWorkgroupSize = maxLocalWorkgroupSize; + if (maxLocalWorkgroupSize > max_workgroup_size) + localWorkgroupSize = max_workgroup_size; + + size_t transaction1LocalBufferSize = localWorkgroupSize * elementSize + * transaction1NumberOfCopiesPerWorkitem; + size_t transaction2LocalBufferSize = localWorkgroupSize * elementSize + * transaction2NumberOfCopiesPerWorkitem; // irrelevant if + // aliased_local_mem + size_t numberOfLocalWorkgroups = 1111; + size_t transaction1GlobalBufferSize = + numberOfLocalWorkgroups * transaction1LocalBufferSize; + size_t transaction2GlobalBufferSize = + numberOfLocalWorkgroups * transaction2LocalBufferSize; + size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize; + + transaction1InBuffer = (void *)malloc(transaction1GlobalBufferSize); + transaction1OutBuffer = (void *)malloc(transaction1GlobalBufferSize); + transaction2InBuffer = (void *)malloc(transaction2GlobalBufferSize); + transaction2OutBuffer = (void *)malloc(transaction2GlobalBufferSize); + memset(transaction1OutBuffer, 0, transaction1GlobalBufferSize); + memset(transaction2OutBuffer, 0, transaction2GlobalBufferSize); + + cl_int transaction1CopiesPerWorkitemInt, transaction1CopiesPerWorkgroup, + transaction2CopiesPerWorkitemInt, transaction2CopiesPerWorkgroup; + transaction1CopiesPerWorkitemInt = + (int)transaction1NumberOfCopiesPerWorkitem; + transaction1CopiesPerWorkgroup = + (int)(transaction1NumberOfCopiesPerWorkitem * localWorkgroupSize); + transaction2CopiesPerWorkitemInt = + (int)transaction2NumberOfCopiesPerWorkitem; + transaction2CopiesPerWorkgroup = + (int)(transaction2NumberOfCopiesPerWorkitem * localWorkgroupSize); + + log_info( + "Global: %d, local %d. 1st Transaction: local buffer %db, global " + "buffer %db, each work group will copy %d elements and each work " + "item item will copy %d elements. 2nd Transaction: local buffer " + "%db, global buffer %db, each work group will copy %d elements and " + "each work item will copy %d elements\n", + (int)globalWorkgroupSize, (int)localWorkgroupSize, + (int)transaction1LocalBufferSize, (int)transaction1GlobalBufferSize, + transaction1CopiesPerWorkgroup, transaction1CopiesPerWorkitemInt, + (int)transaction2LocalBufferSize, (int)transaction2GlobalBufferSize, + transaction2CopiesPerWorkgroup, transaction2CopiesPerWorkitemInt); + + threads[0] = globalWorkgroupSize; + localThreads[0] = localWorkgroupSize; + + d = init_genrand(gRandomSeed); + generate_random_data( + vecType, transaction1GlobalBufferSize / get_explicit_type_size(vecType), + d, transaction1InBuffer); + if (!transaction1DstIsTransaction2Src) + { + generate_random_data(vecType, + transaction2GlobalBufferSize + / get_explicit_type_size(vecType), + d, transaction2InBuffer); + } + free_mtdata(d); + d = NULL; + + streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, + transaction1GlobalBufferSize, + transaction1InBuffer, &error); + test_error(error, "Unable to create input buffer"); + streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, + transaction1GlobalBufferSize, + transaction1OutBuffer, &error); + test_error(error, "Unable to create output buffer"); + if (!transaction1DstIsTransaction2Src) + { + streams[2] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, + transaction2GlobalBufferSize, + transaction2InBuffer, &error); + test_error(error, "Unable to create input buffer"); + } + if (!transaction1SrcIsTransaction2Dst) + { + streams[3] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, + transaction2GlobalBufferSize, + transaction2OutBuffer, &error); + test_error(error, "Unable to create output buffer"); + } + + cl_uint argIndex = 0; + error = clSetKernelArg(kernel, argIndex, sizeof(streams[0]), &streams[0]); + test_error(error, "Unable to set kernel argument"); + ++argIndex; + error = clSetKernelArg(kernel, argIndex, sizeof(streams[1]), &streams[1]); + test_error(error, "Unable to set kernel argument"); + ++argIndex; + if (!transaction1DstIsTransaction2Src) + { + error = + clSetKernelArg(kernel, argIndex, sizeof(streams[2]), &streams[2]); + test_error(error, "Unable to set kernel argument"); + ++argIndex; + } + if (!transaction1SrcIsTransaction2Dst) + { + error = + clSetKernelArg(kernel, argIndex, sizeof(streams[3]), &streams[3]); + test_error(error, "Unable to set kernel argument"); + ++argIndex; + } + error = clSetKernelArg(kernel, argIndex, transaction1LocalBufferSize, NULL); + test_error(error, "Unable to set kernel argument"); + ++argIndex; + if (!aliased_local_mem) + { + error = + clSetKernelArg(kernel, argIndex, transaction2LocalBufferSize, NULL); + test_error(error, "Unable to set kernel argument"); + ++argIndex; + } + error = + clSetKernelArg(kernel, argIndex, sizeof(transaction1CopiesPerWorkgroup), + &transaction1CopiesPerWorkgroup); + test_error(error, "Unable to set kernel argument"); + ++argIndex; + error = clSetKernelArg(kernel, argIndex, + sizeof(transaction1CopiesPerWorkitemInt), + &transaction1CopiesPerWorkitemInt); + test_error(error, "Unable to set kernel argument"); + ++argIndex; + error = + clSetKernelArg(kernel, argIndex, sizeof(transaction2CopiesPerWorkgroup), + &transaction2CopiesPerWorkgroup); + test_error(error, "Unable to set kernel argument"); + ++argIndex; + error = clSetKernelArg(kernel, argIndex, + sizeof(transaction2CopiesPerWorkitemInt), + &transaction2CopiesPerWorkitemInt); + test_error(error, "Unable to set kernel argument"); + + // Enqueue + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, + localThreads, 0, NULL, NULL); + test_error(error, "Unable to queue kernel"); + + // Read + error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, + transaction1GlobalBufferSize, + transaction1OutBuffer, 0, NULL, NULL); + test_error(error, "Unable to read results"); + if (transaction1DstIsTransaction2Src) + { + for (size_t idx = 0; idx < numberOfLocalWorkgroups; idx++) + { + memcpy( + (void *)((unsigned char *)transaction2InBuffer + + idx * transaction2CopiesPerWorkgroup * elementSize), + (const void *)((unsigned char *)transaction1OutBuffer + + (idx * transaction1CopiesPerWorkgroup + + (transaction1CopiesPerWorkgroup + - transaction2CopiesPerWorkgroup)) + * elementSize), + (size_t)transaction2CopiesPerWorkgroup * elementSize); + } + } + if (transaction1SrcIsTransaction2Dst) + { + void *transaction1SrcBuffer = + (void *)malloc(transaction1GlobalBufferSize); + error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0, + transaction1GlobalBufferSize, + transaction1SrcBuffer, 0, NULL, NULL); + test_error(error, "Unable to read results"); + for (size_t idx = 0; idx < numberOfLocalWorkgroups; idx++) + { + memcpy( + (void *)((unsigned char *)transaction2OutBuffer + + idx * transaction2CopiesPerWorkgroup * elementSize), + (const void *)((unsigned char *)transaction1SrcBuffer + + (idx * transaction1CopiesPerWorkgroup + + (transaction1CopiesPerWorkgroup + - transaction2CopiesPerWorkgroup)) + * elementSize), + (size_t)transaction2CopiesPerWorkgroup * elementSize); + } + free(transaction1SrcBuffer); + } + else + { + error = clEnqueueReadBuffer(queue, streams[3], CL_TRUE, 0, + transaction2GlobalBufferSize, + transaction2OutBuffer, 0, NULL, NULL); + test_error(error, "Unable to read results"); + } + + // Verify + int failuresPrinted = 0; + if (memcmp(transaction1InBuffer, transaction1OutBuffer, + transaction1GlobalBufferSize) + != 0) + { + size_t typeSize = get_explicit_type_size(vecType) * vecSize; + unsigned char *inchar = (unsigned char *)transaction1InBuffer; + unsigned char *outchar = (unsigned char *)transaction1OutBuffer; + for (int i = 0; i < (int)transaction1GlobalBufferSize; + i += (int)elementSize) + { + if (memcmp(((char *)inchar) + i, ((char *)outchar) + i, typeSize) + != 0) + { + char values[4096]; + values[0] = 0; + if (failuresPrinted == 0) + { + // Print first failure message + log_error("ERROR: Results of 1st transaction did not " + "validate!\n"); + } + sprintf(values + strlen(values), "%d -> [", i); + for (int j = 0; j < (int)elementSize; j++) + sprintf(values + strlen(values), "%2x ", inchar[i + j]); + sprintf(values + strlen(values), "] != ["); + for (int j = 0; j < (int)elementSize; j++) + sprintf(values + strlen(values), "%2x ", outchar[i + j]); + sprintf(values + strlen(values), "]"); + log_error("%s\n", values); + failuresPrinted++; + } + + if (failuresPrinted > 5) + { + log_error("Not printing further failures...\n"); + break; + } + } + } + if (memcmp(transaction2InBuffer, transaction2OutBuffer, + transaction2GlobalBufferSize) + != 0) + { + size_t typeSize = get_explicit_type_size(vecType) * vecSize; + unsigned char *inchar = (unsigned char *)transaction2InBuffer; + unsigned char *outchar = (unsigned char *)transaction2OutBuffer; + for (int i = 0; i < (int)transaction2GlobalBufferSize; + i += (int)elementSize) + { + if (memcmp(((char *)inchar) + i, ((char *)outchar) + i, typeSize) + != 0) + { + char values[4096]; + values[0] = 0; + if (failuresPrinted == 0) + { + // Print first failure message + log_error("ERROR: Results of 2nd transaction did not " + "validate!\n"); + } + sprintf(values + strlen(values), "%d -> [", i); + for (int j = 0; j < (int)elementSize; j++) + sprintf(values + strlen(values), "%2x ", inchar[i + j]); + sprintf(values + strlen(values), "] != ["); + for (int j = 0; j < (int)elementSize; j++) + sprintf(values + strlen(values), "%2x ", outchar[i + j]); + sprintf(values + strlen(values), "]"); + log_error("%s\n", values); + failuresPrinted++; + } + + if (failuresPrinted > 5) + { + log_error("Not printing further failures...\n"); + break; + } + } + } + + free(transaction1InBuffer); + free(transaction1OutBuffer); + free(transaction2InBuffer); + free(transaction2OutBuffer); + + return failuresPrinted ? -1 : 0; +} + +int test_copy_fence_all_types(cl_device_id deviceID, cl_context context, + cl_command_queue queue, const char *kernelCode, + bool export_after_import, bool aliased_local_mem, + bool aliased_global_mem) +{ + ExplicitType vecType[] = { + kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, + kULong, kFloat, kDouble, kNumExplicitTypes + }; + unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 }; + unsigned int size, typeIndex; + + int errors = 0; + + for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++) + { + if (vecType[typeIndex] == kDouble + && !is_extension_available(deviceID, "cl_khr_fp64")) + continue; + + if ((vecType[typeIndex] == kLong || vecType[typeIndex] == kULong) + && !gHasLong) + continue; + + for (size = 0; vecSizes[size] != 0; size++) + { + if (test_copy_fence(deviceID, context, queue, kernelCode, + vecType[typeIndex], vecSizes[size], + export_after_import, aliased_local_mem, + aliased_global_mem)) + { + errors++; + } + } + } + if (errors) return -1; + return 0; +} + +int test_async_work_group_copy_fence_import_after_export_aliased_local( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) +{ + return test_copy_fence_all_types(deviceID, context, queue, + import_after_export_aliased_local_kernel, + false, true, false); +} + +int test_async_work_group_copy_fence_import_after_export_aliased_global( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) +{ + return test_copy_fence_all_types(deviceID, context, queue, + import_after_export_aliased_global_kernel, + false, false, true); +} + +int test_async_work_group_copy_fence_import_after_export_aliased_global_and_local( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) +{ + return test_copy_fence_all_types( + deviceID, context, queue, + import_after_export_aliased_global_and_local_kernel, false, true, true); +} + +int test_async_work_group_copy_fence_export_after_import_aliased_local( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) +{ + return test_copy_fence_all_types(deviceID, context, queue, + export_after_import_aliased_local_kernel, + true, true, false); +} + +int test_async_work_group_copy_fence_export_after_import_aliased_global( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) +{ + return test_copy_fence_all_types(deviceID, context, queue, + export_after_import_aliased_global_kernel, + true, false, true); +} + +int test_async_work_group_copy_fence_export_after_import_aliased_global_and_local( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) +{ + return test_copy_fence_all_types( + deviceID, context, queue, + export_after_import_aliased_global_and_local_kernel, true, true, true); +}