From a37884fe4461362c39a444f39402baebac3e713b Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Tue, 19 Jul 2022 17:43:36 +0100 Subject: [PATCH] Update cl_khr_extended_async_copies tests to the latest extension version (#1426) * Update cl_khr_extended_async_copies tests to the latest version of the extension Update the 2D and 3D extended async copies tests. Previously they were based on an older provisional version of the extension. Also update the variable names to only use 'stride' to refer to the actual stride values. Previously the tests used 'stride' to refer to the end of one line or plane and the start of the next. This is not the commonly understood meaning. * Address cl_khr_extended_async_copies PR feedback * Remove unnecessary parenthesis in kernel code * Make variables `const` and rearrange so that we can reuse variables, rather than repeating expressions. * Add in missing vector size of 3 for 2D tests * Use C++ String literals for kernel code Rather than C strings use C++11 string literals to define the kernel code in the extended async-copy tests. Doing this makes the kernel code more readable. Co-authored-by: Ewan Crawford --- test_conformance/basic/test_async_copy2D.cpp | 236 ++++++------- test_conformance/basic/test_async_copy3D.cpp | 329 +++++++++---------- 2 files changed, 287 insertions(+), 278 deletions(-) diff --git a/test_conformance/basic/test_async_copy2D.cpp b/test_conformance/basic/test_async_copy2D.cpp index fafcac83..54633a31 100644 --- a/test_conformance/basic/test_async_copy2D.cpp +++ b/test_conformance/basic/test_async_copy2D.cpp @@ -25,77 +25,81 @@ #include "../../test_common/harness/conversions.h" #include "procs.h" -static const char *async_global_to_local_kernel2D = - "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n" - "%s\n" // optional pragma string - "__kernel void test_fn( const __global %s *src, __global %s *dst, __local " - "%s *localBuffer, int numElementsPerLine, int lineCopiesPerWorkgroup, int " - "lineCopiesPerWorkItem, int srcStride, int dstStride )\n" - "{\n" - " int i, j;\n" - // Zero the local storage first - " for(i=0; i max_local_workgroup_size[0]) max_workgroup_size = max_local_workgroup_size[0]; - size_t numElementsPerLine = 10; - size_t lineCopiesPerWorkItem = 13; + const size_t numElementsPerLine = 10; + const cl_int dstStride = numElementsPerLine + dstMargin; + const cl_int srcStride = numElementsPerLine + srcMargin; + elementSize = get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize); - size_t localStorageSpacePerWorkitem = lineCopiesPerWorkItem * elementSize - * (numElementsPerLine + (localIsDst ? dstStride : srcStride)); + + const size_t lineCopiesPerWorkItem = 13; + const size_t localStorageSpacePerWorkitem = lineCopiesPerWorkItem + * elementSize * (localIsDst ? dstStride : srcStride); + size_t maxLocalWorkgroupSize = (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem); @@ -199,34 +208,39 @@ int test_copy2D(cl_device_id deviceID, cl_context context, if (maxLocalWorkgroupSize > max_workgroup_size) localWorkgroupSize = max_workgroup_size; - size_t maxTotalLinesIn = (max_alloc_size / elementSize + srcStride) - / (numElementsPerLine + srcStride); - size_t maxTotalLinesOut = (max_alloc_size / elementSize + dstStride) - / (numElementsPerLine + dstStride); - size_t maxTotalLines = std::min(maxTotalLinesIn, maxTotalLinesOut); - size_t maxLocalWorkgroups = + + const size_t maxTotalLinesIn = + (max_alloc_size / elementSize + srcMargin) / srcStride; + const size_t maxTotalLinesOut = + (max_alloc_size / elementSize + dstMargin) / dstStride; + const size_t maxTotalLines = std::min(maxTotalLinesIn, maxTotalLinesOut); + const size_t maxLocalWorkgroups = maxTotalLines / (localWorkgroupSize * lineCopiesPerWorkItem); - size_t localBufferSize = localWorkgroupSize * localStorageSpacePerWorkitem - - (localIsDst ? dstStride : srcStride); - size_t numberOfLocalWorkgroups = std::min(1111, (int)maxLocalWorkgroups); - size_t totalLines = + const size_t localBufferSize = + localWorkgroupSize * localStorageSpacePerWorkitem + - (localIsDst ? dstMargin : srcMargin); + const size_t numberOfLocalWorkgroups = + std::min(1111, (int)maxLocalWorkgroups); + const size_t totalLines = numberOfLocalWorkgroups * localWorkgroupSize * lineCopiesPerWorkItem; - size_t inBufferSize = elementSize - * (totalLines * numElementsPerLine + (totalLines - 1) * srcStride); - size_t outBufferSize = elementSize - * (totalLines * numElementsPerLine + (totalLines - 1) * dstStride); - size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize; + const size_t inBufferSize = elementSize + * (totalLines * numElementsPerLine + (totalLines - 1) * srcMargin); + const size_t outBufferSize = elementSize + * (totalLines * numElementsPerLine + (totalLines - 1) * dstMargin); + const size_t globalWorkgroupSize = + numberOfLocalWorkgroups * localWorkgroupSize; inBuffer = (void *)malloc(inBufferSize); outBuffer = (void *)malloc(outBufferSize); outBufferCopy = (void *)malloc(outBufferSize); - cl_int lineCopiesPerWorkItemInt, numElementsPerLineInt, - lineCopiesPerWorkgroup; - lineCopiesPerWorkItemInt = (int)lineCopiesPerWorkItem; - numElementsPerLineInt = (int)numElementsPerLine; - lineCopiesPerWorkgroup = (int)(lineCopiesPerWorkItem * localWorkgroupSize); + const cl_int lineCopiesPerWorkItemInt = + static_cast(lineCopiesPerWorkItem); + const cl_int numElementsPerLineInt = + static_cast(numElementsPerLine); + const cl_int lineCopiesPerWorkgroup = + static_cast(lineCopiesPerWorkItem * localWorkgroupSize); log_info( "Global: %d, local %d, local buffer %db, global in buffer %db, " @@ -296,8 +310,8 @@ int test_copy2D(cl_device_id deviceID, cl_context context, for (int j = 0; j < (int)numElementsPerLine * elementSize; j += elementSize) { - int inIdx = i * (numElementsPerLine + srcStride) + j; - int outIdx = i * (numElementsPerLine + dstStride) + j; + int inIdx = i * srcStride + j; + int outIdx = i * dstStride + j; if (memcmp(((char *)inBuffer) + inIdx, ((char *)outBuffer) + outIdx, typeSize) != 0) @@ -332,11 +346,10 @@ int test_copy2D(cl_device_id deviceID, cl_context context, if (i < (int)(globalWorkgroupSize * lineCopiesPerWorkItem - 1) * elementSize) { - int outIdx = i * (numElementsPerLine + dstStride) - + numElementsPerLine * elementSize; + int outIdx = i * dstStride + numElementsPerLine * elementSize; if (memcmp(((char *)outBuffer) + outIdx, ((char *)outBufferCopy) + outIdx, - dstStride * elementSize) + dstMargin * elementSize) != 0) { if (failuresPrinted == 0) @@ -373,9 +386,12 @@ int test_copy2D_all_types(cl_device_id deviceID, cl_context context, kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes }; + // The margins below represent the number of elements between the end of + // one line and the start of the next. The strides are equivalent to the + // length of the line plus the chosen margin. unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 }; - unsigned int smallTypesStrideSizes[] = { 0, 10, 100 }; - unsigned int size, typeIndex, srcStride, dstStride; + unsigned int smallTypesMarginSizes[] = { 0, 10, 100 }; + unsigned int size, typeIndex, srcMargin, dstMargin; int errors = 0; @@ -401,19 +417,19 @@ int test_copy2D_all_types(cl_device_id deviceID, cl_context context, if (get_explicit_type_size(vecType[typeIndex]) * vecSizes[size] <= 2) // small type { - for (srcStride = 0; srcStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - srcStride++) + for (srcMargin = 0; srcMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + srcMargin++) { - for (dstStride = 0; - dstStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - dstStride++) + for (dstMargin = 0; + dstMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + dstMargin++) { if (test_copy2D(deviceID, context, queue, kernelCode, vecType[typeIndex], vecSizes[size], - smallTypesStrideSizes[srcStride], - smallTypesStrideSizes[dstStride], + smallTypesMarginSizes[srcMargin], + smallTypesMarginSizes[dstMargin], localIsDst)) { errors++; diff --git a/test_conformance/basic/test_async_copy3D.cpp b/test_conformance/basic/test_async_copy3D.cpp index 2b184ee5..5eb41ebc 100644 --- a/test_conformance/basic/test_async_copy3D.cpp +++ b/test_conformance/basic/test_async_copy3D.cpp @@ -25,96 +25,95 @@ #include "../../test_common/harness/conversions.h" #include "procs.h" -static const char *async_global_to_local_kernel3D = - "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n" - "%s\n" // optional pragma string - "__kernel void test_fn( const __global %s *src, __global %s *dst, __local " - "%s *localBuffer, int numElementsPerLine, int numLines, int " - "planesCopiesPerWorkgroup, int planesCopiesPerWorkItem, int srcLineStride, " - "int dstLineStride, int srcPlaneStride, int dstPlaneStride )\n" - "{\n" - " int i, j, k;\n" - // Zero the local storage first - " for(i=0; i max_local_workgroup_size[0]) max_workgroup_size = max_local_workgroup_size[0]; - size_t numElementsPerLine = 10; - size_t numLines = 13; - size_t planesCopiesPerWorkItem = 2; + const size_t numElementsPerLine = 10; + const cl_int dstLineStride = numElementsPerLine + dstLineMargin; + const cl_int srcLineStride = numElementsPerLine + srcLineMargin; + + const size_t numLines = 13; + const cl_int dstPlaneStride = (numLines * dstLineStride) + dstPlaneMargin; + const cl_int srcPlaneStride = (numLines * srcLineStride) + srcPlaneMargin; + elementSize = get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize); - size_t localStorageSpacePerWorkitem = elementSize - * (planesCopiesPerWorkItem - * (numLines * numElementsPerLine - + numLines * (localIsDst ? dstLineStride : srcLineStride) - + (localIsDst ? dstPlaneStride : srcPlaneStride))); + const size_t planesCopiesPerWorkItem = 2; + const size_t localStorageSpacePerWorkitem = elementSize + * planesCopiesPerWorkItem + * (localIsDst ? dstPlaneStride : srcPlaneStride); size_t maxLocalWorkgroupSize = (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem); @@ -224,42 +227,41 @@ int test_copy3D(cl_device_id deviceID, cl_context context, if (maxLocalWorkgroupSize > max_workgroup_size) localWorkgroupSize = max_workgroup_size; - size_t maxTotalPlanesIn = ((max_alloc_size / elementSize) + srcPlaneStride) - / ((numLines * numElementsPerLine + numLines * srcLineStride) - + srcPlaneStride); - size_t maxTotalPlanesOut = ((max_alloc_size / elementSize) + dstPlaneStride) - / ((numLines * numElementsPerLine + numLines * dstLineStride) - + dstPlaneStride); - size_t maxTotalPlanes = std::min(maxTotalPlanesIn, maxTotalPlanesOut); - size_t maxLocalWorkgroups = + const size_t maxTotalPlanesIn = + ((max_alloc_size / elementSize) + srcPlaneMargin) / srcPlaneStride; + const size_t maxTotalPlanesOut = + ((max_alloc_size / elementSize) + dstPlaneMargin) / dstPlaneStride; + const size_t maxTotalPlanes = std::min(maxTotalPlanesIn, maxTotalPlanesOut); + const size_t maxLocalWorkgroups = maxTotalPlanes / (localWorkgroupSize * planesCopiesPerWorkItem); - size_t localBufferSize = localWorkgroupSize * localStorageSpacePerWorkitem - - (localIsDst ? dstPlaneStride : srcPlaneStride); - size_t numberOfLocalWorkgroups = std::min(1111, (int)maxLocalWorkgroups); - size_t totalPlanes = + const size_t localBufferSize = + localWorkgroupSize * localStorageSpacePerWorkitem + - (localIsDst ? dstPlaneMargin : srcPlaneMargin); + const size_t numberOfLocalWorkgroups = + std::min(1111, (int)maxLocalWorkgroups); + const size_t totalPlanes = numberOfLocalWorkgroups * localWorkgroupSize * planesCopiesPerWorkItem; - size_t inBufferSize = elementSize - * (totalPlanes - * (numLines * numElementsPerLine + numLines * srcLineStride) - + (totalPlanes - 1) * srcPlaneStride); - size_t outBufferSize = elementSize - * (totalPlanes - * (numLines * numElementsPerLine + numLines * dstLineStride) - + (totalPlanes - 1) * dstPlaneStride); - size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize; + const size_t inBufferSize = elementSize + * (totalPlanes * numLines * srcLineStride + + (totalPlanes - 1) * srcPlaneMargin); + const size_t outBufferSize = elementSize + * (totalPlanes * numLines * dstLineStride + + (totalPlanes - 1) * dstPlaneMargin); + const size_t globalWorkgroupSize = + numberOfLocalWorkgroups * localWorkgroupSize; inBuffer = (void *)malloc(inBufferSize); outBuffer = (void *)malloc(outBufferSize); outBufferCopy = (void *)malloc(outBufferSize); - cl_int planesCopiesPerWorkItemInt, numElementsPerLineInt, numLinesInt, - planesCopiesPerWorkgroup; - planesCopiesPerWorkItemInt = (int)planesCopiesPerWorkItem; - numElementsPerLineInt = (int)numElementsPerLine; - numLinesInt = (int)numLines; - planesCopiesPerWorkgroup = - (int)(planesCopiesPerWorkItem * localWorkgroupSize); + const cl_int planesCopiesPerWorkItemInt = + static_cast(planesCopiesPerWorkItem); + const cl_int numElementsPerLineInt = + static_cast(numElementsPerLine); + const cl_int numLinesInt = static_cast(numLines); + const cl_int planesCopiesPerWorkgroup = + static_cast(planesCopiesPerWorkItem * localWorkgroupSize); log_info("Global: %d, local %d, local buffer %db, global in buffer %db, " "global out buffer %db, each work group will copy %d planes and " @@ -336,14 +338,8 @@ int test_copy3D(cl_device_id deviceID, cl_context context, for (int k = 0; k < (int)numElementsPerLine * elementSize; k += elementSize) { - int inIdx = i - * (numLines * numElementsPerLine - + numLines * srcLineStride + srcPlaneStride) - + j * (numElementsPerLine + srcLineStride) + k; - int outIdx = i - * (numLines * numElementsPerLine - + numLines * dstLineStride + dstPlaneStride) - + j * (numElementsPerLine + dstLineStride) + k; + int inIdx = i * srcPlaneStride + j * srcLineStride + k; + int outIdx = i * dstPlaneStride + j * dstLineStride + k; if (memcmp(((char *)inBuffer) + inIdx, ((char *)outBuffer) + outIdx, typeSize) != 0) @@ -378,14 +374,11 @@ int test_copy3D(cl_device_id deviceID, cl_context context, } if (j < (int)numLines * elementSize) { - int outIdx = i - * (numLines * numElementsPerLine - + numLines * dstLineStride + dstPlaneStride) - + j * (numElementsPerLine + dstLineStride) + int outIdx = i * dstPlaneStride + j * dstLineStride + numElementsPerLine * elementSize; if (memcmp(((char *)outBuffer) + outIdx, ((char *)outBufferCopy) + outIdx, - dstLineStride * elementSize) + dstLineMargin * elementSize) != 0) { if (failuresPrinted == 0) @@ -409,14 +402,11 @@ int test_copy3D(cl_device_id deviceID, cl_context context, if (i < (int)(globalWorkgroupSize * planesCopiesPerWorkItem - 1) * elementSize) { - int outIdx = i - * (numLines * numElementsPerLine + numLines * dstLineStride - + dstPlaneStride) - + (numLines * elementSize) * (numElementsPerLine) - + (numLines * elementSize) * (dstLineStride); + int outIdx = + i * dstPlaneStride + numLines * dstLineStride * elementSize; if (memcmp(((char *)outBuffer) + outIdx, ((char *)outBufferCopy) + outIdx, - dstPlaneStride * elementSize) + dstPlaneMargin * elementSize) != 0) { if (failuresPrinted == 0) @@ -453,10 +443,13 @@ int test_copy3D_all_types(cl_device_id deviceID, cl_context context, kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes }; + // The margins below represent the number of elements between the end of + // one line or plane and the start of the next. The strides are equivalent + // to the size of the line or plane plus the chosen margin. unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 }; - unsigned int smallTypesStrideSizes[] = { 0, 10, 100 }; - unsigned int size, typeIndex, srcLineStride, dstLineStride, srcPlaneStride, - dstPlaneStride; + unsigned int smallTypesMarginSizes[] = { 0, 10, 100 }; + unsigned int size, typeIndex, srcLineMargin, dstLineMargin, srcPlaneMargin, + dstPlaneMargin; int errors = 0; @@ -482,33 +475,33 @@ int test_copy3D_all_types(cl_device_id deviceID, cl_context context, if (get_explicit_type_size(vecType[typeIndex]) * vecSizes[size] <= 2) // small type { - for (srcLineStride = 0; - srcLineStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - srcLineStride++) + for (srcLineMargin = 0; + srcLineMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + srcLineMargin++) { - for (dstLineStride = 0; - dstLineStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - dstLineStride++) + for (dstLineMargin = 0; + dstLineMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + dstLineMargin++) { - for (srcPlaneStride = 0; - srcPlaneStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - srcPlaneStride++) + for (srcPlaneMargin = 0; + srcPlaneMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + srcPlaneMargin++) { - for (dstPlaneStride = 0; - dstPlaneStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - dstPlaneStride++) + for (dstPlaneMargin = 0; + dstPlaneMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + dstPlaneMargin++) { if (test_copy3D( deviceID, context, queue, kernelCode, vecType[typeIndex], vecSizes[size], - smallTypesStrideSizes[srcLineStride], - smallTypesStrideSizes[dstLineStride], - smallTypesStrideSizes[srcPlaneStride], - smallTypesStrideSizes[dstPlaneStride], + smallTypesMarginSizes[srcLineMargin], + smallTypesMarginSizes[dstLineMargin], + smallTypesMarginSizes[srcPlaneMargin], + smallTypesMarginSizes[dstPlaneMargin], localIsDst)) { errors++;