diff --git a/test_conformance/basic/test_async_copy2D.cpp b/test_conformance/basic/test_async_copy2D.cpp index fafcac83..54633a31 100644 --- a/test_conformance/basic/test_async_copy2D.cpp +++ b/test_conformance/basic/test_async_copy2D.cpp @@ -25,77 +25,81 @@ #include "../../test_common/harness/conversions.h" #include "procs.h" -static const char *async_global_to_local_kernel2D = - "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n" - "%s\n" // optional pragma string - "__kernel void test_fn( const __global %s *src, __global %s *dst, __local " - "%s *localBuffer, int numElementsPerLine, int lineCopiesPerWorkgroup, int " - "lineCopiesPerWorkItem, int srcStride, int dstStride )\n" - "{\n" - " int i, j;\n" - // Zero the local storage first - " for(i=0; i max_local_workgroup_size[0]) max_workgroup_size = max_local_workgroup_size[0]; - size_t numElementsPerLine = 10; - size_t lineCopiesPerWorkItem = 13; + const size_t numElementsPerLine = 10; + const cl_int dstStride = numElementsPerLine + dstMargin; + const cl_int srcStride = numElementsPerLine + srcMargin; + elementSize = get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize); - size_t localStorageSpacePerWorkitem = lineCopiesPerWorkItem * elementSize - * (numElementsPerLine + (localIsDst ? dstStride : srcStride)); + + const size_t lineCopiesPerWorkItem = 13; + const size_t localStorageSpacePerWorkitem = lineCopiesPerWorkItem + * elementSize * (localIsDst ? dstStride : srcStride); + size_t maxLocalWorkgroupSize = (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem); @@ -199,34 +208,39 @@ int test_copy2D(cl_device_id deviceID, cl_context context, if (maxLocalWorkgroupSize > max_workgroup_size) localWorkgroupSize = max_workgroup_size; - size_t maxTotalLinesIn = (max_alloc_size / elementSize + srcStride) - / (numElementsPerLine + srcStride); - size_t maxTotalLinesOut = (max_alloc_size / elementSize + dstStride) - / (numElementsPerLine + dstStride); - size_t maxTotalLines = std::min(maxTotalLinesIn, maxTotalLinesOut); - size_t maxLocalWorkgroups = + + const size_t maxTotalLinesIn = + (max_alloc_size / elementSize + srcMargin) / srcStride; + const size_t maxTotalLinesOut = + (max_alloc_size / elementSize + dstMargin) / dstStride; + const size_t maxTotalLines = std::min(maxTotalLinesIn, maxTotalLinesOut); + const size_t maxLocalWorkgroups = maxTotalLines / (localWorkgroupSize * lineCopiesPerWorkItem); - size_t localBufferSize = localWorkgroupSize * localStorageSpacePerWorkitem - - (localIsDst ? dstStride : srcStride); - size_t numberOfLocalWorkgroups = std::min(1111, (int)maxLocalWorkgroups); - size_t totalLines = + const size_t localBufferSize = + localWorkgroupSize * localStorageSpacePerWorkitem + - (localIsDst ? dstMargin : srcMargin); + const size_t numberOfLocalWorkgroups = + std::min(1111, (int)maxLocalWorkgroups); + const size_t totalLines = numberOfLocalWorkgroups * localWorkgroupSize * lineCopiesPerWorkItem; - size_t inBufferSize = elementSize - * (totalLines * numElementsPerLine + (totalLines - 1) * srcStride); - size_t outBufferSize = elementSize - * (totalLines * numElementsPerLine + (totalLines - 1) * dstStride); - size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize; + const size_t inBufferSize = elementSize + * (totalLines * numElementsPerLine + (totalLines - 1) * srcMargin); + const size_t outBufferSize = elementSize + * (totalLines * numElementsPerLine + (totalLines - 1) * dstMargin); + const size_t globalWorkgroupSize = + numberOfLocalWorkgroups * localWorkgroupSize; inBuffer = (void *)malloc(inBufferSize); outBuffer = (void *)malloc(outBufferSize); outBufferCopy = (void *)malloc(outBufferSize); - cl_int lineCopiesPerWorkItemInt, numElementsPerLineInt, - lineCopiesPerWorkgroup; - lineCopiesPerWorkItemInt = (int)lineCopiesPerWorkItem; - numElementsPerLineInt = (int)numElementsPerLine; - lineCopiesPerWorkgroup = (int)(lineCopiesPerWorkItem * localWorkgroupSize); + const cl_int lineCopiesPerWorkItemInt = + static_cast(lineCopiesPerWorkItem); + const cl_int numElementsPerLineInt = + static_cast(numElementsPerLine); + const cl_int lineCopiesPerWorkgroup = + static_cast(lineCopiesPerWorkItem * localWorkgroupSize); log_info( "Global: %d, local %d, local buffer %db, global in buffer %db, " @@ -296,8 +310,8 @@ int test_copy2D(cl_device_id deviceID, cl_context context, for (int j = 0; j < (int)numElementsPerLine * elementSize; j += elementSize) { - int inIdx = i * (numElementsPerLine + srcStride) + j; - int outIdx = i * (numElementsPerLine + dstStride) + j; + int inIdx = i * srcStride + j; + int outIdx = i * dstStride + j; if (memcmp(((char *)inBuffer) + inIdx, ((char *)outBuffer) + outIdx, typeSize) != 0) @@ -332,11 +346,10 @@ int test_copy2D(cl_device_id deviceID, cl_context context, if (i < (int)(globalWorkgroupSize * lineCopiesPerWorkItem - 1) * elementSize) { - int outIdx = i * (numElementsPerLine + dstStride) - + numElementsPerLine * elementSize; + int outIdx = i * dstStride + numElementsPerLine * elementSize; if (memcmp(((char *)outBuffer) + outIdx, ((char *)outBufferCopy) + outIdx, - dstStride * elementSize) + dstMargin * elementSize) != 0) { if (failuresPrinted == 0) @@ -373,9 +386,12 @@ int test_copy2D_all_types(cl_device_id deviceID, cl_context context, kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes }; + // The margins below represent the number of elements between the end of + // one line and the start of the next. The strides are equivalent to the + // length of the line plus the chosen margin. unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 }; - unsigned int smallTypesStrideSizes[] = { 0, 10, 100 }; - unsigned int size, typeIndex, srcStride, dstStride; + unsigned int smallTypesMarginSizes[] = { 0, 10, 100 }; + unsigned int size, typeIndex, srcMargin, dstMargin; int errors = 0; @@ -401,19 +417,19 @@ int test_copy2D_all_types(cl_device_id deviceID, cl_context context, if (get_explicit_type_size(vecType[typeIndex]) * vecSizes[size] <= 2) // small type { - for (srcStride = 0; srcStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - srcStride++) + for (srcMargin = 0; srcMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + srcMargin++) { - for (dstStride = 0; - dstStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - dstStride++) + for (dstMargin = 0; + dstMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + dstMargin++) { if (test_copy2D(deviceID, context, queue, kernelCode, vecType[typeIndex], vecSizes[size], - smallTypesStrideSizes[srcStride], - smallTypesStrideSizes[dstStride], + smallTypesMarginSizes[srcMargin], + smallTypesMarginSizes[dstMargin], localIsDst)) { errors++; diff --git a/test_conformance/basic/test_async_copy3D.cpp b/test_conformance/basic/test_async_copy3D.cpp index 2b184ee5..5eb41ebc 100644 --- a/test_conformance/basic/test_async_copy3D.cpp +++ b/test_conformance/basic/test_async_copy3D.cpp @@ -25,96 +25,95 @@ #include "../../test_common/harness/conversions.h" #include "procs.h" -static const char *async_global_to_local_kernel3D = - "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n" - "%s\n" // optional pragma string - "__kernel void test_fn( const __global %s *src, __global %s *dst, __local " - "%s *localBuffer, int numElementsPerLine, int numLines, int " - "planesCopiesPerWorkgroup, int planesCopiesPerWorkItem, int srcLineStride, " - "int dstLineStride, int srcPlaneStride, int dstPlaneStride )\n" - "{\n" - " int i, j, k;\n" - // Zero the local storage first - " for(i=0; i max_local_workgroup_size[0]) max_workgroup_size = max_local_workgroup_size[0]; - size_t numElementsPerLine = 10; - size_t numLines = 13; - size_t planesCopiesPerWorkItem = 2; + const size_t numElementsPerLine = 10; + const cl_int dstLineStride = numElementsPerLine + dstLineMargin; + const cl_int srcLineStride = numElementsPerLine + srcLineMargin; + + const size_t numLines = 13; + const cl_int dstPlaneStride = (numLines * dstLineStride) + dstPlaneMargin; + const cl_int srcPlaneStride = (numLines * srcLineStride) + srcPlaneMargin; + elementSize = get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize); - size_t localStorageSpacePerWorkitem = elementSize - * (planesCopiesPerWorkItem - * (numLines * numElementsPerLine - + numLines * (localIsDst ? dstLineStride : srcLineStride) - + (localIsDst ? dstPlaneStride : srcPlaneStride))); + const size_t planesCopiesPerWorkItem = 2; + const size_t localStorageSpacePerWorkitem = elementSize + * planesCopiesPerWorkItem + * (localIsDst ? dstPlaneStride : srcPlaneStride); size_t maxLocalWorkgroupSize = (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem); @@ -224,42 +227,41 @@ int test_copy3D(cl_device_id deviceID, cl_context context, if (maxLocalWorkgroupSize > max_workgroup_size) localWorkgroupSize = max_workgroup_size; - size_t maxTotalPlanesIn = ((max_alloc_size / elementSize) + srcPlaneStride) - / ((numLines * numElementsPerLine + numLines * srcLineStride) - + srcPlaneStride); - size_t maxTotalPlanesOut = ((max_alloc_size / elementSize) + dstPlaneStride) - / ((numLines * numElementsPerLine + numLines * dstLineStride) - + dstPlaneStride); - size_t maxTotalPlanes = std::min(maxTotalPlanesIn, maxTotalPlanesOut); - size_t maxLocalWorkgroups = + const size_t maxTotalPlanesIn = + ((max_alloc_size / elementSize) + srcPlaneMargin) / srcPlaneStride; + const size_t maxTotalPlanesOut = + ((max_alloc_size / elementSize) + dstPlaneMargin) / dstPlaneStride; + const size_t maxTotalPlanes = std::min(maxTotalPlanesIn, maxTotalPlanesOut); + const size_t maxLocalWorkgroups = maxTotalPlanes / (localWorkgroupSize * planesCopiesPerWorkItem); - size_t localBufferSize = localWorkgroupSize * localStorageSpacePerWorkitem - - (localIsDst ? dstPlaneStride : srcPlaneStride); - size_t numberOfLocalWorkgroups = std::min(1111, (int)maxLocalWorkgroups); - size_t totalPlanes = + const size_t localBufferSize = + localWorkgroupSize * localStorageSpacePerWorkitem + - (localIsDst ? dstPlaneMargin : srcPlaneMargin); + const size_t numberOfLocalWorkgroups = + std::min(1111, (int)maxLocalWorkgroups); + const size_t totalPlanes = numberOfLocalWorkgroups * localWorkgroupSize * planesCopiesPerWorkItem; - size_t inBufferSize = elementSize - * (totalPlanes - * (numLines * numElementsPerLine + numLines * srcLineStride) - + (totalPlanes - 1) * srcPlaneStride); - size_t outBufferSize = elementSize - * (totalPlanes - * (numLines * numElementsPerLine + numLines * dstLineStride) - + (totalPlanes - 1) * dstPlaneStride); - size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize; + const size_t inBufferSize = elementSize + * (totalPlanes * numLines * srcLineStride + + (totalPlanes - 1) * srcPlaneMargin); + const size_t outBufferSize = elementSize + * (totalPlanes * numLines * dstLineStride + + (totalPlanes - 1) * dstPlaneMargin); + const size_t globalWorkgroupSize = + numberOfLocalWorkgroups * localWorkgroupSize; inBuffer = (void *)malloc(inBufferSize); outBuffer = (void *)malloc(outBufferSize); outBufferCopy = (void *)malloc(outBufferSize); - cl_int planesCopiesPerWorkItemInt, numElementsPerLineInt, numLinesInt, - planesCopiesPerWorkgroup; - planesCopiesPerWorkItemInt = (int)planesCopiesPerWorkItem; - numElementsPerLineInt = (int)numElementsPerLine; - numLinesInt = (int)numLines; - planesCopiesPerWorkgroup = - (int)(planesCopiesPerWorkItem * localWorkgroupSize); + const cl_int planesCopiesPerWorkItemInt = + static_cast(planesCopiesPerWorkItem); + const cl_int numElementsPerLineInt = + static_cast(numElementsPerLine); + const cl_int numLinesInt = static_cast(numLines); + const cl_int planesCopiesPerWorkgroup = + static_cast(planesCopiesPerWorkItem * localWorkgroupSize); log_info("Global: %d, local %d, local buffer %db, global in buffer %db, " "global out buffer %db, each work group will copy %d planes and " @@ -336,14 +338,8 @@ int test_copy3D(cl_device_id deviceID, cl_context context, for (int k = 0; k < (int)numElementsPerLine * elementSize; k += elementSize) { - int inIdx = i - * (numLines * numElementsPerLine - + numLines * srcLineStride + srcPlaneStride) - + j * (numElementsPerLine + srcLineStride) + k; - int outIdx = i - * (numLines * numElementsPerLine - + numLines * dstLineStride + dstPlaneStride) - + j * (numElementsPerLine + dstLineStride) + k; + int inIdx = i * srcPlaneStride + j * srcLineStride + k; + int outIdx = i * dstPlaneStride + j * dstLineStride + k; if (memcmp(((char *)inBuffer) + inIdx, ((char *)outBuffer) + outIdx, typeSize) != 0) @@ -378,14 +374,11 @@ int test_copy3D(cl_device_id deviceID, cl_context context, } if (j < (int)numLines * elementSize) { - int outIdx = i - * (numLines * numElementsPerLine - + numLines * dstLineStride + dstPlaneStride) - + j * (numElementsPerLine + dstLineStride) + int outIdx = i * dstPlaneStride + j * dstLineStride + numElementsPerLine * elementSize; if (memcmp(((char *)outBuffer) + outIdx, ((char *)outBufferCopy) + outIdx, - dstLineStride * elementSize) + dstLineMargin * elementSize) != 0) { if (failuresPrinted == 0) @@ -409,14 +402,11 @@ int test_copy3D(cl_device_id deviceID, cl_context context, if (i < (int)(globalWorkgroupSize * planesCopiesPerWorkItem - 1) * elementSize) { - int outIdx = i - * (numLines * numElementsPerLine + numLines * dstLineStride - + dstPlaneStride) - + (numLines * elementSize) * (numElementsPerLine) - + (numLines * elementSize) * (dstLineStride); + int outIdx = + i * dstPlaneStride + numLines * dstLineStride * elementSize; if (memcmp(((char *)outBuffer) + outIdx, ((char *)outBufferCopy) + outIdx, - dstPlaneStride * elementSize) + dstPlaneMargin * elementSize) != 0) { if (failuresPrinted == 0) @@ -453,10 +443,13 @@ int test_copy3D_all_types(cl_device_id deviceID, cl_context context, kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes }; + // The margins below represent the number of elements between the end of + // one line or plane and the start of the next. The strides are equivalent + // to the size of the line or plane plus the chosen margin. unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 }; - unsigned int smallTypesStrideSizes[] = { 0, 10, 100 }; - unsigned int size, typeIndex, srcLineStride, dstLineStride, srcPlaneStride, - dstPlaneStride; + unsigned int smallTypesMarginSizes[] = { 0, 10, 100 }; + unsigned int size, typeIndex, srcLineMargin, dstLineMargin, srcPlaneMargin, + dstPlaneMargin; int errors = 0; @@ -482,33 +475,33 @@ int test_copy3D_all_types(cl_device_id deviceID, cl_context context, if (get_explicit_type_size(vecType[typeIndex]) * vecSizes[size] <= 2) // small type { - for (srcLineStride = 0; - srcLineStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - srcLineStride++) + for (srcLineMargin = 0; + srcLineMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + srcLineMargin++) { - for (dstLineStride = 0; - dstLineStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - dstLineStride++) + for (dstLineMargin = 0; + dstLineMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + dstLineMargin++) { - for (srcPlaneStride = 0; - srcPlaneStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - srcPlaneStride++) + for (srcPlaneMargin = 0; + srcPlaneMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + srcPlaneMargin++) { - for (dstPlaneStride = 0; - dstPlaneStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - dstPlaneStride++) + for (dstPlaneMargin = 0; + dstPlaneMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + dstPlaneMargin++) { if (test_copy3D( deviceID, context, queue, kernelCode, vecType[typeIndex], vecSizes[size], - smallTypesStrideSizes[srcLineStride], - smallTypesStrideSizes[dstLineStride], - smallTypesStrideSizes[srcPlaneStride], - smallTypesStrideSizes[dstPlaneStride], + smallTypesMarginSizes[srcLineMargin], + smallTypesMarginSizes[dstLineMargin], + smallTypesMarginSizes[srcPlaneMargin], + smallTypesMarginSizes[dstPlaneMargin], localIsDst)) { errors++;