mirror of
https://github.com/KhronosGroup/OpenCL-CTS.git
synced 2026-03-19 06:09:01 +00:00
Image streams optimization (#1616)
* Don't recalculate image parameters repeatedly in `test_read_image()` We've already done this in the loop. There's no need to recalculate those parameters over and over again in `sample_image_pixel*()` and `read_image_pixel*()`. This should save some work during the image streams test. This only affects the 3D tests for now, but my time profiles indicate this is where we spend the most time anyway. * Vectorize read_image_pixel_float() and sample_image_pixel_float() for SSE/AVX This shortens the image streams test time from 45 minutes without it to 37 minutes. Unfortunately, most of the time is now spent waiting for memory, particularly in the 3D tests, because the 3D image doesn't neatly fit in the cache, especially in the linear sampling case, where pixels from two 2D slices must be sampled. Software prefetching won't help; it only helps when execution time is dominated by operations, but this is dominated by memory access. Randomized offsets are likely a factor, because they throw off the hardware prefetcher. One possible further optimization is, in the linear sampling case, to load two sampled pixels at once. This is easy to do using AVX, which extends SSE with 256-bit vectors. Obviously, this only applies to x86 CPUs with SSE2. The greatest performance gains, however, are seen with SSE4.1. Most modern x86 CPUs have SSE4. Work is needed to support other CPUs' vector units--ARM Advanced SIMD/NEON is probably the most important one. Another possibility is arranging the code so that the compiler's autovectorization will kick in and do what I did here manually.
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -536,16 +536,20 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
}
|
||||
|
||||
int nextLevelOffset = 0;
|
||||
// Precalculate LOD dimensions for sample_image_pixel_offset()
|
||||
size_t width_lod = width_size, height_lod = height_size,
|
||||
depth_lod = depth_size;
|
||||
image_descriptor lodInfo = *imageInfo;
|
||||
lodInfo.num_mip_levels = 1;
|
||||
|
||||
// Loop over all mipmap levels, if we are testing mipmapped images.
|
||||
for (int lod = 0; (gTestMipmaps && lod < imageInfo->num_mip_levels)
|
||||
|| (!gTestMipmaps && lod < 1);
|
||||
lod++)
|
||||
{
|
||||
size_t image_lod_size = get_image_num_pixels(
|
||||
imageInfo, width_lod, height_lod, depth_lod, imageInfo->arraySize);
|
||||
size_t image_lod_size =
|
||||
get_image_num_pixels(&lodInfo, lodInfo.width, lodInfo.height,
|
||||
lodInfo.depth, lodInfo.arraySize);
|
||||
test_assert_error(0 != image_lod_size, "Invalid image size");
|
||||
size_t resultValuesSize =
|
||||
image_lod_size * get_explicit_type_size(outputType) * 4;
|
||||
@@ -565,11 +569,11 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
|
||||
// Init the coordinates
|
||||
error = InitFloatCoordsCommon(
|
||||
imageInfo, imageSampler, xOffsetValues, yOffsetValues,
|
||||
&lodInfo, imageSampler, xOffsetValues, yOffsetValues,
|
||||
zOffsetValues, q >= float_offset_count ? -offset : offset,
|
||||
q >= float_offset_count ? offset : -offset,
|
||||
q >= float_offset_count ? -offset : offset,
|
||||
imageSampler->normalized_coords, d, lod);
|
||||
imageSampler->normalized_coords, d, 0);
|
||||
test_error(error, "Unable to initialise coordinates");
|
||||
|
||||
error = clEnqueueWriteBuffer(queue, xOffsets, CL_TRUE, 0,
|
||||
@@ -601,10 +605,9 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
test_error(error, "Unable to run kernel");
|
||||
|
||||
// Get results
|
||||
error = clEnqueueReadBuffer(
|
||||
queue, results, CL_TRUE, 0,
|
||||
image_lod_size * get_explicit_type_size(outputType) * 4,
|
||||
resultValues, 0, NULL, NULL);
|
||||
error = clEnqueueReadBuffer(queue, results, CL_TRUE, 0,
|
||||
resultValuesSize, resultValues, 0, NULL,
|
||||
NULL);
|
||||
test_error(error, "Unable to read results from kernel");
|
||||
if (gDebugTrace) log_info(" results read\n");
|
||||
|
||||
@@ -668,13 +671,13 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
int hasDenormals = 0;
|
||||
FloatPixel maxPixel =
|
||||
sample_image_pixel_float_offset(
|
||||
imagePtr, imageInfo,
|
||||
imagePtr, &lodInfo,
|
||||
xOffsetValues[j],
|
||||
yOffsetValues[j],
|
||||
zOffsetValues[j], norm_offset_x,
|
||||
norm_offset_y, norm_offset_z,
|
||||
imageSampler, expected, 0,
|
||||
&hasDenormals, lod);
|
||||
&hasDenormals, 0);
|
||||
|
||||
float err1 =
|
||||
ABS_ERROR(sRGBmap(resultPtr[0]),
|
||||
@@ -726,7 +729,7 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
|
||||
maxPixel =
|
||||
sample_image_pixel_float_offset(
|
||||
imagePtr, imageInfo,
|
||||
imagePtr, &lodInfo,
|
||||
xOffsetValues[j],
|
||||
yOffsetValues[j],
|
||||
zOffsetValues[j],
|
||||
@@ -734,7 +737,7 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
norm_offset_y,
|
||||
norm_offset_z,
|
||||
imageSampler, expected,
|
||||
0, NULL, lod);
|
||||
0, NULL, 0);
|
||||
|
||||
err1 = ABS_ERROR(
|
||||
sRGBmap(resultPtr[0]),
|
||||
@@ -788,7 +791,7 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
int hasDenormals = 0;
|
||||
FloatPixel maxPixel =
|
||||
sample_image_pixel_float_offset(
|
||||
imagePtr, imageInfo,
|
||||
imagePtr, &lodInfo,
|
||||
xOffsetValues[j],
|
||||
yOffsetValues[j],
|
||||
zOffsetValues[j],
|
||||
@@ -796,7 +799,7 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
norm_offset_y,
|
||||
norm_offset_z, imageSampler,
|
||||
expected, 0, &hasDenormals,
|
||||
lod);
|
||||
0);
|
||||
|
||||
float err1 =
|
||||
ABS_ERROR(sRGBmap(resultPtr[0]),
|
||||
@@ -827,13 +830,13 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
|
||||
maxPixel =
|
||||
sample_image_pixel_float(
|
||||
imagePtr, imageInfo,
|
||||
imagePtr, &lodInfo,
|
||||
xOffsetValues[j],
|
||||
yOffsetValues[j],
|
||||
zOffsetValues[j],
|
||||
imageSampler,
|
||||
expected, 0, NULL,
|
||||
lod);
|
||||
0);
|
||||
|
||||
err1 = ABS_ERROR(
|
||||
sRGBmap(resultPtr[0]),
|
||||
@@ -866,7 +869,7 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
shouldReturn |=
|
||||
determine_validation_error_offset<
|
||||
float>(
|
||||
imagePtr, imageInfo,
|
||||
imagePtr, &lodInfo,
|
||||
imageSampler, resultPtr,
|
||||
expected, error,
|
||||
xOffsetValues[j],
|
||||
@@ -876,11 +879,11 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
norm_offset_y,
|
||||
norm_offset_z, j,
|
||||
numTries, numClamped,
|
||||
true, lod);
|
||||
true, 0);
|
||||
log_error("Step by step:\n");
|
||||
FloatPixel temp =
|
||||
sample_image_pixel_float_offset(
|
||||
imagePtr, imageInfo,
|
||||
imagePtr, &lodInfo,
|
||||
xOffsetValues[j],
|
||||
yOffsetValues[j],
|
||||
zOffsetValues[j],
|
||||
@@ -889,7 +892,7 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
norm_offset_z,
|
||||
imageSampler, tempOut,
|
||||
1 /*verbose*/,
|
||||
&hasDenormals, lod);
|
||||
&hasDenormals, 0);
|
||||
log_error(
|
||||
"\tulps: %2.2f, %2.2f, "
|
||||
"%2.2f, %2.2f (max "
|
||||
@@ -986,13 +989,13 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
int hasDenormals = 0;
|
||||
FloatPixel maxPixel =
|
||||
sample_image_pixel_float_offset(
|
||||
imagePtr, imageInfo,
|
||||
imagePtr, &lodInfo,
|
||||
xOffsetValues[j],
|
||||
yOffsetValues[j],
|
||||
zOffsetValues[j], norm_offset_x,
|
||||
norm_offset_y, norm_offset_z,
|
||||
imageSampler, expected, 0,
|
||||
&hasDenormals, lod);
|
||||
&hasDenormals, 0);
|
||||
|
||||
float err1 = ABS_ERROR(resultPtr[0],
|
||||
expected[0]);
|
||||
@@ -1051,7 +1054,7 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
|
||||
maxPixel =
|
||||
sample_image_pixel_float_offset(
|
||||
imagePtr, imageInfo,
|
||||
imagePtr, &lodInfo,
|
||||
xOffsetValues[j],
|
||||
yOffsetValues[j],
|
||||
zOffsetValues[j],
|
||||
@@ -1059,7 +1062,7 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
norm_offset_y,
|
||||
norm_offset_z,
|
||||
imageSampler, expected,
|
||||
0, NULL, lod);
|
||||
0, NULL, 0);
|
||||
|
||||
err1 = ABS_ERROR(resultPtr[0],
|
||||
expected[0]);
|
||||
@@ -1110,7 +1113,7 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
int hasDenormals = 0;
|
||||
FloatPixel maxPixel =
|
||||
sample_image_pixel_float_offset(
|
||||
imagePtr, imageInfo,
|
||||
imagePtr, &lodInfo,
|
||||
xOffsetValues[j],
|
||||
yOffsetValues[j],
|
||||
zOffsetValues[j],
|
||||
@@ -1118,7 +1121,7 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
norm_offset_y,
|
||||
norm_offset_z, imageSampler,
|
||||
expected, 0, &hasDenormals,
|
||||
lod);
|
||||
0);
|
||||
|
||||
float err1 = ABS_ERROR(resultPtr[0],
|
||||
expected[0]);
|
||||
@@ -1157,13 +1160,13 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
|
||||
maxPixel =
|
||||
sample_image_pixel_float(
|
||||
imagePtr, imageInfo,
|
||||
imagePtr, &lodInfo,
|
||||
xOffsetValues[j],
|
||||
yOffsetValues[j],
|
||||
zOffsetValues[j],
|
||||
imageSampler,
|
||||
expected, 0, NULL,
|
||||
lod);
|
||||
0);
|
||||
|
||||
err1 =
|
||||
ABS_ERROR(resultPtr[0],
|
||||
@@ -1196,7 +1199,7 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
shouldReturn |=
|
||||
determine_validation_error_offset<
|
||||
float>(
|
||||
imagePtr, imageInfo,
|
||||
imagePtr, &lodInfo,
|
||||
imageSampler, resultPtr,
|
||||
expected, error,
|
||||
xOffsetValues[j],
|
||||
@@ -1206,11 +1209,11 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
norm_offset_y,
|
||||
norm_offset_z, j,
|
||||
numTries, numClamped,
|
||||
true, lod);
|
||||
true, 0);
|
||||
log_error("Step by step:\n");
|
||||
FloatPixel temp =
|
||||
sample_image_pixel_float_offset(
|
||||
imagePtr, imageInfo,
|
||||
imagePtr, &lodInfo,
|
||||
xOffsetValues[j],
|
||||
yOffsetValues[j],
|
||||
zOffsetValues[j],
|
||||
@@ -1219,7 +1222,7 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
norm_offset_z,
|
||||
imageSampler, tempOut,
|
||||
1 /*verbose*/,
|
||||
&hasDenormals, lod);
|
||||
&hasDenormals, 0);
|
||||
log_error(
|
||||
"\tulps: %2.2f, %2.2f, "
|
||||
"%2.2f, %2.2f (max "
|
||||
@@ -1314,11 +1317,11 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
}
|
||||
|
||||
sample_image_pixel_offset<unsigned int>(
|
||||
imagePtr, imageInfo,
|
||||
imagePtr, &lodInfo,
|
||||
xOffsetValues[j], yOffsetValues[j],
|
||||
zOffsetValues[j], norm_offset_x,
|
||||
norm_offset_y, norm_offset_z,
|
||||
imageSampler, expected, lod);
|
||||
imageSampler, expected, 0);
|
||||
|
||||
error = errMax(
|
||||
errMax(abs_diff_uint(expected[0],
|
||||
@@ -1380,12 +1383,12 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
|
||||
sample_image_pixel_offset<
|
||||
unsigned int>(
|
||||
imagePtr, imageInfo,
|
||||
imagePtr, &lodInfo,
|
||||
xOffsetValues[j],
|
||||
yOffsetValues[j],
|
||||
zOffsetValues[j], norm_offset_x,
|
||||
norm_offset_y, norm_offset_z,
|
||||
imageSampler, expected, lod);
|
||||
imageSampler, expected, 0);
|
||||
|
||||
error = errMax(
|
||||
errMax(
|
||||
@@ -1412,7 +1415,7 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
shouldReturn |=
|
||||
determine_validation_error_offset<
|
||||
unsigned int>(
|
||||
imagePtr, imageInfo,
|
||||
imagePtr, &lodInfo,
|
||||
imageSampler, resultPtr,
|
||||
expected, error,
|
||||
xOffsetValues[j],
|
||||
@@ -1422,7 +1425,7 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
norm_offset_y,
|
||||
norm_offset_z, j,
|
||||
numTries, numClamped,
|
||||
false, lod);
|
||||
false, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1497,11 +1500,11 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
}
|
||||
|
||||
sample_image_pixel_offset<int>(
|
||||
imagePtr, imageInfo,
|
||||
imagePtr, &lodInfo,
|
||||
xOffsetValues[j], yOffsetValues[j],
|
||||
zOffsetValues[j], norm_offset_x,
|
||||
norm_offset_y, norm_offset_z,
|
||||
imageSampler, expected, lod);
|
||||
imageSampler, expected, 0);
|
||||
|
||||
error = errMax(
|
||||
errMax(abs_diff_int(expected[0],
|
||||
@@ -1563,12 +1566,12 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
}
|
||||
|
||||
sample_image_pixel_offset<int>(
|
||||
imagePtr, imageInfo,
|
||||
imagePtr, &lodInfo,
|
||||
xOffsetValues[j],
|
||||
yOffsetValues[j],
|
||||
zOffsetValues[j], norm_offset_x,
|
||||
norm_offset_y, norm_offset_z,
|
||||
imageSampler, expected, lod);
|
||||
imageSampler, expected, 0);
|
||||
|
||||
error = errMax(
|
||||
errMax(
|
||||
@@ -1594,7 +1597,7 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
shouldReturn |=
|
||||
determine_validation_error_offset<
|
||||
int>(
|
||||
imagePtr, imageInfo,
|
||||
imagePtr, &lodInfo,
|
||||
imageSampler, resultPtr,
|
||||
expected, error,
|
||||
xOffsetValues[j],
|
||||
@@ -1604,7 +1607,7 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
norm_offset_y,
|
||||
norm_offset_z, j,
|
||||
numTries, numClamped,
|
||||
false, lod);
|
||||
false, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1626,17 +1629,23 @@ int test_read_image(cl_context context, cl_command_queue queue,
|
||||
}
|
||||
}
|
||||
{
|
||||
nextLevelOffset += width_lod * height_lod * depth_lod
|
||||
* get_pixel_size(imageInfo->format);
|
||||
width_lod = (width_lod >> 1) ? (width_lod >> 1) : 1;
|
||||
nextLevelOffset +=
|
||||
image_lod_size * get_pixel_size(imageInfo->format);
|
||||
width_lod = lodInfo.width =
|
||||
(lodInfo.width >> 1) ? (lodInfo.width >> 1) : 1;
|
||||
if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
|
||||
{
|
||||
height_lod = (height_lod >> 1) ? (height_lod >> 1) : 1;
|
||||
}
|
||||
height_lod = lodInfo.height =
|
||||
(lodInfo.height >> 1) ? (lodInfo.height >> 1) : 1;
|
||||
if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
|
||||
{
|
||||
depth_lod = (depth_lod >> 1) ? (depth_lod >> 1) : 1;
|
||||
}
|
||||
depth_lod = lodInfo.depth =
|
||||
(lodInfo.depth >> 1) ? (lodInfo.depth >> 1) : 1;
|
||||
lodInfo.rowPitch =
|
||||
lodInfo.width * get_pixel_size(imageInfo->format);
|
||||
if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
|
||||
lodInfo.slicePitch = lodInfo.rowPitch;
|
||||
else if (imageInfo->type == CL_MEM_OBJECT_IMAGE3D
|
||||
|| imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
|
||||
lodInfo.slicePitch = lodInfo.rowPitch * lodInfo.height;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user