diff --git a/test_common/harness/imageHelpers.cpp b/test_common/harness/imageHelpers.cpp index f1694e88..b95447a6 100644 --- a/test_common/harness/imageHelpers.cpp +++ b/test_common/harness/imageHelpers.cpp @@ -28,6 +28,41 @@ #if !defined(_WIN32) #include #endif +#ifdef __SSE2__ +#include +#if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 12) \ + || (defined(__clang__) && !defined(__apple_build_version__) \ + && __clang_major__ < 8) +// Add missing intrinsics that aren't in ancient compilers, but are used below +#ifdef __clang__ +#define NODEBUG nodebug +#else +#define NODEBUG artificial +#endif +static inline __attribute__((always_inline, NODEBUG)) __m128i +_mm_loadu_si16(const void *p) +{ + struct _loadu_si16 + { + short v; + } __attribute__((packed, may_alias)); + short u = ((const struct _loadu_si16 *)p)->v; + return _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, u); +} + +static inline __attribute__((always_inline, NODEBUG)) __m128i +_mm_loadu_si32(const void *p) +{ + struct _loadu_si32 + { + int v; + } __attribute__((packed, may_alias)); + int u = ((const struct _loadu_si32 *)p)->v; + return _mm_set_epi32(0, 0, 0, u); +} +#undef NODEBUG +#endif +#endif RoundingMode gFloatToHalfRoundingMode = kDefaultRoundingMode; @@ -68,6 +103,300 @@ double sRGBunmap(float fc) return result; } +// Precalculated table of linear encodings of sRGB values +float gSRGBTbl[] = { + 0x0.000000p+00f, 0x1.3e4569p-12f, 0x1.3e4569p-11f, 0x1.dd681cp-11f, + 0x1.3e4569p-10f, 0x1.8dd6c2p-10f, 0x1.dd681cp-10f, 0x1.167cbbp-09f, + 0x1.3e4569p-09f, 0x1.660e15p-09f, 0x1.8dd6c2p-09f, 0x1.b6a31cp-09f, + 0x1.e1e31ep-09f, 0x1.07c38cp-08f, 0x1.1fcc2cp-08f, 0x1.390ffbp-08f, + 0x1.53936ep-08f, 0x1.6f5adfp-08f, 0x1.8c6a96p-08f, 0x1.aac6c3p-08f, + 0x1.ca7383p-08f, 0x1.eb74e3p-08f, 0x1.06e76cp-07f, 0x1.18c2a6p-07f, + 0x1.2b4e0ap-07f, 0x1.3e8b7cp-07f, 0x1.527cd7p-07f, 0x1.6723efp-07f, + 0x1.7c8293p-07f, 0x1.929a89p-07f, 0x1.a96d92p-07f, 0x1.c0fd67p-07f, + 0x1.d94bc1p-07f, 0x1.f25a47p-07f, 0x1.061553p-06f, 0x1.135f40p-06f, + 0x1.210bbap-06f, 0x1.2f1b8ep-06f, 0x1.3d8f85p-06f, 0x1.4c6868p-06f, + 0x1.5ba6fcp-06f, 0x1.6b4c05p-06f, 0x1.7b5843p-06f, 0x1.8bcc76p-06f, + 0x1.9ca95ap-06f, 0x1.adefabp-06f, 0x1.bfa021p-06f, 0x1.d1bb75p-06f, + 0x1.e4425ap-06f, 0x1.f73586p-06f, 0x1.054ad5p-05f, 0x1.0f31bbp-05f, + 0x1.194fccp-05f, 0x1.23a55fp-05f, 0x1.2e32c9p-05f, 0x1.38f860p-05f, + 0x1.43f678p-05f, 0x1.4f2d63p-05f, 0x1.5a9d76p-05f, 0x1.664701p-05f, + 0x1.722a57p-05f, 0x1.7e47c8p-05f, 0x1.8a9fa3p-05f, 0x1.973239p-05f, + 0x1.a3ffdcp-05f, 0x1.b108d3p-05f, 0x1.be4d6fp-05f, 0x1.cbcdfdp-05f, + 0x1.d98ac9p-05f, 0x1.e78420p-05f, 0x1.f5ba4cp-05f, 0x1.0216ccp-04f, + 0x1.096f28p-04f, 0x1.10e65ep-04f, 0x1.187c92p-04f, 0x1.2031eap-04f, + 0x1.280689p-04f, 0x1.2ffa93p-04f, 0x1.380e2bp-04f, 0x1.404176p-04f, + 0x1.489496p-04f, 0x1.5107aep-04f, 0x1.599ae0p-04f, 0x1.624e50p-04f, + 0x1.6b2220p-04f, 0x1.741672p-04f, 0x1.7d2b67p-04f, 0x1.866121p-04f, + 0x1.8fb7c1p-04f, 0x1.992f6ap-04f, 0x1.a2c83cp-04f, 0x1.ac8257p-04f, + 0x1.b65dddp-04f, 0x1.c05aeep-04f, 0x1.ca79aap-04f, 0x1.d4ba31p-04f, + 0x1.df1ca4p-04f, 0x1.e9a122p-04f, 0x1.f447cap-04f, 0x1.ff10bdp-04f, + 0x1.04fe0dp-03f, 0x1.0a84ffp-03f, 0x1.101d45p-03f, 0x1.15c6eep-03f, + 0x1.1b8209p-03f, 0x1.214ea6p-03f, 0x1.272cd4p-03f, 0x1.2d1ca2p-03f, + 0x1.331e1fp-03f, 0x1.393159p-03f, 0x1.3f5660p-03f, 0x1.458d43p-03f, + 0x1.4bd60fp-03f, 0x1.5230d4p-03f, 0x1.589da1p-03f, 0x1.5f1c83p-03f, + 0x1.65ad8ap-03f, 0x1.6c50c3p-03f, 0x1.73063ep-03f, 0x1.79ce07p-03f, + 0x1.80a82ep-03f, 0x1.8794c0p-03f, 0x1.8e93cbp-03f, 0x1.95a55ep-03f, + 0x1.9cc987p-03f, 0x1.a40052p-03f, 0x1.ab49cfp-03f, 0x1.b2a60ap-03f, + 0x1.ba1516p-03f, 0x1.c196f7p-03f, 0x1.c92bc1p-03f, 0x1.d0d380p-03f, + 0x1.d88e41p-03f, 0x1.e05c12p-03f, 0x1.e83d00p-03f, 0x1.f0311ap-03f, + 0x1.f8386ap-03f, 0x1.002980p-02f, 0x1.044074p-02f, 0x1.086118p-02f, + 0x1.0c8b72p-02f, 0x1.10bf88p-02f, 0x1.14fd61p-02f, 0x1.194504p-02f, + 0x1.1d9677p-02f, 0x1.21f1c0p-02f, 0x1.2656e6p-02f, 0x1.2ac5efp-02f, + 0x1.2f3ee1p-02f, 0x1.33c1c3p-02f, 0x1.384e9ap-02f, 0x1.3ce56ep-02f, + 0x1.418644p-02f, 0x1.463122p-02f, 0x1.4ae610p-02f, 0x1.4fa512p-02f, + 0x1.546e2fp-02f, 0x1.59416dp-02f, 0x1.5e1ed2p-02f, 0x1.630665p-02f, + 0x1.67f82bp-02f, 0x1.6cf42bp-02f, 0x1.71fa69p-02f, 0x1.770aedp-02f, + 0x1.7c25bdp-02f, 0x1.814addp-02f, 0x1.867a55p-02f, 0x1.8bb42ap-02f, + 0x1.90f862p-02f, 0x1.964703p-02f, 0x1.9ba012p-02f, 0x1.a10396p-02f, + 0x1.a67194p-02f, 0x1.abea12p-02f, 0x1.b16d16p-02f, 0x1.b6faa6p-02f, + 0x1.bc92c7p-02f, 0x1.c2357fp-02f, 0x1.c7e2d4p-02f, 0x1.cd9acbp-02f, + 0x1.d35d6bp-02f, 0x1.d92ab8p-02f, 0x1.df02b9p-02f, 0x1.e4e572p-02f, + 0x1.ead2ebp-02f, 0x1.f0cb27p-02f, 0x1.f6ce2dp-02f, 0x1.fcdc02p-02f, + 0x1.017a56p-01f, 0x1.048c18p-01f, 0x1.07a34bp-01f, 0x1.0abfefp-01f, + 0x1.0de209p-01f, 0x1.11099cp-01f, 0x1.1436a9p-01f, 0x1.176933p-01f, + 0x1.1aa13ep-01f, 0x1.1ddecbp-01f, 0x1.2121dfp-01f, 0x1.246a7ap-01f, + 0x1.27b8a0p-01f, 0x1.2b0c54p-01f, 0x1.2e6598p-01f, 0x1.31c46fp-01f, + 0x1.3528dcp-01f, 0x1.3892e1p-01f, 0x1.3c0280p-01f, 0x1.3f77bdp-01f, + 0x1.42f29ap-01f, 0x1.46731ap-01f, 0x1.49f93ep-01f, 0x1.4d850bp-01f, + 0x1.511682p-01f, 0x1.54ada5p-01f, 0x1.584a79p-01f, 0x1.5becfep-01f, + 0x1.5f9538p-01f, 0x1.634329p-01f, 0x1.66f6d4p-01f, 0x1.6ab03bp-01f, + 0x1.6e6f61p-01f, 0x1.723449p-01f, 0x1.75fef4p-01f, 0x1.79cf65p-01f, + 0x1.7da59fp-01f, 0x1.8181a5p-01f, 0x1.856378p-01f, 0x1.894b1cp-01f, + 0x1.8d3892p-01f, 0x1.912bdep-01f, 0x1.952501p-01f, 0x1.9923ffp-01f, + 0x1.9d28d9p-01f, 0x1.a13392p-01f, 0x1.a5442dp-01f, 0x1.a95aacp-01f, + 0x1.ad7711p-01f, 0x1.b1995fp-01f, 0x1.b5c198p-01f, 0x1.b9efbep-01f, + 0x1.be23d5p-01f, 0x1.c25ddep-01f, 0x1.c69ddcp-01f, 0x1.cae3d1p-01f, + 0x1.cf2fc0p-01f, 0x1.d381abp-01f, 0x1.d7d994p-01f, 0x1.dc377ep-01f, + 0x1.e09b6bp-01f, 0x1.e5055dp-01f, 0x1.e97557p-01f, 0x1.edeb5bp-01f, + 0x1.f2676cp-01f, 0x1.f6e98bp-01f, 0x1.fb71bcp-01f, 0x1.000000p+00f, +}; + + +float sRGBunmap(cl_uchar ic) { return gSRGBTbl[ic]; } + +#ifdef __SSE2__ +__m128 sRGBunmap(const __m128i ic) +{ + static const float recip_255 = 1.0f / 255.0f; +#ifdef __AVX2__ + __m128 fc = _mm_i32gather_ps(gSRGBTbl, _mm_cvtepu8_epi32(ic), 4); + // only RGB need to be converted for sRGBA + return _mm_insert_ps( + fc, + _mm_mul_ss(_mm_cvtsi32_ss(_mm_undefined_ps(), _mm_extract_epi8(ic, 3)), + _mm_set_ss(recip_255)), + _MM_MK_INSERTPS_NDX(0, 3, 0)); +#else + // With no gather support, we'll need to load the four components + // separately... + uint32_t pixel = _mm_cvtsi128_si32(ic); + return _mm_set_ps((float)(pixel >> 24) * recip_255, + sRGBunmap((cl_uchar)((pixel >> 16) & 0xFF)), + sRGBunmap((cl_uchar)((pixel >> 8) & 0xFF)), + sRGBunmap((cl_uchar)(pixel & 0xFF))); +#endif +} + + +#ifdef __SSE4_1__ +#define SELECT_I(cond, a, b) _mm_blendv_epi8(b, a, cond) +#define TEST_ANY_ZERO(v) !_mm_test_all_ones(v) +#define TEST_NONZERO(v) !_mm_test_all_zeros(v, v) +#define TEST_ZERO(v) _mm_test_all_zeros(v, v) +#elif defined(__SSE2__) +// n.b. "ANDNOT" is ~A & B, not A & ~B!! +#define SELECT_I(cond, a, b) \ + _mm_or_si128(_mm_and_si128(cond, a), _mm_andnot_si128(cond, b)) +#ifdef __x86_64 +#define TEST_NONZERO(v) \ + (_mm_cvtsi128_si64(v) \ + || _mm_cvtsi128_si64(_mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3)))) +#else +#define TEST_NONZERO(v) \ + (_mm_cvtsi128_si32(v) \ + || _mm_cvtsi128_si32( \ + _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, 1)) \ + || _mm_cvtsi128_si32(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, 2)))) \ + || _mm_cvtsi128_si32(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, 3)))) +#endif +#define TEST_ZERO(v) (!TEST_NONZERO(v)) +// The int64 extraction trick won't work here... :/ +#define TEST_ANY_ZERO(v) \ + (!_mm_cvtsi128_si32(v) \ + || !_mm_cvtsi128_si32( \ + _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, 1)) \ + || !_mm_cvtsi128_si32(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, 2)))) \ + || !_mm_cvtsi128_si32(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, 3)))) +#endif + +#ifdef __GNUC__ +#define __forceinline __attribute__((always_inline)) +#endif + +static inline __m128i __forceinline _mm_setmone_si128(void) +{ + __m128i junk = _mm_undefined_si128(); + return _mm_cmpeq_epi32(junk, junk); +} + +static inline __m128 cl_half_to_float(__m128i h) +{ +#ifdef __F16C__ + return _mm_cvtph_ps(h); +#else + // Type-punning to get direct access to underlying bits + union { + __m128 f; + __m128i i; + } f32; + + __m128i zero = _mm_setzero_si128(); + __m128i negOne = _mm_setmone_si128(); + __m128i one = _mm_srli_epi32(negOne, 31); + __m128i h_exp_mask = _mm_srli_epi16(negOne, CL_HALF_MANT_DIG); // = 0x1f + + // Extract sign bit + __m128i sign = + _mm_slli_epi32(_mm_unpacklo_epi16(_mm_srli_epi16(h, 15), zero), 31); + + // Extract FP16 exponent and mantissa + __m128i h_exp = + _mm_and_si128(_mm_srli_epi16(h, CL_HALF_MANT_DIG - 1), h_exp_mask); + __m128i h_mant = _mm_and_si128(h, _mm_srli_epi16(negOne, 6) /* 0x3ff */); + + // Remove FP16 exponent bias and convert to int32 + __m128i exp = _mm_sub_epi16( + h_exp, + _mm_srli_epi16(negOne, CL_HALF_MANT_DIG + 1) /* CL_HALF_MAX_EXP - 1 */); +#ifdef __SSE4_1__ + exp = _mm_cvtepi16_epi32(exp); +#else + exp = _mm_unpacklo_epi16(exp, _mm_cmpgt_epi16(zero, exp)); +#endif + + // Add FP32 exponent bias + __m128i f_exp = _mm_add_epi32( + exp, + _mm_srli_epi32(negOne, CL_FLT_MANT_DIG + 1) /* CL_FLT_MAX_EXP - 1 */); + + // Convert mantissa to the 32-bit form + __m128i f_mant = _mm_slli_epi32(_mm_unpacklo_epi16(h_mant, zero), + CL_FLT_MANT_DIG - CL_HALF_MANT_DIG); + + // Note that due to the way SIMD works, we can't have branches--we have to + // compute all the possible values. + + // Check for NaN / infinity + __m128i inf_mask = _mm_cmpeq_epi16(h_exp, h_exp_mask); + inf_mask = _mm_unpacklo_epi16(inf_mask, inf_mask); + __m128i mant_zero_mask = _mm_cmpeq_epi32(f_mant, zero); + // n.b. "ANDNOT" is ~A & B, not A & ~B!! + __m128i nan_mask = _mm_andnot_si128(mant_zero_mask, inf_mask); + + // NaN -> propagate mantissa and silence it + __m128i f_mant_nan = + _mm_or_si128(f_mant, _mm_slli_epi32(one, 22) /* 0x400000 */); + // Infinity -> zero mantissa + f_mant = SELECT_I(nan_mask, f_mant_nan, f_mant); + f_exp = SELECT_I(inf_mask, + _mm_srli_epi32(negOne, CL_FLT_MANT_DIG) /* 0xff */, f_exp); + + // Check for zero / denormal + __m128i exp_zero_mask = _mm_cmpeq_epi16(h_exp, zero); + exp_zero_mask = _mm_unpacklo_epi16(exp_zero_mask, exp_zero_mask); + __m128i zero_mask = _mm_and_si128(mant_zero_mask, exp_zero_mask); + // n.b. "ANDNOT" is ~A & B, not A & ~B!! + __m128i denorm_mask = _mm_andnot_si128(mant_zero_mask, exp_zero_mask); + + if (TEST_NONZERO(denorm_mask)) + { + // Denormal -> normalize it + // - Shift mantissa to make most-significant 1 implicit + // - Adjust exponent accordingly + __m128i f_mant_mask = + _mm_srli_epi32(negOne, 32 - (CL_FLT_MANT_DIG - 1)); +#if defined(__AVX512VL__) && defined(__AVX512DQ__) + // We'll probably never get here, since most CPUs that support AVX-512 + // also support F16C. n.b. No +1 yet for the implicit 1 before the radix + // point--we really do want to shift at least one place + __m128i shift = + _mm_and_si128(_mm_sub_epi32(_mm_lzcnt_epi32(f_mant), + _mm_set1_epi32(32 - CL_FLT_MANT_DIG)), + denorm_mask); + f_mant = _mm_sllv_epi32(f_mant, shift); + shift = _mm_sub_epi32(shift, _mm_and_si128(one, denorm_mask)); +#else + // No packed leading-zero count until AVX-512... gotta do this the hard + // way + __m128i shift = zero; + __m128i shift_mask = _mm_cmpgt_epi32( + f_mant, _mm_srli_epi32(negOne, 16) /* 0x0000FFFF */); + __m128i f_mant_shift = + SELECT_I(shift_mask, f_mant, _mm_slli_epi32(f_mant, 16)); + // n.b. "ANDNOT" is ~A & B, not A & ~B!! + shift = _mm_add_epi32( + shift, + _mm_andnot_si128(shift_mask, _mm_slli_epi32(one, 4) /* 16 */)); + // Starting from here, we also need to check that mant >= 0, because + // PCMPGT does a signed comparison; unsigned comparisons weren't added + // until AVX-512, which also already has a faster way to count leading + // zeroes anyway + shift_mask = _mm_or_si128( + _mm_cmplt_epi32(f_mant_shift, zero), + _mm_cmpgt_epi32(f_mant_shift, + _mm_srli_epi32(negOne, 8) /* 0x00FFFFFF */)); + f_mant_shift = + SELECT_I(shift_mask, f_mant_shift, _mm_slli_epi32(f_mant_shift, 8)); + shift = _mm_add_epi32( + shift, + _mm_andnot_si128(shift_mask, _mm_slli_epi32(one, 3) /* 8 */)); + shift_mask = _mm_or_si128( + _mm_cmplt_epi32(f_mant_shift, zero), + _mm_cmpgt_epi32(f_mant_shift, + _mm_srli_epi32(negOne, 4) /* 0x0FFFFFFF */)); + f_mant_shift = + SELECT_I(shift_mask, f_mant_shift, _mm_slli_epi32(f_mant_shift, 4)); + shift = _mm_add_epi32( + shift, + _mm_andnot_si128(shift_mask, _mm_slli_epi32(one, 2) /* 4 */)); + shift_mask = _mm_or_si128( + _mm_cmplt_epi32(f_mant_shift, zero), + _mm_cmpgt_epi32(f_mant_shift, + _mm_srli_epi32(negOne, 2) /* 0x3FFFFFFF */)); + f_mant_shift = + SELECT_I(shift_mask, f_mant_shift, _mm_slli_epi32(f_mant_shift, 2)); + shift = _mm_add_epi32( + shift, + _mm_andnot_si128(shift_mask, _mm_slli_epi32(one, 1) /* 2 */)); + shift_mask = _mm_or_si128( + _mm_cmplt_epi32(f_mant_shift, zero), + _mm_cmpgt_epi32(f_mant_shift, + _mm_srli_epi32(negOne, 1) /* 0x7FFFFFFF */)); + f_mant_shift = + SELECT_I(shift_mask, f_mant_shift, _mm_slli_epi32(f_mant_shift, 1)); + shift = _mm_add_epi32(shift, _mm_andnot_si128(shift_mask, one)); + f_mant = SELECT_I(denorm_mask, + _mm_srli_epi32(f_mant_shift, 32 - CL_FLT_MANT_DIG), + f_mant); + shift = _mm_and_si128( + _mm_sub_epi32(shift, _mm_set1_epi32(32 - CL_FLT_MANT_DIG + 1)), + denorm_mask); +#endif + f_mant = _mm_and_si128(f_mant, f_mant_mask); + f_exp = _mm_sub_epi32(f_exp, shift); + } + + // Zero -> zero exponent + // n.b. "ANDNOT" is ~A & B, not A & ~B!! + f_exp = _mm_andnot_si128(zero_mask, f_exp); + + f32.i = _mm_or_si128( + sign, _mm_or_si128(_mm_slli_epi32(f_exp, CL_FLT_MANT_DIG - 1), f_mant)); + return f32.f; +#endif +} +#endif + uint32_t get_format_type_size(const cl_image_format *format) { @@ -529,7 +858,134 @@ int random_log_in_range(int minV, int maxV, MTdata d) } +#ifdef __SSE2__ +static inline __m128i vifloorf(__m128 f) +{ +#ifdef __SSE4_1__ + return _mm_cvtps_epi32( + _mm_round_ps(f, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); +#else + // No packed rounding until SSE4... do this the old-fashioned way + unsigned int mxcsr = _mm_getcsr(); + _mm_setcsr(mxcsr & ~_MM_ROUND_MASK | _MM_ROUND_DOWN); + __m128i i = _mm_cvtps_epi32(f); + _mm_setcsr(mxcsr); + return i; +#endif +} + +static inline __m128 vfloorf(__m128 f) +{ +#ifdef __SSE4_1__ + return _mm_round_ps(f, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +#else + // No packed rounding until SSE4... do this the old-fashioned way + unsigned int mxcsr = _mm_getcsr(); + _mm_setcsr(mxcsr & ~_MM_ROUND_MASK | _MM_ROUND_DOWN); + f = _mm_cvtepi32_ps(_mm_cvtps_epi32(f)); + _mm_setcsr(mxcsr); + return f; +#endif +} + +static inline __m128 frac(__m128 a) { return _mm_sub_ps(a, vfloorf(a)); } +#else +static inline float frac(float a) { return a - floorf(a); } +#endif + // Define the addressing functions +#ifdef __SSE2__ +typedef __m128i (*AddressFn)(__m128i value, __m128i maxValue); + +__m128i NoAddressFn(__m128i value, __m128i maxValue) { return value; } +__m128i RepeatAddressFn(__m128i value, __m128i maxValue) +{ + __m128i minMask = _mm_cmplt_epi32(value, _mm_setzero_si128()); + __m128i maxMask = + _mm_cmpgt_epi32(value, _mm_add_epi32(maxValue, _mm_setmone_si128())); + return SELECT_I(minMask, _mm_add_epi32(value, maxValue), + SELECT_I(maxMask, _mm_sub_epi32(value, maxValue), value)); +} +__m128i MirroredRepeatAddressFn(__m128i value, __m128i maxValue) +{ +#ifdef __SSE4_1__ + return _mm_max_epi32( + _mm_min_epi32(value, _mm_add_epi32(maxValue, _mm_setmone_si128())), + _mm_setzero_si128()); +#else + __m128i zero = _mm_setzero_si128(); + maxValue = _mm_add_epi32(maxValue, _mm_setmone_si128()); + __m128i minMask = _mm_cmplt_epi32(value, zero); + __m128i maxMask = _mm_cmpgt_epi32(value, maxValue); + return SELECT_I(minMask, zero, SELECT_I(maxMask, maxValue, value)); +#endif +} +__m128i ClampAddressFn(__m128i value, __m128i maxValue) +{ + __m128i negOne = _mm_cmpgt_epi32(maxValue, _mm_setzero_si128()); +#ifdef __SSE4_1__ + return _mm_max_epi32(_mm_min_epi32(value, maxValue), negOne); +#else + __m128i minMask = _mm_cmplt_epi32(value, negOne); + __m128i maxMask = _mm_cmpgt_epi32(value, maxValue); + return SELECT_I(minMask, negOne, SELECT_I(maxMask, maxValue, value)); +#endif +} +__m128i ClampToEdgeNearestFn(__m128i value, __m128i maxValue) +{ +#ifdef __SSE4_1__ + return _mm_max_epi32( + _mm_min_epi32(value, _mm_add_epi32(maxValue, _mm_setmone_si128())), + _mm_setzero_si128()); +#else + __m128i zero = _mm_setzero_si128(); + maxValue = _mm_add_epi32(maxValue, _mm_setmone_si128()); + __m128i minMask = _mm_cmplt_epi32(value, zero); + __m128i maxMask = _mm_cmpgt_epi32(value, maxValue); + return SELECT_I(minMask, zero, SELECT_I(maxMask, maxValue, value)); +#endif +} +AddressFn ClampToEdgeLinearFn = ClampToEdgeNearestFn; + +// Note: normalized coords get repeated in normalized space, not unnormalized +// space! hence the special case here +__m128 RepeatNormalizedAddressFn(__m128 fValue, __m128i maxValue) +{ + return _mm_mul_ps(frac(fValue), _mm_cvtepi32_ps(maxValue)); +} + +static inline __m128 vrintf(__m128 f) +{ +#ifdef __SSE4_1__ + return _mm_round_ps(f, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +#else + // No packed rounding until SSE4... do this the old-fashioned way + return _mm_cvtepi32_ps(_mm_cvtps_epi32(f)); +#endif +} + +static inline __m128 vfabsf(__m128 f) +{ + return _mm_andnot_ps(_mm_castsi128_ps(_mm_slli_epi32(_mm_setmone_si128(), + 31) /* 0x80000000 */), + f); +} + +__m128 MirroredRepeatNormalizedAddressFn(__m128 fValue, __m128i maxValue) +{ + // Round to nearest multiple of two. + // Note halfway values flip flop here due to rte, but they both end up + // pointing the same place at the end of the day. + __m128 s_prime = vrintf(_mm_mul_ps(fValue, _mm_set1_ps(0.5f))); + s_prime = _mm_add_ps(s_prime, s_prime); + + // Reduce to [-1, 1], Apply mirroring -> [0, 1] + s_prime = vfabsf(_mm_sub_ps(fValue, s_prime)); + + // un-normalize + return _mm_mul_ps(s_prime, _mm_cvtepi32_ps(maxValue)); +} +#else typedef int (*AddressFn)(int value, size_t maxValue); int NoAddressFn(int value, size_t maxValue) { return value; } @@ -569,7 +1025,7 @@ float RepeatNormalizedAddressFn(float fValue, size_t maxValue) { #ifndef _MSC_VER // Use original if not the VS compiler. // General computation for repeat - return (fValue - floorf(fValue)) * (float)maxValue; // Reduce to [0, 1.f] + return frac(fValue) * (float)maxValue; // Reduce to [0, 1.f] #else // Otherwise, use this instead: // Home the subtraction to a float to break up the sequence of x87 // instructions emitted by the VS compiler. @@ -591,6 +1047,7 @@ float MirroredRepeatNormalizedAddressFn(float fValue, size_t maxValue) // un-normalize return s_prime * (float)maxValue; } +#endif struct AddressingTable { @@ -1316,8 +1773,705 @@ char *generate_random_image_data(image_descriptor *imageInfo, return data; } -#define CLAMP_FLOAT(v) (fmaxf(fminf(v, 1.f), -1.f)) +#ifdef __SSE2__ +#define CLAMP_FLOAT_V(v) \ + (_mm_max_ps(_mm_min_ps(v, _mm_set1_ps(1.f)), _mm_set1_ps(-1.f))) +#define CLAMP_FLOAT(v) \ + (_mm_max_ss(_mm_min_ss(v, _mm_set_ss(1.f)), _mm_set_ss(-1.f))) +#ifdef __SSE4_1__ +#define SET_ALPHA_1(v) \ + (_mm_insert_ps(v, _mm_set_ss(1.f), _MM_MK_INSERTPS_NDX(0, 3, 0))) +#define SELECT_F(cond, a, b) _mm_blendv_ps(b, a, cond) +#define EXTRACT_I(v, i) _mm_extract_epi32(v, i) +#ifdef __x86_64 +#define EXTRACT_I64(v, i) _mm_extract_epi64(v, i) +#endif +#else +#define SET_ALPHA_1(v) (_mm_movelh_ps(v, _mm_set_ps(0.f, 0.f, 1.f, 0.f))) +// n.b. "ANDNOT" is ~A & B, not A & ~B!! +#define SELECT_F(cond, a, b) \ + _mm_or_ps(_mm_and_ps(cond, a), _mm_andnot_ps(cond, b)) +#define EXTRACT_I(v, i) \ + _mm_cvtsi128_si32(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, i))) +#ifdef __x86_64 +#define EXTRACT_I64(v, i) \ + _mm_cvtsi128_si64(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 2 * i + 1, 2 * i))) +#endif +#endif + +static __m128 read_image_pixel_float(void *imageData, + image_descriptor *imageInfo, __m128i coord, + int lod) +{ + size_t width_lod = imageInfo->width, height_lod = imageInfo->height, + depth_lod = imageInfo->depth; + size_t slice_pitch_lod = 0, row_pitch_lod = 0; + + if (imageInfo->num_mip_levels > 1) + { + switch (imageInfo->type) + { + case CL_MEM_OBJECT_IMAGE3D: + depth_lod = + (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1; + case CL_MEM_OBJECT_IMAGE2D: + case CL_MEM_OBJECT_IMAGE2D_ARRAY: + height_lod = + (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1; + default: + width_lod = + (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1; + } + row_pitch_lod = width_lod * get_pixel_size(imageInfo->format); + if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) + slice_pitch_lod = row_pitch_lod; + else if (imageInfo->type == CL_MEM_OBJECT_IMAGE3D + || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY) + slice_pitch_lod = row_pitch_lod * height_lod; + } + else + { + row_pitch_lod = imageInfo->rowPitch; + slice_pitch_lod = imageInfo->slicePitch; + } + + __m128i extent = _mm_set_epi32( + 0, + imageInfo->arraySize != 0 ? (int)imageInfo->arraySize : (int)depth_lod, + (int)height_lod, (int)width_lod); + __m128i zero = _mm_setzero_si128(); + __m128i minMask = _mm_cmplt_epi32(coord, zero); + __m128i maxMask = + _mm_cmpgt_epi32(coord, _mm_add_epi32(extent, _mm_setmone_si128())); + __m128i extentMask = _mm_cmpeq_epi32(extent, zero); + __m128i boundsMask = + _mm_or_si128(_mm_andnot_si128(extentMask, maxMask), minMask); + if (TEST_NONZERO(boundsMask)) + return has_alpha(imageInfo->format) ? _mm_setzero_ps() + : _mm_set_ps(1.f, 0.f, 0.f, 0.f); + + const cl_image_format *format = imageInfo->format; + + __m128 tempData; + // Predeclare a bunch of reciprocal constants so GCC doesn't use expensive + // divisions to compute them in our code + static const float recip_31 = 1.0f / 31.0f; + static const float recip_63 = 1.0f / 63.0f; + static const float recip_127 = 1.0f / 127.0f; + static const float recip_255 = 1.0f / 255.0f; + static const float recip_1023 = 1.0f / 1023.0f; + static const float recip_32767 = 1.0f / 32767.0f; + static const float recip_65535 = 1.0f / 65535.0f; + + // Advance to the right spot + char *ptr = (char *)imageData; + size_t pixelSize = get_pixel_size(format); + __m128i pitch_lod = _mm_set_epi32(0, (int)slice_pitch_lod, + (int)row_pitch_lod, (int)pixelSize); + + __m128i offsetA = _mm_mul_epu32(coord, pitch_lod); + __m128i offsetB = + _mm_mul_epu32(_mm_shuffle_epi32(coord, _MM_SHUFFLE(2, 3, 0, 1)), + _mm_shuffle_epi32(pitch_lod, _MM_SHUFFLE(2, 3, 0, 1))); +#ifdef __x86_64 + ptr += EXTRACT_I64(offsetB, 0) + EXTRACT_I64(offsetA, 1) + + EXTRACT_I64(offsetA, 0); +#else + // Using PHADDD doesn't gain us much... + ptr += + EXTRACT_I(offsetB, 0) + EXTRACT_I(offsetA, 2) + EXTRACT_I(offsetA, 0); +#endif + + // OpenCL only supports reading floats from certain formats + size_t channelCount = get_format_channel_count(format); + switch (format->image_channel_data_type) + { + case CL_SNORM_INT8: { + cl_char *dPtr = (cl_char *)ptr; + __m128i pixel; + switch (channelCount) + { + case 1: + tempData = SET_ALPHA_1(CLAMP_FLOAT( + _mm_mul_ss(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0]), + _mm_set_ss(recip_127)))); + break; + case 2: + pixel = _mm_insert_epi16(_mm_loadu_si16(ptr), 0x7F00, 1); + break; + case 3: + pixel = _mm_insert_epi16(_mm_loadu_si16(ptr), + 0x7F00 | dPtr[2], 1); + break; + case 4: pixel = _mm_loadu_si32(ptr); break; + } + if (channelCount != 1) + { +#ifdef __SSE4_1__ + tempData = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(pixel)); +#else + __m128i signMask; + signMask = _mm_cmpgt_epi8(_mm_setzero_si128(), pixel); + pixel = _mm_unpacklo_epi8(pixel, signMask); + signMask = _mm_unpacklo_epi8(signMask, signMask); + tempData = _mm_cvtepi32_ps(_mm_unpacklo_epi16(pixel, signMask)); +#endif + tempData = + CLAMP_FLOAT_V(_mm_mul_ps(tempData, _mm_set1_ps(recip_127))); + } + break; + } + + case CL_UNORM_INT8: { + unsigned char *dPtr = (unsigned char *)ptr; + __m128i pixel; + switch (channelCount) + { + case 1: + if ((is_sRGBA_order( + imageInfo->format->image_channel_order))) + tempData = SET_ALPHA_1(_mm_set_ss(sRGBunmap(dPtr[0]))); + else + tempData = SET_ALPHA_1(_mm_mul_ss( + _mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0]), + _mm_set_ss(recip_255))); + break; + case 2: + pixel = _mm_insert_epi16(_mm_loadu_si16(ptr), 0xFF00, 1); + break; + case 3: + pixel = _mm_insert_epi16(_mm_loadu_si16(ptr), + 0xFF00 | dPtr[2], 1); + break; + case 4: pixel = _mm_loadu_si32(ptr); +#ifdef CL_1RGB_APPLE + if (format->image_channel_order == CL_1RGB_APPLE) +#ifdef __SSE4_1__ + pixel = _mm_insert_epi8(pixel, 0xFF, 0); +#else + pixel = + _mm_or_si128(pixel, + _mm_bsrli_si128(_mm_setmone_si128(), + 15) /* 0x000000FF */); +#endif +#endif +#ifdef CL_BGR1_APPLE + if (format->image_channel_order == CL_BGR1_APPLE) +#ifdef __SSE4_1__ + pixel = _mm_insert_epi8(pixel, 0xFF, 3); +#else + pixel = _mm_or_si128( + pixel, + _mm_bslli_si128( + _mm_bsrli_si128(_mm_setmone_si128(), 15), + 3) /* 0xFF000000 */); +#endif +#endif + break; + } + if (channelCount != 1) + { + if (is_sRGBA_order(imageInfo->format->image_channel_order)) + tempData = sRGBunmap(pixel); + else + { +#ifdef __SSE4_1__ + tempData = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(pixel)); +#else + __m128i zero = _mm_setzero_si128(); + tempData = _mm_cvtepi32_ps(_mm_unpacklo_epi16( + _mm_unpacklo_epi8(pixel, zero), zero)); +#endif + tempData = _mm_mul_ps(tempData, _mm_set1_ps(recip_255)); + } + } + break; + } + + case CL_SIGNED_INT8: { + cl_char *dPtr = (cl_char *)ptr; + __m128i pixel; + switch (channelCount) + { + case 1: + tempData = + SET_ALPHA_1(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0])); + break; + case 2: + pixel = _mm_insert_epi16(_mm_loadu_si16(ptr), 0x0100, 1); + break; + case 3: + pixel = _mm_insert_epi16(_mm_loadu_si16(ptr), + 0x0100 | dPtr[2], 1); + break; + case 4: pixel = _mm_loadu_si32(ptr); break; + } + if (channelCount != 1) + { +#ifdef __SSE4_1__ + tempData = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(pixel)); +#else + __m128i signMask; + signMask = _mm_cmpgt_epi8(_mm_setzero_si128(), pixel); + pixel = _mm_unpacklo_epi8(pixel, signMask); + signMask = _mm_unpacklo_epi8(signMask, signMask); + tempData = _mm_cvtepi32_ps(_mm_unpacklo_epi16(pixel, signMask)); +#endif + } + break; + } + + case CL_UNSIGNED_INT8: { + cl_uchar *dPtr = (cl_uchar *)ptr; + __m128i pixel; + switch (channelCount) + { + case 1: + tempData = + SET_ALPHA_1(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0])); + break; + case 2: + pixel = _mm_insert_epi16(_mm_loadu_si16(ptr), 0x0100, 1); + break; + case 3: + pixel = _mm_insert_epi16(_mm_loadu_si16(ptr), + 0x0100 | dPtr[2], 1); + break; + case 4: pixel = _mm_loadu_si32(ptr); break; + } + if (channelCount != 1) + { +#ifdef __SSE4_1__ + tempData = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(pixel)); +#else + __m128i zero = _mm_setzero_si128(); + tempData = _mm_cvtepi32_ps( + _mm_unpacklo_epi16(_mm_unpacklo_epi8(pixel, zero), zero)); +#endif + } + break; + } + + case CL_SNORM_INT16: { + cl_short *dPtr = (cl_short *)ptr; + __m128i pixel; + switch (channelCount) + { + case 1: + tempData = SET_ALPHA_1(CLAMP_FLOAT( + _mm_mul_ss(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0]), + _mm_set_ss(recip_32767)))); + break; + case 2: + pixel = _mm_insert_epi16(_mm_loadu_si32(ptr), 0x7FFF, 3); + break; + case 3: + pixel = _mm_insert_epi16( + _mm_insert_epi16(_mm_loadu_si32(ptr), dPtr[2], 2), + 0x7FFF, 3); + break; + case 4: pixel = _mm_loadu_si64(ptr); break; + } + if (channelCount != 1) + { +#ifdef __SSE4_1__ + tempData = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(pixel)); +#else + tempData = _mm_cvtepi32_ps(_mm_unpacklo_epi16( + pixel, _mm_cmpgt_epi16(_mm_setzero_si128(), pixel))); +#endif + tempData = CLAMP_FLOAT_V( + _mm_mul_ps(tempData, _mm_set1_ps(recip_32767))); + } + break; + } + + case CL_UNORM_INT16: { + cl_ushort *dPtr = (cl_ushort *)ptr; + __m128i pixel; + switch (channelCount) + { + case 1: + tempData = SET_ALPHA_1( + _mm_mul_ss(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0]), + _mm_set_ss(recip_65535))); + break; + case 2: + pixel = _mm_insert_epi16(_mm_loadu_si32(ptr), 0xFFFF, 3); + break; + case 3: + pixel = _mm_insert_epi16( + _mm_insert_epi16(_mm_loadu_si32(ptr), dPtr[2], 2), + 0xFFFF, 3); + break; + case 4: pixel = _mm_loadu_si64(ptr); break; + } + if (channelCount != 1) + { +#ifdef __SSE4_1__ + tempData = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(pixel)); +#else + tempData = _mm_cvtepi32_ps( + _mm_unpacklo_epi16(pixel, _mm_setzero_si128())); +#endif + tempData = _mm_mul_ps(tempData, _mm_set1_ps(recip_65535)); + } + break; + } + + case CL_SIGNED_INT16: { + cl_short *dPtr = (cl_short *)ptr; + __m128i pixel; + switch (channelCount) + { + case 1: + tempData = + SET_ALPHA_1(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0])); + break; + case 2: + pixel = _mm_insert_epi16(_mm_loadu_si32(ptr), 1, 3); + break; + case 3: + pixel = _mm_insert_epi16( + _mm_insert_epi16(_mm_loadu_si32(ptr), dPtr[2], 2), 1, + 3); + break; + case 4: pixel = _mm_loadu_si64(ptr); break; + } + if (channelCount != 1) +#ifdef __SSE4_1__ + tempData = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(pixel)); +#else + tempData = _mm_cvtepi32_ps(_mm_unpacklo_epi16( + pixel, _mm_cmpgt_epi16(_mm_setzero_si128(), pixel))); +#endif + break; + } + + case CL_UNSIGNED_INT16: { + cl_ushort *dPtr = (cl_ushort *)ptr; + __m128i pixel; + switch (channelCount) + { + case 1: + tempData = + SET_ALPHA_1(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0])); + break; + case 2: + pixel = _mm_insert_epi16(_mm_loadu_si32(ptr), 1, 3); + break; + case 3: + pixel = _mm_insert_epi16( + _mm_insert_epi16(_mm_loadu_si32(ptr), dPtr[2], 2), 1, + 3); + break; + case 4: pixel = _mm_loadu_si64(ptr); break; + } + if (channelCount != 1) +#ifdef __SSE4_1__ + tempData = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(pixel)); +#else + tempData = _mm_cvtepi32_ps( + _mm_unpacklo_epi16(pixel, _mm_setzero_si128())); +#endif + break; + } + + case CL_HALF_FLOAT: { + cl_half *dPtr = (cl_half *)ptr; + __m128i h; + switch (channelCount) + { + case 1: +#ifdef __F16C__ + tempData = SET_ALPHA_1(_mm_set_ss(_cvtsh_ss(dPtr[0]))); +#else + tempData = + SET_ALPHA_1(_mm_set_ss(cl_half_to_float(dPtr[0]))); +#endif + break; + case 2: + h = _mm_insert_epi16(_mm_loadu_si32(ptr), 0x3C00, 3); + break; + case 3: + h = _mm_insert_epi16( + _mm_insert_epi16(_mm_loadu_si32(ptr), dPtr[2], 2), + 0x3C00, 3); + break; + case 4: h = _mm_loadu_si64(ptr); break; + } + + if (channelCount == 1) break; + + tempData = cl_half_to_float(h); + break; + } + + case CL_SIGNED_INT32: { + cl_int *dPtr = (cl_int *)ptr; + __m128i pixel; + switch (channelCount) + { + case 1: + tempData = + SET_ALPHA_1(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0])); + break; + case 2: +#ifdef __SSE4_1__ + pixel = _mm_insert_epi32(_mm_loadu_si64(ptr), 1, 3); +#else + pixel = _mm_insert_epi16(_mm_loadu_si64(ptr), 1, 6); +#endif + break; + case 3: +#ifdef __SSE4_1__ + pixel = _mm_insert_epi32( + _mm_insert_epi32(_mm_loadu_si64(ptr), dPtr[2], 2), 1, + 3); +#else + pixel = _mm_or_si128(_mm_loadu_si64(ptr), + _mm_set_epi32(1, dPtr[2], 0, 0)); +#endif + break; + case 4: pixel = _mm_loadu_si128((__m128i_u *)ptr); break; + } + if (channelCount != 1) tempData = _mm_cvtepi32_ps(pixel); + break; + } + + case CL_UNSIGNED_INT32: { + cl_uint *dPtr = (cl_uint *)ptr; + __m128i pixel; + switch (channelCount) + { + case 1: +#ifdef __x86_64 + tempData = + SET_ALPHA_1(_mm_cvtsi64_ss(_mm_setzero_ps(), dPtr[0])); +#else + tempData = SET_ALPHA_1(_mm_set_ss((float)dPtr[0])); +#endif + break; + case 2: +#ifdef __SSE4_1__ + pixel = _mm_insert_epi32(_mm_loadu_si64(ptr), 1, 3); +#else + pixel = _mm_insert_epi16(_mm_loadu_si64(ptr), 1, 6); +#endif + break; + case 3: +#ifdef __SSE4_1__ + pixel = _mm_insert_epi32( + _mm_insert_epi32(_mm_loadu_si64(ptr), dPtr[2], 2), 1, + 3); +#else + pixel = _mm_or_si128(_mm_loadu_si64(ptr), + _mm_set_epi32(1, dPtr[2], 0, 0)); +#endif + break; + case 4: pixel = _mm_loadu_si128((__m128i_u *)ptr); break; + } + if (channelCount != 1) + { + // Unfortunately, no instruction for converting unsigned 32-bit + // integers to float exists until AVX-512; nor is there an + // instruction for converting packed 64-bit integers to float + // until same +#ifdef __AVX512VL__ + tempData = _mm_cvtepu32_ps(pixel); +#elif defined(__SSE4_1__) + // The following is based on the unoptimized output of GCC for + // scalars + __m128i negOne = _mm_setmone_si128(); + if (!_mm_testz_si128( + pixel, _mm_slli_epi32(negOne, 31) /* 0x80000000 */)) + { + __m128i one = _mm_srli_epi32(negOne, 31); // = 1; + __m128i reducedPixel = _mm_or_si128( + _mm_srli_epi32(pixel, 1), _mm_and_si128(pixel, one)); + tempData = _mm_cvtepi32_ps(reducedPixel); + tempData = _mm_add_ps(tempData, tempData); + tempData = SELECT_F(_mm_castsi128_ps(pixel), tempData, + _mm_cvtepi32_ps(pixel)); + } + else + tempData = _mm_cvtepi32_ps(pixel); +#else + // Testing contents of vectors is unwieldy without SSE4.1 + __m128i negOne = _mm_setmone_si128(); + __m128i one = _mm_srli_epi32(negOne, 31); // = 1; + __m128i reducedPixel = _mm_or_si128(_mm_srli_epi32(pixel, 1), + _mm_and_si128(pixel, one)); + tempData = _mm_cvtepi32_ps(reducedPixel); + tempData = _mm_add_ps(tempData, tempData); + __m128 tempData2 = _mm_cvtepi32_ps(pixel); + __m128 mask = _mm_cmpgt_ps(_mm_setzero_ps(), tempData2); + tempData = SELECT_F(mask, tempData, tempData2); +#endif + } + break; + } + + case CL_UNORM_SHORT_565: { + cl_ushort *dPtr = (cl_ushort *)ptr; +#ifdef __AVX2__ + __m128i pixel = _mm_broadcastd_epi32(_mm_loadu_si16(dPtr)); + pixel = _mm_insert_epi32( + _mm_srlv_epi32(pixel, _mm_set_epi32(0, 0, 5, 11)), 1, 3); +#else + // Shifts may as well be scalar, since before AVX2 there are no + // vector shift amounts + __m128i pixel = + _mm_set_epi32(1, dPtr[0], dPtr[0] >> 5, dPtr[0] >> 11); +#endif + pixel = _mm_and_si128(pixel, _mm_set_epi32(1, 0x1f, 0x3f, 0x1f)); + tempData = + _mm_mul_ps(_mm_cvtepi32_ps(pixel), + _mm_set_ps(1.0f, recip_31, recip_63, recip_31)); + break; + } + + case CL_UNORM_SHORT_555: { + cl_ushort *dPtr = (cl_ushort *)ptr; +#ifdef __AVX2__ + __m128i pixel = _mm_broadcastd_epi32(_mm_loadu_si16(dPtr)); + pixel = _mm_insert_epi32( + _mm_srlv_epi32(pixel, _mm_set_epi32(0, 0, 5, 10)), 0x1F, 3); +#else + __m128i pixel = + _mm_set_epi32(0x1F, dPtr[0], dPtr[0] >> 5, dPtr[0] >> 10); +#endif + pixel = _mm_and_si128(pixel, _mm_set1_epi16(0x1F)); + tempData = + _mm_mul_ps(_mm_cvtepi32_ps(pixel), _mm_set1_ps(recip_31)); + break; + } + + case CL_UNORM_INT_101010: { + cl_uint *dPtr = (cl_uint *)ptr; +#ifdef __AVX2__ + __m128i pixel = _mm_broadcastd_epi32(_mm_loadu_si32(dPtr)); + pixel = _mm_insert_epi32( + _mm_srlv_epi32(pixel, _mm_set_epi32(0, 0, 10, 20)), 0x3ff, 3); +#else + __m128i pixel = + _mm_set_epi32(0x3ff, dPtr[0], dPtr[0] >> 10, dPtr[0] >> 20); +#endif + pixel = _mm_and_si128(pixel, _mm_set1_epi16(0x3ff)); + tempData = + _mm_mul_ps(_mm_cvtepi32_ps(pixel), _mm_set1_ps(recip_1023)); + break; + } + + case CL_FLOAT: { + float *dPtr = (float *)ptr; + switch (channelCount) + { + case 1: tempData = SET_ALPHA_1(_mm_load_ss(dPtr)); break; + case 2: + tempData = _mm_loadl_pi(_mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f), + (__m64 *)ptr); + break; + case 3: + tempData = _mm_loadl_pi( + _mm_set_ps(1.0f, dPtr[2], 0.0f, 0.0f), (__m64 *)ptr); + break; + case 4: tempData = _mm_loadu_ps(dPtr); break; + } + break; + } +#ifdef CL_SFIXED14_APPLE + case CL_SFIXED14_APPLE: { + cl_ushort *dPtr = (cl_ushort *)ptr; + __m128i pixel; + switch (channelCount) + { + case 1: + tempData = SET_ALPHA_1( + _mm_mul_ss(_mm_cvtsi32_ss((int)dPtr[0] - 16384), + _mm_set_ss(0x1.0p-14f))); + break; + case 2: + pixel = _mm_insert_epi16(_mm_loadu_si32(ptr), + (1 << 14) + 16384, 3); + break; + case 3: + pixel = _mm_insert_epi16( + _mm_insert_epi16(_mm_loadu_si32(ptr), dPtr[2], 2), + (1 << 14) + 16384, 3); + break; + case 4: pixel = _mm_loadu_si64(ptr); break; + } + if (channelCount != 1) + { +#ifdef __SSE4_1__ + pixel = _mm_cvtepu16_epi32(pixel); +#else + pixel = _mm_unpacklo_epi16(pixel, _mm_setzero_si128()); +#endif + tempData = _mm_mul_ps(_mm_cvtepi32_ps(_mm_add_epi32(pixel, _mm_slli_epi32(_mm_setmone_si128(), 14) /* -16384 */, _mm_set1_ps(0x1.0p-14f)); + } + break; + } +#endif + } + + switch (format->image_channel_order) + { + case CL_R: + case CL_Rx: + case CL_RG: + case CL_RGx: + case CL_RGB: + case CL_RGBx: + case CL_sRGB: + case CL_sRGBx: + case CL_RGBA: + case CL_sRGBA: + case CL_DEPTH: + /* Already correct */ + return tempData; + case CL_A: + return _mm_shuffle_ps(tempData, tempData, _MM_SHUFFLE(0, 1, 1, 1)); + case CL_RA: + return _mm_shuffle_ps(tempData, tempData, _MM_SHUFFLE(1, 2, 2, 0)); + case CL_ARGB: +#ifdef CL_1RGB_APPLE + case CL_1RGB_APPLE: +#endif + return _mm_shuffle_ps(tempData, tempData, _MM_SHUFFLE(0, 3, 2, 1)); + case CL_ABGR: + return _mm_shuffle_ps(tempData, tempData, _MM_SHUFFLE(0, 1, 2, 3)); + case CL_BGRA: + case CL_sBGRA: +#ifdef CL_BGR1_APPLE + case CL_BGR1_APPLE: +#endif + return _mm_shuffle_ps(tempData, tempData, _MM_SHUFFLE(3, 0, 1, 2)); + case CL_INTENSITY: + return _mm_shuffle_ps(tempData, tempData, _MM_SHUFFLE(0, 0, 0, 0)); + case CL_LUMINANCE: + return _mm_shuffle_ps(tempData, tempData, _MM_SHUFFLE(3, 0, 0, 0)); + default: + log_error("Invalid format:"); + print_header(format, true); + break; + } + return tempData; +} + +void read_image_pixel_float(void *imageData, image_descriptor *imageInfo, int x, + int y, int z, float *outData, int lod) +{ + _mm_storeu_ps(outData, + read_image_pixel_float(imageData, imageInfo, + _mm_set_epi32(0, z, y, x), lod)); +} + +#else + +#define CLAMP_FLOAT(v) (fmaxf(fminf(v, 1.f), -1.f)) void read_image_pixel_float(void *imageData, image_descriptor *imageInfo, int x, int y, int z, float *outData, int lod) @@ -1391,7 +2545,7 @@ void read_image_pixel_float(void *imageData, image_descriptor *imageInfo, int x, { if ((is_sRGBA_order(imageInfo->format->image_channel_order)) && i < 3) // only RGB need to be converted for sRGBA - tempData[i] = (float)sRGBunmap((float)dPtr[i] / 255.0f); + tempData[i] = sRGBunmap(dPtr[i]); else tempData[i] = (float)dPtr[i] / 255.0f; } @@ -1585,6 +2739,7 @@ void read_image_pixel_float(void *imageData, image_descriptor *imageInfo, int x, break; } } +#endif void read_image_pixel_float(void *imageData, image_descriptor *imageInfo, int x, int y, int z, float *outData) @@ -1611,6 +2766,127 @@ bool get_integer_coords_offset(float x, float y, float z, float xAddressOffset, { AddressFn adFn = sAddressingTable[imageSampler]; +#ifdef __SSE2__ + __m128 coord = _mm_set_ps(0.f, z, y, x); + __m128 addressOffset = + _mm_set_ps(0.f, zAddressOffset, yAddressOffset, xAddressOffset); + __m128i extent = _mm_set_epi32(0, depth, height, width); + __m128 extentf = _mm_cvtepi32_ps(extent); + __m128i ref = vifloorf(coord); + + __m128 arrayMask; + switch (imageInfo->type) + { + case CL_MEM_OBJECT_IMAGE1D: + case CL_MEM_OBJECT_IMAGE1D_BUFFER: + case CL_MEM_OBJECT_IMAGE1D_ARRAY: + arrayMask = _mm_castsi128_ps( + _mm_bsrli_si128(_mm_setmone_si128(), 12)); // = 0, 0, 0, -1 + break; + case CL_MEM_OBJECT_IMAGE2D: + case CL_MEM_OBJECT_IMAGE2D_ARRAY: + arrayMask = _mm_castsi128_ps( + _mm_bsrli_si128(_mm_setmone_si128(), 8)); // = 0, 0, -1, -1 + break; + default: + arrayMask = _mm_castsi128_ps( + _mm_bsrli_si128(_mm_setmone_si128(), 4)); // = 0, -1, -1, -1 + } + __m128 locMask = + _mm_andnot_ps(_mm_cmpeq_ps(extentf, _mm_setzero_ps()), arrayMask); + __m128 offsetMask = + _mm_andnot_ps(_mm_cmpeq_ps(addressOffset, _mm_setzero_ps()), arrayMask); + + // Handle sampler-directed coordinate normalization + clamping. Note that + // the array coordinate for image array types is expected to be + // unnormalized, and is clamped to 0..arraySize-1. + if (imageSampler->normalized_coords) + { + __m128 minMask, maxMask, temp; + switch (imageSampler->addressing_mode) + { + case CL_ADDRESS_REPEAT: + coord = SELECT_F( + locMask, RepeatNormalizedAddressFn(coord, extent), coord); + + // Add in the offset + coord = _mm_add_ps(coord, addressOffset); + // Handle wrapping + minMask = _mm_andnot_ps(_mm_cmplt_ps(coord, _mm_setzero_ps()), + offsetMask); + maxMask = + _mm_andnot_ps(_mm_cmpgt_ps(coord, extentf), offsetMask); + coord = SELECT_F( + minMask, _mm_add_ps(coord, extentf), + SELECT_F(maxMask, _mm_sub_ps(coord, extentf), coord)); + break; + + case CL_ADDRESS_MIRRORED_REPEAT: + coord = SELECT_F( + locMask, MirroredRepeatNormalizedAddressFn(coord, extent), + coord); + temp = _mm_add_ps(coord, addressOffset); + maxMask = _mm_cmpgt_ps(temp, extentf); + coord = SELECT_F( + offsetMask, + vfabsf(SELECT_F( + maxMask, _mm_sub_ps(extentf, _mm_sub_ps(temp, extentf)), + temp)), + coord); + break; + + default: + // Also, remultiply to the original coords. This simulates any + // truncation in the pass to OpenCL +#ifdef __FMA4__ + coord = + SELECT_F(arrayMask, + _mm_macc_ps(coord, extentf, addressOffset), coord); +#elif defined(__FMA__) + coord = SELECT_F(arrayMask, + _mm_fmadd_ps(coord, extentf, addressOffset), + coord); +#else + coord = SELECT_F( + arrayMask, + _mm_add_ps(_mm_mul_ps(coord, extentf), addressOffset), + coord); +#endif + break; + } + } + + // At this point, we're dealing with non-normalized coordinates. + + __m128i out = + SELECT_I(_mm_castps_si128(locMask), adFn(vifloorf(coord), extent), + _mm_cvtps_epi32(coord)); + outX = _mm_cvtsi128_si32(out); + + // 1D and 2D arrays require special care for the index coordinate: + + switch (imageInfo->type) + { + case CL_MEM_OBJECT_IMAGE1D_ARRAY: + outY = static_cast( + calculate_array_index(y, (float)imageInfo->arraySize - 1.0f)); + outZ = 0; /* don't care! */ + break; + case CL_MEM_OBJECT_IMAGE2D_ARRAY: + outY = EXTRACT_I(out, 1); + outZ = static_cast( + calculate_array_index(z, (float)imageInfo->arraySize - 1.0f)); + break; + default: + // legacy path: + outY = EXTRACT_I(out, 1); + outZ = EXTRACT_I(out, 2); + } + + out = _mm_set_epi32(0, outZ, outY, outX); + __m128i refEqual = _mm_cmpeq_epi32(ref, out); + return TEST_ANY_ZERO(refEqual); +#else float refX = floorf(x), refY = floorf(y), refZ = floorf(z); // Handle sampler-directed coordinate normalization + clamping. Note that @@ -1745,10 +3021,39 @@ bool get_integer_coords_offset(float x, float y, float z, float xAddressOffset, } return !((int)refX == outX && (int)refY == outY && (int)refZ == outZ); +#endif } -static float frac(float a) { return a - floorf(a); } +#ifdef __SSE2__ +static inline __m128 pixelMax(__m128 a, __m128 b) +{ + // n.b. Operands must be reversed, because if either one is NaN, the second + // operand is returned + return _mm_max_ps(vfabsf(b), vfabsf(a)); +} +static inline int IsFloatSubnormal(__m128 x) +{ + // No fpclass until AVX-512 (what took them so long?!!) + union { + __m128 f; + __m128i u; + } u; + u.f = vfabsf(x); + __m128i negOne = _mm_setmone_si128(); + return TEST_NONZERO( + _mm_cmplt_epi32(_mm_add_epi32(u.u, negOne), + _mm_srli_epi32(negOne, 9) /* 0x007fffff */)); +} + +// If containsDenorms is NULL, flush denorms to zero +// if containsDenorms is not NULL, record whether there are any denorms +static inline __m128 check_for_denorms(__m128 a, int *containsDenorms) +{ + if (NULL != containsDenorms && IsFloatSubnormal(a)) *containsDenorms = 1; + return a; +} +#else static inline void pixelMax(const float a[4], const float b[4], float *results); static inline void pixelMax(const float a[4], const float b[4], float *results) { @@ -1779,6 +3084,7 @@ static inline void check_for_denorms(float a[4], int *containsDenorms) } } } +#endif inline float calculate_array_index(float coord, float extent) { @@ -1794,6 +3100,11 @@ inline float calculate_array_index(float coord, float extent) return ret; } +#ifdef __SSE2__ +#define EXTRACT_F(v, i) \ + _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 2, 1, i))) +#endif + /* * Utility function to unnormalized a coordinate given a particular sampler. * @@ -1802,6 +3113,124 @@ inline float calculate_array_index(float coord, float extent) * offset - an addressing offset to be added to the coordinate * extent - the max value for this coordinate (e.g. width for x) */ +#ifdef __SSE2__ + +#ifdef __AVX__ +#define TEST_NONZERO_F(v) !_mm_testz_ps(v, v) +#else +#define TEST_NONZERO_F(v) TEST_NONZERO(_mm_castps_si128(v)) +#endif + +static __m128 unnormalize_coordinate(const char *name, __m128 coord, + __m128 offset, __m128 extent, + cl_addressing_mode addressing_mode, + int verbose) +{ + __m128 zero = _mm_setzero_ps(); + __m128 ret = zero; + __m128 offsetMask = _mm_cmpneq_ps(offset, zero); + __m128 minMask, maxMask, temp; + + switch (addressing_mode) + { + case CL_ADDRESS_REPEAT: + ret = RepeatNormalizedAddressFn(coord, _mm_cvtps_epi32(extent)); + + if (verbose) + { + log_info("\tRepeat filter denormalizes %s (%f, %f, %f) to %f, " + "%f, %f\n", + name, EXTRACT_F(coord, 0), EXTRACT_F(coord, 1), + EXTRACT_F(coord, 2), EXTRACT_F(ret, 0), + EXTRACT_F(ret, 1), EXTRACT_F(ret, 2)); + } + + // Add in the offset, and handle wrapping. + ret = _mm_add_ps(ret, offset); + maxMask = _mm_and_ps(offsetMask, _mm_cmpgt_ps(ret, extent)); + minMask = _mm_and_ps(offsetMask, _mm_cmplt_ps(ret, zero)); + ret = SELECT_F(minMask, _mm_add_ps(ret, extent), + SELECT_F(maxMask, _mm_sub_ps(ret, extent), ret)); + + if (verbose && TEST_NONZERO_F(offsetMask)) + { + log_info( + "\tAddress offset of %f, %f, %f added to get %f, %f, %f\n", + EXTRACT_F(offset, 0), EXTRACT_F(offset, 1), + EXTRACT_F(offset, 2), EXTRACT_F(ret, 0), EXTRACT_F(ret, 1), + EXTRACT_F(ret, 2)); + } + break; + + case CL_ADDRESS_MIRRORED_REPEAT: + ret = MirroredRepeatNormalizedAddressFn(coord, + _mm_cvtps_epi32(extent)); + + if (verbose) + { + log_info("\tMirrored repeat filter denormalizes %s (%f, %f, " + "%f) to %f, %f, %f\n", + name, EXTRACT_F(coord, 0), EXTRACT_F(coord, 1), + EXTRACT_F(coord, 2), EXTRACT_F(ret, 0), + EXTRACT_F(ret, 1), EXTRACT_F(ret, 2)); + } + + temp = _mm_add_ps(ret, offset); + maxMask = _mm_cmpgt_ps(temp, extent); + ret = SELECT_F( + offsetMask, + vfabsf(SELECT_F(maxMask, + _mm_sub_ps(extent, _mm_sub_ps(temp, extent)), + temp)), + ret); + + if (verbose && TEST_NONZERO_F(offsetMask)) + { + log_info( + "\tAddress offset of %f, %f, %f added to get %f, %f, %f\n", + EXTRACT_F(offset, 0), EXTRACT_F(offset, 1), + EXTRACT_F(offset, 2), EXTRACT_F(ret, 0), EXTRACT_F(ret, 1), + EXTRACT_F(ret, 2)); + } + break; + + default: + + if (verbose) + { + ret = _mm_mul_ps(coord, extent); + log_info("\tFilter denormalizes %s to %f, %f, %f (<%f, %f, %f> " + "* <%f, %f, %f>)\n", + name, EXTRACT_F(ret, 0), EXTRACT_F(ret, 1), + EXTRACT_F(ret, 2), EXTRACT_F(coord, 0), + EXTRACT_F(coord, 1), EXTRACT_F(coord, 2), + EXTRACT_F(extent, 0), EXTRACT_F(extent, 1), + EXTRACT_F(extent, 2)); + if (TEST_NONZERO_F(offsetMask)) + { + ret = _mm_add_ps(ret, offset); + log_info("\tAddress offset of %f, %f, %f added to get %f, " + "%f, %f\n", + EXTRACT_F(offset, 0), EXTRACT_F(offset, 1), + EXTRACT_F(offset, 2), EXTRACT_F(ret, 0), + EXTRACT_F(ret, 1), EXTRACT_F(ret, 2)); + } + } + else + { +#ifdef __FMA4__ + ret = _mm_macc_ps(coord, extent, offset); +#elif defined(__FMA__) + ret = _mm_fmadd_ps(coord, extent, offset); +#else + ret = _mm_add_ps(_mm_mul_ps(coord, extent), offset); +#endif + } + } + + return ret; +} +#else static float unnormalize_coordinate(const char *name, float coord, float offset, float extent, cl_addressing_mode addressing_mode, @@ -1881,6 +3310,7 @@ static float unnormalize_coordinate(const char *name, float coord, float offset, return ret; } +#endif FloatPixel sample_image_pixel_float(void *imageData, image_descriptor *imageInfo, float x, @@ -1904,6 +3334,933 @@ FloatPixel sample_image_pixel_float(void *imageData, 0.0f, 0.0f, imageSampler, outData, verbose, containsDenorms, lod); } +#ifdef __SSE2__ +FloatPixel sample_image_pixel_float_offset( + void *imageData, image_descriptor *imageInfo, float x, float y, float z, + float xAddressOffset, float yAddressOffset, float zAddressOffset, + image_sampler_data *imageSampler, float *outData, int verbose, + int *containsDenorms, int lod) +{ + AddressFn adFn = sAddressingTable[imageSampler]; + FloatPixel returnVal; + size_t width_lod = imageInfo->width, height_lod = imageInfo->height, + depth_lod = imageInfo->depth; + size_t slice_pitch_lod = 0, row_pitch_lod = 0; + + if (imageInfo->num_mip_levels > 1) + { + switch (imageInfo->type) + { + case CL_MEM_OBJECT_IMAGE3D: + depth_lod = + (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1; + case CL_MEM_OBJECT_IMAGE2D: + case CL_MEM_OBJECT_IMAGE2D_ARRAY: + height_lod = + (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1; + default: + width_lod = + (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1; + } + row_pitch_lod = width_lod * get_pixel_size(imageInfo->format); + if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) + slice_pitch_lod = row_pitch_lod; + else if (imageInfo->type == CL_MEM_OBJECT_IMAGE3D + || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY) + slice_pitch_lod = row_pitch_lod * height_lod; + } + else + { + slice_pitch_lod = imageInfo->slicePitch; + row_pitch_lod = imageInfo->rowPitch; + } + + if (containsDenorms) *containsDenorms = 0; + + __m128 coord, addressOffset, extentf; + __m128i extent, addressMask; + switch (imageInfo->type) + { + + // The image array types require special care: + + case CL_MEM_OBJECT_IMAGE1D: + case CL_MEM_OBJECT_IMAGE1D_BUFFER: + case CL_MEM_OBJECT_IMAGE1D_ARRAY: + coord = _mm_set_ss(x); + addressOffset = _mm_set_ss(xAddressOffset); + extent = _mm_set_epi32(0, 1, 1, width_lod); + addressMask = + _mm_bsrli_si128(_mm_setmone_si128(), 12); // = 0, 0, 0, -1 + break; + + case CL_MEM_OBJECT_IMAGE2D: + case CL_MEM_OBJECT_IMAGE2D_ARRAY: + coord = _mm_set_ps(0.f, 0.f, y, x); + addressOffset = + _mm_set_ps(0.f, 0.f, yAddressOffset, xAddressOffset); + extent = _mm_set_epi32(0, 1, height_lod, width_lod); + addressMask = + _mm_bsrli_si128(_mm_setmone_si128(), 8); // = 0, 0, -1, -1 + break; + + // Everybody else: + + default: + coord = _mm_set_ps(0.f, z, y, x); + addressOffset = + _mm_set_ps(0.f, zAddressOffset, yAddressOffset, xAddressOffset); + extent = _mm_set_epi32(0, depth_lod, height_lod, width_lod); + addressMask = + _mm_bsrli_si128(_mm_setmone_si128(), 4); // = 0, -1, -1, -1 + } + extentf = _mm_cvtepi32_ps(extent); + + if (imageSampler->normalized_coords) + { + // We need to unnormalize our coordinates differently depending on + // the image type, but 'x' is always processed the same way. + + const char *name = NULL; + switch (imageInfo->type) + { + + case CL_MEM_OBJECT_IMAGE1D: + case CL_MEM_OBJECT_IMAGE1D_BUFFER: + case CL_MEM_OBJECT_IMAGE1D_ARRAY: name = "x"; break; + + case CL_MEM_OBJECT_IMAGE2D: + case CL_MEM_OBJECT_IMAGE2D_ARRAY: name = "x, y"; break; + + default: name = "x, y, z"; + } + + coord = unnormalize_coordinate(name, coord, addressOffset, extentf, + imageSampler->addressing_mode, verbose); + } + else if (verbose) + { + + switch (imageInfo->type) + { + case CL_MEM_OBJECT_IMAGE1D_ARRAY: + log_info("Starting coordinate: %f, array index %f\n", x, y); + break; + case CL_MEM_OBJECT_IMAGE2D_ARRAY: + log_info("Starting coordinate: %f, %f, array index %f\n", x, y, + z); + break; + case CL_MEM_OBJECT_IMAGE1D: + case CL_MEM_OBJECT_IMAGE1D_BUFFER: + log_info("Starting coordinate: %f\b", x); + break; + case CL_MEM_OBJECT_IMAGE2D: + log_info("Starting coordinate: %f, %f\n", x, y); + break; + case CL_MEM_OBJECT_IMAGE3D: + default: log_info("Starting coordinate: %f, %f, %f\n", x, y, z); + } + } + + // At this point, we have unnormalized coordinates. + + if (imageSampler->filter_mode == CL_FILTER_NEAREST) + { + __m128i icoord; + int arrayIndex; + + // We apply the addressing function to the now-unnormalized + // coordinates. Note that the array cases again require special + // care, per section 8.4 in the OpenCL 1.2 Specification. + + icoord = _mm_and_si128(addressMask, adFn(vifloorf(coord), extent)); + + switch (imageInfo->type) + { + case CL_MEM_OBJECT_IMAGE1D_ARRAY: + arrayIndex = static_cast(calculate_array_index( + y, (float)(imageInfo->arraySize - 1))); + if (verbose) + log_info("\tArray index %f evaluates to %d\n", y, + arrayIndex); +#ifdef __SSE4_1__ + icoord = _mm_insert_epi32(icoord, arrayIndex, 1); +#else + icoord = _mm_insert_epi16( + _mm_insert_epi16(icoord, (short)arrayIndex, 2), + (short)(arrayIndex >> 16), 3); +#endif + break; + case CL_MEM_OBJECT_IMAGE2D_ARRAY: + arrayIndex = static_cast(calculate_array_index( + z, (float)(imageInfo->arraySize - 1))); + if (verbose) + log_info("\tArray index %f evaluates to %d\n", z, + arrayIndex); +#ifdef __SSE4_1__ + icoord = _mm_insert_epi32(icoord, arrayIndex, 2); +#else + icoord = _mm_insert_epi16( + _mm_insert_epi16(icoord, (short)arrayIndex, 4), + (short)(arrayIndex >> 16), 5); +#endif + break; + default: break; + } + + if (verbose) + { + if (depth_lod) + log_info( + "\tReference integer coords calculated: { %d, %d, %d }\n", + EXTRACT_I(icoord, 0), EXTRACT_I(icoord, 1), + EXTRACT_I(icoord, 2)); + else + log_info("\tReference integer coords calculated: { %d, %d }\n", + EXTRACT_I(icoord, 0), EXTRACT_I(icoord, 1)); + } + + // SSE has an FTZ mode that will be useful here + unsigned int mxcsr = 0; + if (NULL == containsDenorms) + { + mxcsr = _mm_getcsr(); + _mm_setcsr(mxcsr & ~_MM_FLUSH_ZERO_MASK | _MM_FLUSH_ZERO_ON); + } + __m128 outPixel = check_for_denorms( + read_image_pixel_float(imageData, imageInfo, icoord, lod), + containsDenorms); + if (NULL == containsDenorms) _mm_setcsr(mxcsr); + _mm_storeu_ps(outData, outPixel); + _mm_storeu_ps(returnVal.p, vfabsf(outPixel)); + return returnVal; + } + else + { + // Linear filtering cases. + + // Image arrays can use 2D filtering, but require us to walk into the + // image a certain number of slices before reading. + + if (depth_lod == 0 || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY + || imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) + { + float array_index = 0; + + size_t layer_offset = 0; + + if (imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY) + { + array_index = + calculate_array_index(z, (float)(imageInfo->arraySize - 1)); + layer_offset = slice_pitch_lod * (size_t)array_index; + } + else if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) + { + array_index = + calculate_array_index(y, (float)(imageInfo->arraySize - 1)); + layer_offset = slice_pitch_lod * (size_t)array_index; + } + + __m128i icoord = + vifloorf(_mm_sub_ps(coord, _mm_set_ps(0.f, 0.f, 0.5f, 0.5f))); + __m128i coord00 = _mm_and_si128(addressMask, adFn(icoord, extent)); + __m128i coord11 = _mm_and_si128( + addressMask, + adFn(_mm_sub_epi32(icoord, _mm_setmone_si128()), extent)); + + if (verbose) + { + log_info("\tActual integer coords used (i = floor(x-.5)): i0:{ " + "%d, %d } and i1:{ %d, %d }\n", + EXTRACT_I(coord00, 0), EXTRACT_I(coord00, 1), + EXTRACT_I(coord11, 0), EXTRACT_I(coord11, 1)); + log_info("\tArray coordinate is %f\n", array_index); + } + + // Walk to beginning of the 'correct' slice, if needed. + char *imgPtr = ((char *)imageData) + layer_offset; + + // flush subnormal results to zero if necessary + // SSE has an FTZ mode that will be useful here + unsigned int mxcsr; + if (NULL == containsDenorms) + { + mxcsr = _mm_getcsr(); + _mm_setcsr(mxcsr & ~_MM_FLUSH_ZERO_MASK | _MM_FLUSH_ZERO_ON); + } + + // Make coordinate vectors for pixels 01 and 10 +#ifdef __AVX2__ + __m128i coord01 = _mm_blend_epi32(coord00, coord11, 0x01); + __m128i coord10 = _mm_blend_epi32(coord00, coord11, 0x02); +#elif defined(__SSE4_1__) + __m128i coord01 = _mm_blend_epi16(coord00, coord11, 0x03); + __m128i coord10 = _mm_blend_epi16(coord00, coord11, 0x0C); +#else + __m128i coordMask = + _mm_bsrli_si128(_mm_setmone_si128(), 8); // = 0, 0, -1, -1 + __m128i interleavedCoord = + _mm_unpacklo_epi32(coord00, coord11); // y2 y1 x2 x1 + __m128i coord01 = _mm_and_si128( + coordMask, + _mm_shuffle_epi32(interleavedCoord, _MM_SHUFFLE(3, 0, 2, 1))); + __m128i coord10 = _mm_and_si128( + coordMask, + _mm_shuffle_epi32(interleavedCoord, _MM_SHUFFLE(2, 1, 3, 0))); +#endif + + __m128 upLeft, upRight, lowLeft, lowRight; + __m128 maxUp, maxLow; + upLeft = check_for_denorms( + read_image_pixel_float(imgPtr, imageInfo, coord00, lod), + containsDenorms); + upRight = check_for_denorms( + read_image_pixel_float(imgPtr, imageInfo, coord01, lod), + containsDenorms); + maxUp = pixelMax(upLeft, upRight); + lowLeft = check_for_denorms( + read_image_pixel_float(imgPtr, imageInfo, coord10, lod), + containsDenorms); + lowRight = check_for_denorms( + read_image_pixel_float(imgPtr, imageInfo, coord11, lod), + containsDenorms); + maxLow = pixelMax(lowLeft, lowRight); + _mm_storeu_ps(returnVal.p, pixelMax(maxUp, maxLow)); + + if (verbose) + { + if (NULL == containsDenorms) + log_info("\tSampled pixels (rgba order, denorms flushed to " + "zero):\n"); + else + log_info("\tSampled pixels (rgba order):\n"); + log_info("\t\tp00: %f, %f, %f, %f\n", EXTRACT_F(upLeft, 0), + EXTRACT_F(upLeft, 1), EXTRACT_F(upLeft, 2), + EXTRACT_F(upLeft, 3)); + log_info("\t\tp01: %f, %f, %f, %f\n", EXTRACT_F(upRight, 0), + EXTRACT_F(upRight, 1), EXTRACT_F(upRight, 2), + EXTRACT_F(upRight, 3)); + log_info("\t\tp10: %f, %f, %f, %f\n", EXTRACT_F(lowLeft, 0), + EXTRACT_F(lowLeft, 1), EXTRACT_F(lowLeft, 2), + EXTRACT_F(lowLeft, 3)); + log_info("\t\tp11: %f, %f, %f, %f\n", EXTRACT_F(lowRight, 0), + EXTRACT_F(lowRight, 1), EXTRACT_F(lowRight, 2), + EXTRACT_F(lowRight, 3)); + } + + __m128 fracCoord = frac(_mm_sub_ps(coord, _mm_set1_ps(0.5f))); + if (verbose) + log_info("\tfrac( x - 0.5f ) = %f, frac( y - 0.5f ) = %f\n", + EXTRACT_F(fracCoord, 0), EXTRACT_F(fracCoord, 1)); + +#ifdef __AVX__ + __m256d alphaBeta = _mm256_cvtps_pd(fracCoord); // x x b a + alphaBeta = _mm256_insertf128_pd( + alphaBeta, + _mm_sub_pd(_mm_set1_pd(1.0), _mm256_castpd256_pd128(alphaBeta)), + 1); // 1-b 1-a b a + // 1-b 1-a b a + // 1-a b a 1-b (2 1 0 3) + //(1-a)(1-b) (1-a)b a*b a(1-b) + // 00 10 11 01 +#ifdef __AVX2__ + __m256d weights = _mm256_mul_pd( + alphaBeta, + _mm256_permute4x64_pd(alphaBeta, _MM_SHUFFLE(2, 1, 0, 3))); + __m256d weight01 = + _mm256_broadcastsd_pd(_mm256_castpd256_pd128(weights)); + __m256d weight11 = + _mm256_permute4x64_pd(weights, _MM_SHUFFLE(1, 1, 1, 1)); + __m256d weight10 = + _mm256_permute4x64_pd(weights, _MM_SHUFFLE(2, 2, 2, 2)); + __m256d weight00 = + _mm256_permute4x64_pd(weights, _MM_SHUFFLE(3, 3, 3, 3)); +#else + // This is now more complicated... + // Swap the two halves of the vector (1 0 3 2): + // 0x01 = 0b00000001 + // ~~ ~~ + // +-----+ | + // +---*----------+ + // | + + // v v + // +-+ +-+ + // 3 2 1 0 + // ...then shuffle the elements as follows: + // 0x05 = 0b00000101 + // +----------+||| + // | +---------+|| + // | | +----+| + // | | | +---+ + // v v v v + // 3 2 1 0 1 0 3 2 + __m256d weights = _mm256_mul_pd( + alphaBeta, + _mm256_shuffle_pd( + alphaBeta, + _mm256_permute2f128_pd(alphaBeta, alphaBeta, 0x01), 0x05)); + // Duplicate the even and odd elements... + __m256d weight1001 = _mm256_movedup_pd(weights); + __m256d weight0011 = _mm256_permute_pd(weights, 0x0F); + // ...then duplicate the low and high halves of the results + __m256d weight01 = + _mm256_permute2f128_pd(_mm256_undefined_pd(), weight1001, 0x22); + __m256d weight11 = + _mm256_permute2f128_pd(_mm256_undefined_pd(), weight0011, 0x22); + __m256d weight10 = + _mm256_permute2f128_pd(_mm256_undefined_pd(), weight1001, 0x33); + __m256d weight00 = + _mm256_permute2f128_pd(_mm256_undefined_pd(), weight0011, 0x33); +#endif +#ifdef __FMA4__ + // Doing it this way instead of using a chain of FMAs avoids stalls + _mm_storeu_ps( + outData, + _mm256_cvtpd_ps(_mm256_add_pd( + _mm256_macc_pd( + _mm256_cvtps_pd(upLeft), weight00, + _mm256_mul_pd(_mm256_cvtps_pd(upRight), weight01)), + _mm256_macc_pd( + _mm256_cvtps_pd(lowLeft), weight10, + _mm256_mul_pd(_mm256_cvtps_pd(lowRight), weight11))))); +#elif defined(__FMA__) + _mm_storeu_ps( + outData, + _mm256_cvtpd_ps(_mm256_add_pd( + _mm256_fmadd_pd( + _mm256_cvtps_pd(upLeft), weight00, + _mm256_mul_pd(_mm256_cvtps_pd(upRight), weight01)), + _mm256_fmadd_pd( + _mm256_cvtps_pd(lowLeft), weight10, + _mm256_mul_pd(_mm256_cvtps_pd(lowRight), weight11))))); +#else + // No VDPPD for 256-bit vectors... :/ + _mm_storeu_ps( + outData, + _mm256_cvtpd_ps(_mm256_add_pd( + _mm256_add_pd( + _mm256_mul_pd(_mm256_cvtps_pd(upLeft), weight00), + _mm256_mul_pd(_mm256_cvtps_pd(upRight), weight01)), + _mm256_add_pd( + _mm256_mul_pd(_mm256_cvtps_pd(lowLeft), weight10), + _mm256_mul_pd(_mm256_cvtps_pd(lowRight), weight11))))); +#endif +#else + __m128d alphaBeta = _mm_cvtps_pd(fracCoord); // b a + __m128d invAlphaBeta = + _mm_sub_pd(_mm_set1_pd(1.0), alphaBeta); // 1-b 1-a + __m128d weights[2]; + + // <1-a b> * <1-b 1-a> = <(1-a)(1-b) (1-a)b> <00 10> + weights[0] = _mm_mul_pd( + _mm_shuffle_pd(invAlphaBeta, alphaBeta, _MM_SHUFFLE2(1, 0)), + invAlphaBeta); + // * = <11 01> + weights[1] = _mm_mul_pd( + alphaBeta, + _mm_shuffle_pd(invAlphaBeta, alphaBeta, _MM_SHUFFLE2(0, 1))); + + __m128d upLeftL = _mm_cvtps_pd(upLeft); + __m128d upLeftH = + _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), upLeft)); + __m128d upRightL = _mm_cvtps_pd(upRight); + __m128d upRightH = + _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), upRight)); + __m128d lowLeftL = _mm_cvtps_pd(lowLeft); + __m128d lowLeftH = + _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), lowLeft)); + __m128d lowRightL = _mm_cvtps_pd(lowRight); + __m128d lowRightH = + _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), lowRight)); +#ifdef __SSE4_1__ + // In the immediate bytes, the high nibble determines which + // multiplies take place--in this case, both of them--and the low + // nibble determines which lines receive the sum--bit 0 for the low + // lane, bit 1 for the high. + __m128d rg = _mm_or_pd( + _mm_dp_pd(weights[0], _mm_unpacklo_pd(lowLeftL, upLeftL), 0x31), + _mm_dp_pd(weights[0], _mm_unpackhi_pd(lowLeftL, upLeftL), + 0x32)); + __m128d ba = _mm_or_pd( + _mm_dp_pd(weights[0], _mm_unpacklo_pd(lowLeftH, upLeftH), 0x31), + _mm_dp_pd(weights[0], _mm_unpackhi_pd(lowLeftH, upLeftH), + 0x32)); + rg = _mm_add_pd( + rg, + _mm_or_pd(_mm_dp_pd(weights[1], + _mm_unpacklo_pd(upRightL, lowRightL), 0x31), + _mm_dp_pd(weights[1], + _mm_unpackhi_pd(upRightL, lowRightL), + 0x32))); + ba = _mm_add_pd( + ba, + _mm_or_pd(_mm_dp_pd(weights[1], + _mm_unpacklo_pd(upRightH, lowRightH), 0x31), + _mm_dp_pd(weights[1], + _mm_unpackhi_pd(upRightH, lowRightH), + 0x32))); +#else +#ifdef __SSE3__ + __m128d weight10 = _mm_movedup_pd(weights[0]); + __m128d weight01 = _mm_movedup_pd(weights[1]); +#else + __m128d weight10 = _mm_unpacklo_pd(weights[0], weights[0]); + __m128d weight01 = _mm_unpacklo_pd(weights[1], weights[1]); +#endif + __m128d rg = _mm_add_pd(_mm_mul_pd(weight01, upRightL), + _mm_mul_pd(weight10, lowLeftL)); + __m128d ba = _mm_add_pd(_mm_mul_pd(weight01, upRightH), + _mm_mul_pd(weight10, lowLeftH)); + __m128d weight00 = _mm_unpackhi_pd(weights[0], weights[0]); + __m128d weight11 = _mm_unpackhi_pd(weights[1], weights[1]); + rg = _mm_add_pd(rg, + _mm_add_pd(_mm_mul_pd(weight00, upLeftL), + _mm_mul_pd(weight11, lowRightL))); + ba = _mm_add_pd(ba, + _mm_add_pd(_mm_mul_pd(weight00, upLeftH), + _mm_mul_pd(weight11, lowRightH))); +#endif + _mm_storeu_ps(outData, + _mm_movelh_ps(_mm_cvtpd_ps(rg), _mm_cvtpd_ps(ba))); +#endif + if (NULL == containsDenorms) _mm_setcsr(mxcsr); + } + else + { + // 3D linear filtering + __m128i icoord = + vifloorf(_mm_sub_ps(coord, _mm_set_ps(0.f, 0.5f, 0.5f, 0.5f))); + __m128i coord000 = _mm_and_si128(addressMask, adFn(icoord, extent)); + __m128i coord111 = _mm_and_si128( + addressMask, + adFn(_mm_sub_epi32(icoord, _mm_setmone_si128()), extent)); + + if (verbose) + log_info("\tActual integer coords used (i = floor(x-.5)): " + "i0:{%d, %d, %d} and i1:{%d, %d, %d}\n", + EXTRACT_I(coord000, 0), EXTRACT_I(coord000, 1), + EXTRACT_I(coord000, 2), EXTRACT_I(coord111, 0), + EXTRACT_I(coord111, 1), EXTRACT_I(coord111, 2)); + + // flush subnormal results to zero if necessary + // SSE has an FTZ mode that will be useful here + unsigned int mxcsr; + if (NULL == containsDenorms) + { + mxcsr = _mm_getcsr(); + _mm_setcsr(mxcsr & ~_MM_FLUSH_ZERO_MASK | _MM_FLUSH_ZERO_ON); + } + +#ifdef __AVX2__ + __m128i coord001 = _mm_blend_epi32(coord000, coord111, 0x01); + __m128i coord010 = _mm_blend_epi32(coord000, coord111, 0x02); + __m128i coord011 = _mm_blend_epi32(coord000, coord111, 0x03); + __m128i coord100 = _mm_blend_epi32(coord000, coord111, 0x04); + __m128i coord101 = _mm_blend_epi32(coord000, coord111, 0x05); + __m128i coord110 = _mm_blend_epi32(coord000, coord111, 0x06); +#elif defined(__SSE4_1__) + __m128i coord001 = _mm_blend_epi16(coord000, coord111, 0x03); + __m128i coord010 = _mm_blend_epi16(coord000, coord111, 0x0C); + __m128i coord011 = _mm_blend_epi16(coord000, coord111, 0x0F); + __m128i coord100 = _mm_blend_epi16(coord000, coord111, 0x30); + __m128i coord101 = _mm_blend_epi16(coord000, coord111, 0x33); + __m128i coord110 = _mm_blend_epi16(coord000, coord111, 0x3C); +#else + // XXX This is horrible without PBLEND... + __m128i negOne = _mm_setmone_si128(); + __m128i coordMask = _mm_bsrli_si128(negOne, 8); // = 0, 0, -1, -1 + __m128i coord011 = SELECT_I(coordMask, coord000, coord111); + coordMask = _mm_bsrli_si128(coordMask, 4); // = 0, 0, 0, -1 + __m128i coord001 = SELECT_I(coordMask, coord000, coord111); + coordMask = _mm_slli_epi64(coordMask, 32); // = 0, 0, -1, 0 + __m128i coord010 = SELECT_I(coordMask, coord000, coord111); + coordMask = _mm_srli_epi64(negOne, 32); // = 0, -1, 0, -1 + __m128i coord101 = SELECT_I(coordMask, coord000, coord111); + coordMask = _mm_bslli_si128(coordMask, 8); // = 0, -1, 0, 0 + __m128i coord100 = SELECT_I(coordMask, coord000, coord111); + coordMask = _mm_bslli_si128(_mm_bsrli_si128(negOne, 4), + 4); // = 0, -1, -1, 0 + __m128i coord110 = SELECT_I(coordMask, coord000, coord111); +#endif + + __m128 upLeftA, upRightA, lowLeftA, lowRightA; + __m128 upLeftB, upRightB, lowLeftB, lowRightB; + __m128 pixelMaxA, pixelMaxB, pixelMaxC; + upLeftA = check_for_denorms( + read_image_pixel_float(imageData, imageInfo, coord000, lod), + containsDenorms); + upRightA = check_for_denorms( + read_image_pixel_float(imageData, imageInfo, coord001, lod), + containsDenorms); + pixelMaxA = pixelMax(upLeftA, upRightA); + lowLeftA = check_for_denorms( + read_image_pixel_float(imageData, imageInfo, coord010, lod), + containsDenorms); + lowRightA = check_for_denorms( + read_image_pixel_float(imageData, imageInfo, coord011, lod), + containsDenorms); + pixelMaxB = pixelMax(lowLeftA, lowRightA); + pixelMaxC = pixelMax(pixelMaxA, pixelMaxB); + upLeftB = check_for_denorms( + read_image_pixel_float(imageData, imageInfo, coord100, lod), + containsDenorms); + upRightB = check_for_denorms( + read_image_pixel_float(imageData, imageInfo, coord101, lod), + containsDenorms); + pixelMaxA = pixelMax(upLeftB, upRightB); + lowLeftB = check_for_denorms( + read_image_pixel_float(imageData, imageInfo, coord110, lod), + containsDenorms); + lowRightB = check_for_denorms( + read_image_pixel_float(imageData, imageInfo, coord111, lod), + containsDenorms); + pixelMaxB = pixelMax(lowLeftB, lowRightB); + pixelMaxA = pixelMax(pixelMaxA, pixelMaxB); + _mm_storeu_ps(returnVal.p, pixelMax(pixelMaxA, pixelMaxC)); + + if (verbose) + { + if (NULL == containsDenorms) + log_info("\tSampled pixels (rgba order, denorms flushed to " + "zero):\n"); + else + log_info("\tSampled pixels (rgba order):\n"); + log_info("\t\tp000: %f, %f, %f, %f\n", EXTRACT_F(upLeftA, 0), + EXTRACT_F(upLeftA, 1), EXTRACT_F(upLeftA, 2), + EXTRACT_F(upLeftA, 3)); + log_info("\t\tp001: %f, %f, %f, %f\n", EXTRACT_F(upRightA, 0), + EXTRACT_F(upRightA, 1), EXTRACT_F(upRightA, 2), + EXTRACT_F(upRightA, 3)); + log_info("\t\tp010: %f, %f, %f, %f\n", EXTRACT_F(lowLeftA, 0), + EXTRACT_F(lowLeftA, 1), EXTRACT_F(lowLeftA, 2), + EXTRACT_F(lowLeftA, 3)); + log_info("\t\tp011: %f, %f, %f, %f\n\n", + EXTRACT_F(lowRightA, 0), EXTRACT_F(lowRightA, 1), + EXTRACT_F(lowRightA, 2), EXTRACT_F(lowRightA, 3)); + log_info("\t\tp100: %f, %f, %f, %f\n", EXTRACT_F(upLeftB, 0), + EXTRACT_F(upLeftB, 1), EXTRACT_F(upLeftB, 2), + EXTRACT_F(upLeftB, 3)); + log_info("\t\tp101: %f, %f, %f, %f\n", EXTRACT_F(upRightB, 0), + EXTRACT_F(upRightB, 1), EXTRACT_F(upRightB, 2), + EXTRACT_F(upRightB, 3)); + log_info("\t\tp110: %f, %f, %f, %f\n", EXTRACT_F(lowLeftB, 0), + EXTRACT_F(lowLeftB, 1), EXTRACT_F(lowLeftB, 2), + EXTRACT_F(lowLeftB, 3)); + log_info("\t\tp111: %f, %f, %f, %f\n", EXTRACT_F(lowRightB, 0), + EXTRACT_F(lowRightB, 1), EXTRACT_F(lowRightB, 2), + EXTRACT_F(lowRightB, 3)); + } + + __m128 fracCoord = + frac(_mm_sub_ps(coord, _mm_set_ps(0.f, 0.5f, 0.5f, 0.5f))); + if (verbose) + log_info("\tfrac( x - 0.5f ) = %f, frac( y - 0.5f ) = %f, " + "frac( z - 0.5f ) = %f\n", + EXTRACT_F(fracCoord, 0), EXTRACT_F(fracCoord, 1), + EXTRACT_F(fracCoord, 2)); + +#ifdef __AVX__ + __m256d alphaBetaGamma = _mm256_cvtps_pd(fracCoord); // x g b a + __m256d invABG = _mm256_sub_pd(_mm256_set1_pd(1.0), + alphaBetaGamma); // x 1-g 1-b 1-a + __m256d alphaBeta = + _mm256_permute2f128_pd(alphaBetaGamma, invABG, 0x20); + __m256d weights[2][2][2]; + // 1-g 1-g 1-g 1-g + // 1-b 1-a b a + // 1-a b a 1-b + // a(1-b)(1-g) (1-a)b(1-g) a*b(1-g) a(1-b)(1-g) + // 000 010 011 001 + // g g g g + // 1-b 1-a b a + // 1-a b a 1-b + // (1-a)(1-b)g (1-a)b*g a*b*g a(1-b)g + // 100 110 111 101 +#ifdef __AVX2__ + __m256d invGamma = + _mm256_permute4x64_pd(invABG, _MM_SHUFFLE(2, 2, 2, 2)); + weights[1][0][0] = _mm256_mul_pd( + alphaBeta, + _mm256_permute4x64_pd(alphaBeta, _MM_SHUFFLE(2, 1, 0, 3))); + weights[0][0][0] = _mm256_mul_pd(weights[1][0][0], invGamma); + __m256d gamma = + _mm256_permute4x64_pd(alphaBetaGamma, _MM_SHUFFLE(2, 2, 2, 2)); + weights[1][0][0] = _mm256_mul_pd(weights[1][0][0], gamma); + weights[0][0][1] = + _mm256_broadcastsd_pd(_mm256_castpd256_pd128(weights[0][0][0])); + weights[0][1][1] = _mm256_permute4x64_pd(weights[0][0][0], + _MM_SHUFFLE(1, 1, 1, 1)); + weights[1][0][1] = + _mm256_broadcastsd_pd(_mm256_castpd256_pd128(weights[1][0][0])); + weights[1][1][1] = _mm256_permute4x64_pd(weights[1][0][0], + _MM_SHUFFLE(1, 1, 1, 1)); + weights[0][1][0] = _mm256_permute4x64_pd(weights[0][0][0], + _MM_SHUFFLE(2, 2, 2, 2)); + weights[0][0][0] = _mm256_permute4x64_pd(weights[0][0][0], + _MM_SHUFFLE(3, 3, 3, 3)); + weights[1][1][0] = _mm256_permute4x64_pd(weights[1][0][0], + _MM_SHUFFLE(2, 2, 2, 2)); + weights[1][0][0] = _mm256_permute4x64_pd(weights[1][0][0], + _MM_SHUFFLE(3, 3, 3, 3)); +#else + // Much like before, we must permute the elements of alphaBeta to + // get them in the form we need + __m256d invGamma = _mm256_permute2f128( + _mm256_undefined_pd(), _mm256_movedup_pd(invABG), 0x33); + weights[1][0][0] = _mm_mul_pd( + alphaBeta, + _mm256_shuffle_pd( + alphaBeta, + _mm256_permute2f128_pd(alphaBeta, alphaBeta, 0x01), 0x05)); + weights[0][0][0] = _mm_mul_pd(weights[1], invGamma); + __m256d gamma = _mm256_permute2f128( + _mm256_undefined_pd(), _mm256_movedup_pd(alphaBetaGamma), 0x33); + weights[1][0][0] = _mm_mul_pd(weights[1], gamma); + weights[0][1][0] = _mm256_movedup_pd(weights[0][0][0]); + weights[0][0][0] = _mm256_permute_pd(weights[0][0][0], 0x0F); + weights[1][1][0] = _mm256_movedup_pd(weights[1][0][0]); + weights[1][0][0] = _mm256_permute_pd(weights[1][0][0], 0x0F); + weights[0][0][1] = _mm256_permute2f128_pd(_mm256_undefined_pd(), + weights[0][1][0], 0x22); + weights[0][1][1] = _mm256_permute2f128_pd(_mm256_undefined_pd(), + weights[0][0][0], 0x22); + weights[1][0][1] = _mm256_permute2f128_pd(_mm256_undefined_pd(), + weights[1][1][0], 0x22); + weights[1][1][1] = _mm256_permute2f128_pd(_mm256_undefined_pd(), + weights[1][0][0], 0x22); + weights[0][1][0] = _mm256_permute2f128_pd(_mm256_undefined_pd(), + weights[0][1][0], 0x33); + weights[0][0][0] = _mm256_permute2f128_pd(_mm256_undefined_pd(), + weights[0][0][0], 0x33); + weights[1][1][0] = _mm256_permute2f128_pd(_mm256_undefined_pd(), + weights[1][1][0], 0x33); + weights[1][0][0] = _mm256_permute2f128_pd(_mm256_undefined_pd(), + weights[1][0][0], 0x33); +#endif +#ifdef __FMA4__ + _mm_storeu_ps( + outData, + _mm256_cvtpd_ps(_mm256_add_pd( + _mm256_add_pd( + _mm256_macc_pd(_mm256_cvtps_pd(upLeftA), + weights[0][0][0], + _mm256_mul_pd(_mm256_cvtps_pd(upRightA), + weights[0][0][1])), + _mm256_macc_pd(_mm256_cvtps_pd(lowLeftA), + weights[0][1][0], + _mm256_mul_pd(_mm256_cvtps_pd(lowRightA), + weights[0][1][1]))), + _mm256_add_pd( + _mm256_macc_pd(_mm256_cvtps_pd(upLeftB), + weights[1][0][0], + _mm256_mul_pd(_mm256_cvtps_pd(upRightB), + weights[1][0][1])), + _mm256_macc_pd(_mm256_cvtps_pd(lowLeftB), + weights[1][1][0], + _mm256_mul_pd(_mm256_cvtps_pd(lowRightB), + weights[1][1][1])))))); +#elif defined(__FMA__) + _mm_storeu_ps( + outData, + _mm256_cvtpd_ps(_mm256_add_pd( + _mm256_add_pd( + _mm256_fmadd_pd(_mm256_cvtps_pd(upLeftA), + weights[0][0][0], + _mm256_mul_pd(_mm256_cvtps_pd(upRightA), + weights[0][0][1])), + _mm256_fmadd_pd( + _mm256_cvtps_pd(lowLeftA), weights[0][1][0], + _mm256_mul_pd(_mm256_cvtps_pd(lowRightA), + weights[0][1][1]))), + _mm256_add_pd( + _mm256_fmadd_pd(_mm256_cvtps_pd(upLeftB), + weights[1][0][0], + _mm256_mul_pd(_mm256_cvtps_pd(upRightB), + weights[1][0][1])), + _mm256_fmadd_pd( + _mm256_cvtps_pd(lowLeftB), weights[1][1][0], + _mm256_mul_pd(_mm256_cvtps_pd(lowRightB), + weights[1][1][1])))))); +#else + _mm_storeu_ps( + outData, + _mm256_cvtpd_ps(_mm256_add_pd( + _mm256_add_pd( + _mm256_add_pd(_mm256_mul_pd(_mm256_cvtps_pd(upLeftA), + weights[0][0][0]), + _mm256_mul_pd(_mm256_cvtps_pd(upRightA), + weights[0][0][1])), + _mm256_add_pd(_mm256_mul_pd(_mm256_cvtps_pd(lowLeftA), + weights[0][1][0]), + _mm256_mul_pd(_mm256_cvtps_pd(lowRightA), + weights[0][1][1]))), + _mm256_add_pd( + _mm256_add_pd(_mm256_mul_pd(_mm256_cvtps_pd(upLeftB), + weights[1][0][0]), + _mm256_mul_pd(_mm256_cvtps_pd(upRightB), + weights[1][0][1])), + _mm256_add_pd(_mm256_mul_pd(_mm256_cvtps_pd(lowLeftB), + weights[1][1][0]), + _mm256_mul_pd(_mm256_cvtps_pd(lowRightB), + weights[1][1][1])))))); +#endif +#else + __m128d alphaBeta = _mm_cvtps_pd(fracCoord); // b a + __m128d invAlphaBeta = + _mm_sub_pd(_mm_set1_pd(1.0), alphaBeta); // 1-b 1-a + __m128d gamma = + _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), fracCoord)); +#ifdef __SSE3__ + gamma = _mm_movedup_pd(gamma); +#else + gamma = _mm_unpacklo_pd(gamma, gamma); +#endif + __m128d invGamma = _mm_sub_pd(_mm_set1_pd(1.0), gamma); + __m128d weights[4]; + + // <1-a b> * <1-b 1-a> = <(1-a)(1-b) (1-a)b> <00 10> + weights[0] = _mm_mul_pd( + _mm_shuffle_pd(invAlphaBeta, alphaBeta, _MM_SHUFFLE2(1, 0)), + invAlphaBeta); + // * = <11 01> + weights[1] = _mm_mul_pd( + alphaBeta, + _mm_shuffle_pd(invAlphaBeta, alphaBeta, _MM_SHUFFLE2(0, 1))); + weights[2] = _mm_mul_pd(weights[0], gamma); + weights[3] = _mm_mul_pd(weights[1], gamma); + weights[0] = _mm_mul_pd(weights[0], invGamma); + weights[1] = _mm_mul_pd(weights[1], invGamma); + + __m128d upLeftAL = _mm_cvtps_pd(upLeftA); + __m128d upLeftAH = + _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), upLeftA)); + __m128d upLeftBL = _mm_cvtps_pd(upLeftB); + __m128d upLeftBH = + _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), upLeftB)); + __m128d upRightAL = _mm_cvtps_pd(upRightA); + __m128d upRightAH = + _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), upRightA)); + __m128d upRightBL = _mm_cvtps_pd(upRightB); + __m128d upRightBH = + _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), upRightB)); + __m128d lowLeftAL = _mm_cvtps_pd(lowLeftA); + __m128d lowLeftAH = + _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), lowLeftA)); + __m128d lowLeftBL = _mm_cvtps_pd(lowLeftB); + __m128d lowLeftBH = + _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), lowLeftB)); + __m128d lowRightAL = _mm_cvtps_pd(lowRightA); + __m128d lowRightAH = + _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), lowRightA)); + __m128d lowRightBL = _mm_cvtps_pd(lowRightB); + __m128d lowRightBH = + _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), lowRightB)); +#ifdef __SSE4_1__ + __m128d rg = _mm_or_pd( + _mm_dp_pd(weights[0], _mm_unpacklo_pd(lowLeftAL, upLeftAL), + 0x31), + _mm_dp_pd(weights[0], _mm_unpackhi_pd(lowLeftAL, upLeftAL), + 0x32)); + __m128d ba = _mm_or_pd( + _mm_dp_pd(weights[0], _mm_unpacklo_pd(lowLeftAH, upLeftAH), + 0x31), + _mm_dp_pd(weights[0], _mm_unpackhi_pd(lowLeftAH, upLeftAH), + 0x32)); + rg = _mm_add_pd( + rg, + _mm_or_pd( + _mm_dp_pd(weights[1], + _mm_unpacklo_pd(upRightAL, lowRightAL), 0x31), + _mm_dp_pd(weights[1], + _mm_unpackhi_pd(upRightAL, lowRightAL), 0x32))); + ba = _mm_add_pd( + ba, + _mm_or_pd( + _mm_dp_pd(weights[1], + _mm_unpacklo_pd(upRightAH, lowRightAH), 0x31), + _mm_dp_pd(weights[1], + _mm_unpackhi_pd(upRightAH, lowRightAH), 0x32))); + rg = _mm_add_pd( + rg, + _mm_or_pd(_mm_dp_pd(weights[2], + _mm_unpacklo_pd(lowLeftBL, upLeftBL), 0x31), + _mm_dp_pd(weights[2], + _mm_unpackhi_pd(lowLeftBL, upLeftBL), + 0x32))); + ba = _mm_add_pd( + ba, + _mm_or_pd(_mm_dp_pd(weights[2], + _mm_unpacklo_pd(lowLeftBH, upLeftBH), 0x31), + _mm_dp_pd(weights[2], + _mm_unpackhi_pd(lowLeftBH, upLeftBH), + 0x32))); + rg = _mm_add_pd( + rg, + _mm_or_pd( + _mm_dp_pd(weights[3], + _mm_unpacklo_pd(upRightBL, lowRightBL), 0x31), + _mm_dp_pd(weights[3], + _mm_unpackhi_pd(upRightBL, lowRightBL), 0x32))); + ba = _mm_add_pd( + ba, + _mm_or_pd( + _mm_dp_pd(weights[3], + _mm_unpacklo_pd(upRightBH, lowRightBH), 0x31), + _mm_dp_pd(weights[3], + _mm_unpackhi_pd(upRightBH, lowRightBH), 0x32))); +#else +#ifdef __SSE3__ + __m128d weight010 = _mm_movedup_pd(weights[0]); + __m128d weight001 = _mm_movedup_pd(weights[1]); +#else + __m128d weight010 = _mm_unpacklo_pd(weights[0], weights[0]); + __m128d weight001 = _mm_unpacklo_pd(weights[1], weights[1]); +#endif + __m128d rg = _mm_add_pd(_mm_mul_pd(weight001, upRightAL), + _mm_mul_pd(weight010, lowLeftAL)); + __m128d ba = _mm_add_pd(_mm_mul_pd(weight001, upRightAH), + _mm_mul_pd(weight010, lowLeftAH)); + __m128d weight000 = _mm_unpackhi_pd(weights[0], weights[0]); + __m128d weight011 = _mm_unpackhi_pd(weights[1], weights[1]); + rg = _mm_add_pd(rg, + _mm_add_pd(_mm_mul_pd(weight000, upLeftAL), + _mm_mul_pd(weight011, lowRightAL))); + ba = _mm_add_pd(ba, + _mm_add_pd(_mm_mul_pd(weight000, upLeftAH), + _mm_mul_pd(weight011, lowRightAH))); +#ifdef __SSE3__ + __m128d weight110 = _mm_movedup_pd(weights[2]); + __m128d weight101 = _mm_movedup_pd(weights[3]); +#else + __m128d weight110 = _mm_unpacklo_pd(weights[2], weights[2]); + __m128d weight101 = _mm_unpacklo_pd(weights[3], weights[3]); +#endif + rg = _mm_add_pd(rg, + _mm_add_pd(_mm_mul_pd(weight101, upRightBL), + _mm_mul_pd(weight110, lowLeftBL))); + ba = _mm_add_pd(rg, + _mm_add_pd(_mm_mul_pd(weight101, upRightBH), + _mm_mul_pd(weight110, lowLeftBH))); + __m128d weight100 = _mm_unpackhi_pd(weights[2], weights[2]); + __m128d weight111 = _mm_unpackhi_pd(weights[3], weights[3]); + rg = _mm_add_pd(rg, + _mm_add_pd(_mm_mul_pd(weight100, upLeftBL), + _mm_mul_pd(weight111, lowRightBL))); + ba = _mm_add_pd(ba, + _mm_add_pd(_mm_mul_pd(weight100, upLeftBH), + _mm_mul_pd(weight111, lowRightBH))); +#endif + _mm_storeu_ps(outData, + _mm_movelh_ps(_mm_cvtpd_ps(rg), _mm_cvtpd_ps(ba))); +#endif + if (NULL == containsDenorms) _mm_setcsr(mxcsr); + } + + return returnVal; + } +} +#else FloatPixel sample_image_pixel_float_offset( void *imageData, image_descriptor *imageInfo, float x, float y, float z, float xAddressOffset, float yAddressOffset, float zAddressOffset, @@ -1951,7 +4308,6 @@ FloatPixel sample_image_pixel_float_offset( // We need to unnormalize our coordinates differently depending on // the image type, but 'x' is always processed the same way. - x = unnormalize_coordinate("x", x, xAddressOffset, (float)width_lod, imageSampler->addressing_mode, verbose); @@ -2325,6 +4681,7 @@ FloatPixel sample_image_pixel_float_offset( return returnVal; } } +#endif FloatPixel sample_image_pixel_float_offset( void *imageData, image_descriptor *imageInfo, float x, float y, float z, diff --git a/test_conformance/images/kernel_read_write/test_common.cpp b/test_conformance/images/kernel_read_write/test_common.cpp index a22db195..a0734147 100644 --- a/test_conformance/images/kernel_read_write/test_common.cpp +++ b/test_conformance/images/kernel_read_write/test_common.cpp @@ -536,16 +536,20 @@ int test_read_image(cl_context context, cl_command_queue queue, } int nextLevelOffset = 0; + // Precalculate LOD dimensions for sample_image_pixel_offset() size_t width_lod = width_size, height_lod = height_size, depth_lod = depth_size; + image_descriptor lodInfo = *imageInfo; + lodInfo.num_mip_levels = 1; // Loop over all mipmap levels, if we are testing mipmapped images. for (int lod = 0; (gTestMipmaps && lod < imageInfo->num_mip_levels) || (!gTestMipmaps && lod < 1); lod++) { - size_t image_lod_size = get_image_num_pixels( - imageInfo, width_lod, height_lod, depth_lod, imageInfo->arraySize); + size_t image_lod_size = + get_image_num_pixels(&lodInfo, lodInfo.width, lodInfo.height, + lodInfo.depth, lodInfo.arraySize); test_assert_error(0 != image_lod_size, "Invalid image size"); size_t resultValuesSize = image_lod_size * get_explicit_type_size(outputType) * 4; @@ -565,11 +569,11 @@ int test_read_image(cl_context context, cl_command_queue queue, // Init the coordinates error = InitFloatCoordsCommon( - imageInfo, imageSampler, xOffsetValues, yOffsetValues, + &lodInfo, imageSampler, xOffsetValues, yOffsetValues, zOffsetValues, q >= float_offset_count ? -offset : offset, q >= float_offset_count ? offset : -offset, q >= float_offset_count ? -offset : offset, - imageSampler->normalized_coords, d, lod); + imageSampler->normalized_coords, d, 0); test_error(error, "Unable to initialise coordinates"); error = clEnqueueWriteBuffer(queue, xOffsets, CL_TRUE, 0, @@ -601,10 +605,9 @@ int test_read_image(cl_context context, cl_command_queue queue, test_error(error, "Unable to run kernel"); // Get results - error = clEnqueueReadBuffer( - queue, results, CL_TRUE, 0, - image_lod_size * get_explicit_type_size(outputType) * 4, - resultValues, 0, NULL, NULL); + error = clEnqueueReadBuffer(queue, results, CL_TRUE, 0, + resultValuesSize, resultValues, 0, NULL, + NULL); test_error(error, "Unable to read results from kernel"); if (gDebugTrace) log_info(" results read\n"); @@ -668,13 +671,13 @@ int test_read_image(cl_context context, cl_command_queue queue, int hasDenormals = 0; FloatPixel maxPixel = sample_image_pixel_float_offset( - imagePtr, imageInfo, + imagePtr, &lodInfo, xOffsetValues[j], yOffsetValues[j], zOffsetValues[j], norm_offset_x, norm_offset_y, norm_offset_z, imageSampler, expected, 0, - &hasDenormals, lod); + &hasDenormals, 0); float err1 = ABS_ERROR(sRGBmap(resultPtr[0]), @@ -726,7 +729,7 @@ int test_read_image(cl_context context, cl_command_queue queue, maxPixel = sample_image_pixel_float_offset( - imagePtr, imageInfo, + imagePtr, &lodInfo, xOffsetValues[j], yOffsetValues[j], zOffsetValues[j], @@ -734,7 +737,7 @@ int test_read_image(cl_context context, cl_command_queue queue, norm_offset_y, norm_offset_z, imageSampler, expected, - 0, NULL, lod); + 0, NULL, 0); err1 = ABS_ERROR( sRGBmap(resultPtr[0]), @@ -788,7 +791,7 @@ int test_read_image(cl_context context, cl_command_queue queue, int hasDenormals = 0; FloatPixel maxPixel = sample_image_pixel_float_offset( - imagePtr, imageInfo, + imagePtr, &lodInfo, xOffsetValues[j], yOffsetValues[j], zOffsetValues[j], @@ -796,7 +799,7 @@ int test_read_image(cl_context context, cl_command_queue queue, norm_offset_y, norm_offset_z, imageSampler, expected, 0, &hasDenormals, - lod); + 0); float err1 = ABS_ERROR(sRGBmap(resultPtr[0]), @@ -827,13 +830,13 @@ int test_read_image(cl_context context, cl_command_queue queue, maxPixel = sample_image_pixel_float( - imagePtr, imageInfo, + imagePtr, &lodInfo, xOffsetValues[j], yOffsetValues[j], zOffsetValues[j], imageSampler, expected, 0, NULL, - lod); + 0); err1 = ABS_ERROR( sRGBmap(resultPtr[0]), @@ -866,7 +869,7 @@ int test_read_image(cl_context context, cl_command_queue queue, shouldReturn |= determine_validation_error_offset< float>( - imagePtr, imageInfo, + imagePtr, &lodInfo, imageSampler, resultPtr, expected, error, xOffsetValues[j], @@ -876,11 +879,11 @@ int test_read_image(cl_context context, cl_command_queue queue, norm_offset_y, norm_offset_z, j, numTries, numClamped, - true, lod); + true, 0); log_error("Step by step:\n"); FloatPixel temp = sample_image_pixel_float_offset( - imagePtr, imageInfo, + imagePtr, &lodInfo, xOffsetValues[j], yOffsetValues[j], zOffsetValues[j], @@ -889,7 +892,7 @@ int test_read_image(cl_context context, cl_command_queue queue, norm_offset_z, imageSampler, tempOut, 1 /*verbose*/, - &hasDenormals, lod); + &hasDenormals, 0); log_error( "\tulps: %2.2f, %2.2f, " "%2.2f, %2.2f (max " @@ -986,13 +989,13 @@ int test_read_image(cl_context context, cl_command_queue queue, int hasDenormals = 0; FloatPixel maxPixel = sample_image_pixel_float_offset( - imagePtr, imageInfo, + imagePtr, &lodInfo, xOffsetValues[j], yOffsetValues[j], zOffsetValues[j], norm_offset_x, norm_offset_y, norm_offset_z, imageSampler, expected, 0, - &hasDenormals, lod); + &hasDenormals, 0); float err1 = ABS_ERROR(resultPtr[0], expected[0]); @@ -1051,7 +1054,7 @@ int test_read_image(cl_context context, cl_command_queue queue, maxPixel = sample_image_pixel_float_offset( - imagePtr, imageInfo, + imagePtr, &lodInfo, xOffsetValues[j], yOffsetValues[j], zOffsetValues[j], @@ -1059,7 +1062,7 @@ int test_read_image(cl_context context, cl_command_queue queue, norm_offset_y, norm_offset_z, imageSampler, expected, - 0, NULL, lod); + 0, NULL, 0); err1 = ABS_ERROR(resultPtr[0], expected[0]); @@ -1110,7 +1113,7 @@ int test_read_image(cl_context context, cl_command_queue queue, int hasDenormals = 0; FloatPixel maxPixel = sample_image_pixel_float_offset( - imagePtr, imageInfo, + imagePtr, &lodInfo, xOffsetValues[j], yOffsetValues[j], zOffsetValues[j], @@ -1118,7 +1121,7 @@ int test_read_image(cl_context context, cl_command_queue queue, norm_offset_y, norm_offset_z, imageSampler, expected, 0, &hasDenormals, - lod); + 0); float err1 = ABS_ERROR(resultPtr[0], expected[0]); @@ -1157,13 +1160,13 @@ int test_read_image(cl_context context, cl_command_queue queue, maxPixel = sample_image_pixel_float( - imagePtr, imageInfo, + imagePtr, &lodInfo, xOffsetValues[j], yOffsetValues[j], zOffsetValues[j], imageSampler, expected, 0, NULL, - lod); + 0); err1 = ABS_ERROR(resultPtr[0], @@ -1196,7 +1199,7 @@ int test_read_image(cl_context context, cl_command_queue queue, shouldReturn |= determine_validation_error_offset< float>( - imagePtr, imageInfo, + imagePtr, &lodInfo, imageSampler, resultPtr, expected, error, xOffsetValues[j], @@ -1206,11 +1209,11 @@ int test_read_image(cl_context context, cl_command_queue queue, norm_offset_y, norm_offset_z, j, numTries, numClamped, - true, lod); + true, 0); log_error("Step by step:\n"); FloatPixel temp = sample_image_pixel_float_offset( - imagePtr, imageInfo, + imagePtr, &lodInfo, xOffsetValues[j], yOffsetValues[j], zOffsetValues[j], @@ -1219,7 +1222,7 @@ int test_read_image(cl_context context, cl_command_queue queue, norm_offset_z, imageSampler, tempOut, 1 /*verbose*/, - &hasDenormals, lod); + &hasDenormals, 0); log_error( "\tulps: %2.2f, %2.2f, " "%2.2f, %2.2f (max " @@ -1314,11 +1317,11 @@ int test_read_image(cl_context context, cl_command_queue queue, } sample_image_pixel_offset( - imagePtr, imageInfo, + imagePtr, &lodInfo, xOffsetValues[j], yOffsetValues[j], zOffsetValues[j], norm_offset_x, norm_offset_y, norm_offset_z, - imageSampler, expected, lod); + imageSampler, expected, 0); error = errMax( errMax(abs_diff_uint(expected[0], @@ -1380,12 +1383,12 @@ int test_read_image(cl_context context, cl_command_queue queue, sample_image_pixel_offset< unsigned int>( - imagePtr, imageInfo, + imagePtr, &lodInfo, xOffsetValues[j], yOffsetValues[j], zOffsetValues[j], norm_offset_x, norm_offset_y, norm_offset_z, - imageSampler, expected, lod); + imageSampler, expected, 0); error = errMax( errMax( @@ -1412,7 +1415,7 @@ int test_read_image(cl_context context, cl_command_queue queue, shouldReturn |= determine_validation_error_offset< unsigned int>( - imagePtr, imageInfo, + imagePtr, &lodInfo, imageSampler, resultPtr, expected, error, xOffsetValues[j], @@ -1422,7 +1425,7 @@ int test_read_image(cl_context context, cl_command_queue queue, norm_offset_y, norm_offset_z, j, numTries, numClamped, - false, lod); + false, 0); } else { @@ -1497,11 +1500,11 @@ int test_read_image(cl_context context, cl_command_queue queue, } sample_image_pixel_offset( - imagePtr, imageInfo, + imagePtr, &lodInfo, xOffsetValues[j], yOffsetValues[j], zOffsetValues[j], norm_offset_x, norm_offset_y, norm_offset_z, - imageSampler, expected, lod); + imageSampler, expected, 0); error = errMax( errMax(abs_diff_int(expected[0], @@ -1563,12 +1566,12 @@ int test_read_image(cl_context context, cl_command_queue queue, } sample_image_pixel_offset( - imagePtr, imageInfo, + imagePtr, &lodInfo, xOffsetValues[j], yOffsetValues[j], zOffsetValues[j], norm_offset_x, norm_offset_y, norm_offset_z, - imageSampler, expected, lod); + imageSampler, expected, 0); error = errMax( errMax( @@ -1594,7 +1597,7 @@ int test_read_image(cl_context context, cl_command_queue queue, shouldReturn |= determine_validation_error_offset< int>( - imagePtr, imageInfo, + imagePtr, &lodInfo, imageSampler, resultPtr, expected, error, xOffsetValues[j], @@ -1604,7 +1607,7 @@ int test_read_image(cl_context context, cl_command_queue queue, norm_offset_y, norm_offset_z, j, numTries, numClamped, - false, lod); + false, 0); } else { @@ -1626,17 +1629,23 @@ int test_read_image(cl_context context, cl_command_queue queue, } } { - nextLevelOffset += width_lod * height_lod * depth_lod - * get_pixel_size(imageInfo->format); - width_lod = (width_lod >> 1) ? (width_lod >> 1) : 1; + nextLevelOffset += + image_lod_size * get_pixel_size(imageInfo->format); + width_lod = lodInfo.width = + (lodInfo.width >> 1) ? (lodInfo.width >> 1) : 1; if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) - { - height_lod = (height_lod >> 1) ? (height_lod >> 1) : 1; - } + height_lod = lodInfo.height = + (lodInfo.height >> 1) ? (lodInfo.height >> 1) : 1; if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY) - { - depth_lod = (depth_lod >> 1) ? (depth_lod >> 1) : 1; - } + depth_lod = lodInfo.depth = + (lodInfo.depth >> 1) ? (lodInfo.depth >> 1) : 1; + lodInfo.rowPitch = + lodInfo.width * get_pixel_size(imageInfo->format); + if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) + lodInfo.slicePitch = lodInfo.rowPitch; + else if (imageInfo->type == CL_MEM_OBJECT_IMAGE3D + || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY) + lodInfo.slicePitch = lodInfo.rowPitch * lodInfo.height; } }