// // Copyright (c) 2017,2021 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // #include "imageHelpers.h" #include #include #if defined(__APPLE__) #include #endif #if !defined(_WIN32) && !defined(__APPLE__) #include #endif #include #include #include #if !defined(_WIN32) #include #endif #ifdef __SSE2__ #include #if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 12) \ || (defined(__clang__) && !defined(__apple_build_version__) \ && __clang_major__ < 8) // Add missing intrinsics that aren't in ancient compilers, but are used below #ifdef __clang__ #define NODEBUG nodebug #else #define NODEBUG artificial #endif static inline __attribute__((always_inline, NODEBUG)) __m128i _mm_loadu_si16(const void *p) { struct _loadu_si16 { short v; } __attribute__((packed, may_alias)); short u = ((const struct _loadu_si16 *)p)->v; return _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, u); } static inline __attribute__((always_inline, NODEBUG)) __m128i _mm_loadu_si32(const void *p) { struct _loadu_si32 { int v; } __attribute__((packed, may_alias)); int u = ((const struct _loadu_si32 *)p)->v; return _mm_set_epi32(0, 0, 0, u); } #undef NODEBUG #endif #endif RoundingMode gFloatToHalfRoundingMode = kDefaultRoundingMode; cl_device_type gDeviceType = CL_DEVICE_TYPE_DEFAULT; bool gTestRounding = false; double sRGBmap(float fc) { double c = (double)fc; #if !defined(_WIN32) if (std::isnan(c)) c = 0.0; #else if (_isnan(c)) c = 0.0; #endif if (c > 1.0) c = 1.0; else if (c < 0.0) c = 0.0; else if (c < 0.0031308) c = 12.92 * c; else c = (1055.0 / 1000.0) * pow(c, 5.0 / 12.0) - (55.0 / 1000.0); return c * 255.0; } double sRGBunmap(float fc) { double c = (double)fc; double result; if (c <= 0.04045) result = c / 12.92; else result = pow((c + 0.055) / 1.055, 2.4); return result; } // Precalculated table of linear encodings of sRGB values float gSRGBTbl[] = { 0x0.000000p+00f, 0x1.3e4569p-12f, 0x1.3e4569p-11f, 0x1.dd681cp-11f, 0x1.3e4569p-10f, 0x1.8dd6c2p-10f, 0x1.dd681cp-10f, 0x1.167cbbp-09f, 0x1.3e4569p-09f, 0x1.660e15p-09f, 0x1.8dd6c2p-09f, 0x1.b6a31cp-09f, 0x1.e1e31ep-09f, 0x1.07c38cp-08f, 0x1.1fcc2cp-08f, 0x1.390ffbp-08f, 0x1.53936ep-08f, 0x1.6f5adfp-08f, 0x1.8c6a96p-08f, 0x1.aac6c3p-08f, 0x1.ca7383p-08f, 0x1.eb74e3p-08f, 0x1.06e76cp-07f, 0x1.18c2a6p-07f, 0x1.2b4e0ap-07f, 0x1.3e8b7cp-07f, 0x1.527cd7p-07f, 0x1.6723efp-07f, 0x1.7c8293p-07f, 0x1.929a89p-07f, 0x1.a96d92p-07f, 0x1.c0fd67p-07f, 0x1.d94bc1p-07f, 0x1.f25a47p-07f, 0x1.061553p-06f, 0x1.135f40p-06f, 0x1.210bbap-06f, 0x1.2f1b8ep-06f, 0x1.3d8f85p-06f, 0x1.4c6868p-06f, 0x1.5ba6fcp-06f, 0x1.6b4c05p-06f, 0x1.7b5843p-06f, 0x1.8bcc76p-06f, 0x1.9ca95ap-06f, 0x1.adefabp-06f, 0x1.bfa021p-06f, 0x1.d1bb75p-06f, 0x1.e4425ap-06f, 0x1.f73586p-06f, 0x1.054ad5p-05f, 0x1.0f31bbp-05f, 0x1.194fccp-05f, 0x1.23a55fp-05f, 0x1.2e32c9p-05f, 0x1.38f860p-05f, 0x1.43f678p-05f, 0x1.4f2d63p-05f, 0x1.5a9d76p-05f, 0x1.664701p-05f, 0x1.722a57p-05f, 0x1.7e47c8p-05f, 0x1.8a9fa3p-05f, 0x1.973239p-05f, 0x1.a3ffdcp-05f, 0x1.b108d3p-05f, 0x1.be4d6fp-05f, 0x1.cbcdfdp-05f, 0x1.d98ac9p-05f, 0x1.e78420p-05f, 0x1.f5ba4cp-05f, 0x1.0216ccp-04f, 0x1.096f28p-04f, 0x1.10e65ep-04f, 0x1.187c92p-04f, 0x1.2031eap-04f, 0x1.280689p-04f, 0x1.2ffa93p-04f, 0x1.380e2bp-04f, 0x1.404176p-04f, 0x1.489496p-04f, 0x1.5107aep-04f, 0x1.599ae0p-04f, 0x1.624e50p-04f, 0x1.6b2220p-04f, 0x1.741672p-04f, 0x1.7d2b67p-04f, 0x1.866121p-04f, 0x1.8fb7c1p-04f, 0x1.992f6ap-04f, 0x1.a2c83cp-04f, 0x1.ac8257p-04f, 0x1.b65dddp-04f, 0x1.c05aeep-04f, 0x1.ca79aap-04f, 0x1.d4ba31p-04f, 0x1.df1ca4p-04f, 0x1.e9a122p-04f, 0x1.f447cap-04f, 0x1.ff10bdp-04f, 0x1.04fe0dp-03f, 0x1.0a84ffp-03f, 0x1.101d45p-03f, 0x1.15c6eep-03f, 0x1.1b8209p-03f, 0x1.214ea6p-03f, 0x1.272cd4p-03f, 0x1.2d1ca2p-03f, 0x1.331e1fp-03f, 0x1.393159p-03f, 0x1.3f5660p-03f, 0x1.458d43p-03f, 0x1.4bd60fp-03f, 0x1.5230d4p-03f, 0x1.589da1p-03f, 0x1.5f1c83p-03f, 0x1.65ad8ap-03f, 0x1.6c50c3p-03f, 0x1.73063ep-03f, 0x1.79ce07p-03f, 0x1.80a82ep-03f, 0x1.8794c0p-03f, 0x1.8e93cbp-03f, 0x1.95a55ep-03f, 0x1.9cc987p-03f, 0x1.a40052p-03f, 0x1.ab49cfp-03f, 0x1.b2a60ap-03f, 0x1.ba1516p-03f, 0x1.c196f7p-03f, 0x1.c92bc1p-03f, 0x1.d0d380p-03f, 0x1.d88e41p-03f, 0x1.e05c12p-03f, 0x1.e83d00p-03f, 0x1.f0311ap-03f, 0x1.f8386ap-03f, 0x1.002980p-02f, 0x1.044074p-02f, 0x1.086118p-02f, 0x1.0c8b72p-02f, 0x1.10bf88p-02f, 0x1.14fd61p-02f, 0x1.194504p-02f, 0x1.1d9677p-02f, 0x1.21f1c0p-02f, 0x1.2656e6p-02f, 0x1.2ac5efp-02f, 0x1.2f3ee1p-02f, 0x1.33c1c3p-02f, 0x1.384e9ap-02f, 0x1.3ce56ep-02f, 0x1.418644p-02f, 0x1.463122p-02f, 0x1.4ae610p-02f, 0x1.4fa512p-02f, 0x1.546e2fp-02f, 0x1.59416dp-02f, 0x1.5e1ed2p-02f, 0x1.630665p-02f, 0x1.67f82bp-02f, 0x1.6cf42bp-02f, 0x1.71fa69p-02f, 0x1.770aedp-02f, 0x1.7c25bdp-02f, 0x1.814addp-02f, 0x1.867a55p-02f, 0x1.8bb42ap-02f, 0x1.90f862p-02f, 0x1.964703p-02f, 0x1.9ba012p-02f, 0x1.a10396p-02f, 0x1.a67194p-02f, 0x1.abea12p-02f, 0x1.b16d16p-02f, 0x1.b6faa6p-02f, 0x1.bc92c7p-02f, 0x1.c2357fp-02f, 0x1.c7e2d4p-02f, 0x1.cd9acbp-02f, 0x1.d35d6bp-02f, 0x1.d92ab8p-02f, 0x1.df02b9p-02f, 0x1.e4e572p-02f, 0x1.ead2ebp-02f, 0x1.f0cb27p-02f, 0x1.f6ce2dp-02f, 0x1.fcdc02p-02f, 0x1.017a56p-01f, 0x1.048c18p-01f, 0x1.07a34bp-01f, 0x1.0abfefp-01f, 0x1.0de209p-01f, 0x1.11099cp-01f, 0x1.1436a9p-01f, 0x1.176933p-01f, 0x1.1aa13ep-01f, 0x1.1ddecbp-01f, 0x1.2121dfp-01f, 0x1.246a7ap-01f, 0x1.27b8a0p-01f, 0x1.2b0c54p-01f, 0x1.2e6598p-01f, 0x1.31c46fp-01f, 0x1.3528dcp-01f, 0x1.3892e1p-01f, 0x1.3c0280p-01f, 0x1.3f77bdp-01f, 0x1.42f29ap-01f, 0x1.46731ap-01f, 0x1.49f93ep-01f, 0x1.4d850bp-01f, 0x1.511682p-01f, 0x1.54ada5p-01f, 0x1.584a79p-01f, 0x1.5becfep-01f, 0x1.5f9538p-01f, 0x1.634329p-01f, 0x1.66f6d4p-01f, 0x1.6ab03bp-01f, 0x1.6e6f61p-01f, 0x1.723449p-01f, 0x1.75fef4p-01f, 0x1.79cf65p-01f, 0x1.7da59fp-01f, 0x1.8181a5p-01f, 0x1.856378p-01f, 0x1.894b1cp-01f, 0x1.8d3892p-01f, 0x1.912bdep-01f, 0x1.952501p-01f, 0x1.9923ffp-01f, 0x1.9d28d9p-01f, 0x1.a13392p-01f, 0x1.a5442dp-01f, 0x1.a95aacp-01f, 0x1.ad7711p-01f, 0x1.b1995fp-01f, 0x1.b5c198p-01f, 0x1.b9efbep-01f, 0x1.be23d5p-01f, 0x1.c25ddep-01f, 0x1.c69ddcp-01f, 0x1.cae3d1p-01f, 0x1.cf2fc0p-01f, 0x1.d381abp-01f, 0x1.d7d994p-01f, 0x1.dc377ep-01f, 0x1.e09b6bp-01f, 0x1.e5055dp-01f, 0x1.e97557p-01f, 0x1.edeb5bp-01f, 0x1.f2676cp-01f, 0x1.f6e98bp-01f, 0x1.fb71bcp-01f, 0x1.000000p+00f, }; float sRGBunmap(cl_uchar ic) { return gSRGBTbl[ic]; } #ifdef __SSE2__ __m128 sRGBunmap(const __m128i ic) { static const float recip_255 = 1.0f / 255.0f; #ifdef __AVX2__ __m128 fc = _mm_i32gather_ps(gSRGBTbl, _mm_cvtepu8_epi32(ic), 4); // only RGB need to be converted for sRGBA return _mm_insert_ps( fc, _mm_mul_ss(_mm_cvtsi32_ss(_mm_undefined_ps(), _mm_extract_epi8(ic, 3)), _mm_set_ss(recip_255)), _MM_MK_INSERTPS_NDX(0, 3, 0)); #else // With no gather support, we'll need to load the four components // separately... uint32_t pixel = _mm_cvtsi128_si32(ic); return _mm_set_ps((float)(pixel >> 24) * recip_255, sRGBunmap((cl_uchar)((pixel >> 16) & 0xFF)), sRGBunmap((cl_uchar)((pixel >> 8) & 0xFF)), sRGBunmap((cl_uchar)(pixel & 0xFF))); #endif } #ifdef __SSE4_1__ #define SELECT_I(cond, a, b) _mm_blendv_epi8(b, a, cond) #define TEST_ANY_ZERO(v) !_mm_test_all_ones(v) #define TEST_NONZERO(v) !_mm_test_all_zeros(v, v) #define TEST_ZERO(v) _mm_test_all_zeros(v, v) #elif defined(__SSE2__) // n.b. "ANDNOT" is ~A & B, not A & ~B!! #define SELECT_I(cond, a, b) \ _mm_or_si128(_mm_and_si128(cond, a), _mm_andnot_si128(cond, b)) #ifdef __x86_64 #define TEST_NONZERO(v) \ (_mm_cvtsi128_si64(v) \ || _mm_cvtsi128_si64(_mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3)))) #else #define TEST_NONZERO(v) \ (_mm_cvtsi128_si32(v) \ || _mm_cvtsi128_si32( \ _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, 1)) \ || _mm_cvtsi128_si32(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, 2)))) \ || _mm_cvtsi128_si32(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, 3)))) #endif #define TEST_ZERO(v) (!TEST_NONZERO(v)) // The int64 extraction trick won't work here... :/ #define TEST_ANY_ZERO(v) \ (!_mm_cvtsi128_si32(v) \ || !_mm_cvtsi128_si32( \ _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, 1)) \ || !_mm_cvtsi128_si32(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, 2)))) \ || !_mm_cvtsi128_si32(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, 3)))) #endif #ifdef __GNUC__ #define __forceinline __attribute__((always_inline)) #endif static inline __m128i __forceinline _mm_setmone_si128(void) { __m128i junk = _mm_undefined_si128(); return _mm_cmpeq_epi32(junk, junk); } static inline __m128 cl_half_to_float(__m128i h) { #ifdef __F16C__ return _mm_cvtph_ps(h); #else // Type-punning to get direct access to underlying bits union { __m128 f; __m128i i; } f32; __m128i zero = _mm_setzero_si128(); __m128i negOne = _mm_setmone_si128(); __m128i one = _mm_srli_epi32(negOne, 31); __m128i h_exp_mask = _mm_srli_epi16(negOne, CL_HALF_MANT_DIG); // = 0x1f // Extract sign bit __m128i sign = _mm_slli_epi32(_mm_unpacklo_epi16(_mm_srli_epi16(h, 15), zero), 31); // Extract FP16 exponent and mantissa __m128i h_exp = _mm_and_si128(_mm_srli_epi16(h, CL_HALF_MANT_DIG - 1), h_exp_mask); __m128i h_mant = _mm_and_si128(h, _mm_srli_epi16(negOne, 6) /* 0x3ff */); // Remove FP16 exponent bias and convert to int32 __m128i exp = _mm_sub_epi16( h_exp, _mm_srli_epi16(negOne, CL_HALF_MANT_DIG + 1) /* CL_HALF_MAX_EXP - 1 */); #ifdef __SSE4_1__ exp = _mm_cvtepi16_epi32(exp); #else exp = _mm_unpacklo_epi16(exp, _mm_cmpgt_epi16(zero, exp)); #endif // Add FP32 exponent bias __m128i f_exp = _mm_add_epi32( exp, _mm_srli_epi32(negOne, CL_FLT_MANT_DIG + 1) /* CL_FLT_MAX_EXP - 1 */); // Convert mantissa to the 32-bit form __m128i f_mant = _mm_slli_epi32(_mm_unpacklo_epi16(h_mant, zero), CL_FLT_MANT_DIG - CL_HALF_MANT_DIG); // Note that due to the way SIMD works, we can't have branches--we have to // compute all the possible values. // Check for NaN / infinity __m128i inf_mask = _mm_cmpeq_epi16(h_exp, h_exp_mask); inf_mask = _mm_unpacklo_epi16(inf_mask, inf_mask); __m128i mant_zero_mask = _mm_cmpeq_epi32(f_mant, zero); // n.b. "ANDNOT" is ~A & B, not A & ~B!! __m128i nan_mask = _mm_andnot_si128(mant_zero_mask, inf_mask); // NaN -> propagate mantissa and silence it __m128i f_mant_nan = _mm_or_si128(f_mant, _mm_slli_epi32(one, 22) /* 0x400000 */); // Infinity -> zero mantissa f_mant = SELECT_I(nan_mask, f_mant_nan, f_mant); f_exp = SELECT_I(inf_mask, _mm_srli_epi32(negOne, CL_FLT_MANT_DIG) /* 0xff */, f_exp); // Check for zero / denormal __m128i exp_zero_mask = _mm_cmpeq_epi16(h_exp, zero); exp_zero_mask = _mm_unpacklo_epi16(exp_zero_mask, exp_zero_mask); __m128i zero_mask = _mm_and_si128(mant_zero_mask, exp_zero_mask); // n.b. "ANDNOT" is ~A & B, not A & ~B!! __m128i denorm_mask = _mm_andnot_si128(mant_zero_mask, exp_zero_mask); if (TEST_NONZERO(denorm_mask)) { // Denormal -> normalize it // - Shift mantissa to make most-significant 1 implicit // - Adjust exponent accordingly __m128i f_mant_mask = _mm_srli_epi32(negOne, 32 - (CL_FLT_MANT_DIG - 1)); #if defined(__AVX512VL__) && defined(__AVX512DQ__) // We'll probably never get here, since most CPUs that support AVX-512 // also support F16C. n.b. No +1 yet for the implicit 1 before the radix // point--we really do want to shift at least one place __m128i shift = _mm_and_si128(_mm_sub_epi32(_mm_lzcnt_epi32(f_mant), _mm_set1_epi32(32 - CL_FLT_MANT_DIG)), denorm_mask); f_mant = _mm_sllv_epi32(f_mant, shift); shift = _mm_sub_epi32(shift, _mm_and_si128(one, denorm_mask)); #else // No packed leading-zero count until AVX-512... gotta do this the hard // way __m128i shift = zero; __m128i shift_mask = _mm_cmpgt_epi32( f_mant, _mm_srli_epi32(negOne, 16) /* 0x0000FFFF */); __m128i f_mant_shift = SELECT_I(shift_mask, f_mant, _mm_slli_epi32(f_mant, 16)); // n.b. "ANDNOT" is ~A & B, not A & ~B!! shift = _mm_add_epi32( shift, _mm_andnot_si128(shift_mask, _mm_slli_epi32(one, 4) /* 16 */)); // Starting from here, we also need to check that mant >= 0, because // PCMPGT does a signed comparison; unsigned comparisons weren't added // until AVX-512, which also already has a faster way to count leading // zeroes anyway shift_mask = _mm_or_si128( _mm_cmplt_epi32(f_mant_shift, zero), _mm_cmpgt_epi32(f_mant_shift, _mm_srli_epi32(negOne, 8) /* 0x00FFFFFF */)); f_mant_shift = SELECT_I(shift_mask, f_mant_shift, _mm_slli_epi32(f_mant_shift, 8)); shift = _mm_add_epi32( shift, _mm_andnot_si128(shift_mask, _mm_slli_epi32(one, 3) /* 8 */)); shift_mask = _mm_or_si128( _mm_cmplt_epi32(f_mant_shift, zero), _mm_cmpgt_epi32(f_mant_shift, _mm_srli_epi32(negOne, 4) /* 0x0FFFFFFF */)); f_mant_shift = SELECT_I(shift_mask, f_mant_shift, _mm_slli_epi32(f_mant_shift, 4)); shift = _mm_add_epi32( shift, _mm_andnot_si128(shift_mask, _mm_slli_epi32(one, 2) /* 4 */)); shift_mask = _mm_or_si128( _mm_cmplt_epi32(f_mant_shift, zero), _mm_cmpgt_epi32(f_mant_shift, _mm_srli_epi32(negOne, 2) /* 0x3FFFFFFF */)); f_mant_shift = SELECT_I(shift_mask, f_mant_shift, _mm_slli_epi32(f_mant_shift, 2)); shift = _mm_add_epi32( shift, _mm_andnot_si128(shift_mask, _mm_slli_epi32(one, 1) /* 2 */)); shift_mask = _mm_or_si128( _mm_cmplt_epi32(f_mant_shift, zero), _mm_cmpgt_epi32(f_mant_shift, _mm_srli_epi32(negOne, 1) /* 0x7FFFFFFF */)); f_mant_shift = SELECT_I(shift_mask, f_mant_shift, _mm_slli_epi32(f_mant_shift, 1)); shift = _mm_add_epi32(shift, _mm_andnot_si128(shift_mask, one)); f_mant = SELECT_I(denorm_mask, _mm_srli_epi32(f_mant_shift, 32 - CL_FLT_MANT_DIG), f_mant); shift = _mm_and_si128( _mm_sub_epi32(shift, _mm_set1_epi32(32 - CL_FLT_MANT_DIG + 1)), denorm_mask); #endif f_mant = _mm_and_si128(f_mant, f_mant_mask); f_exp = _mm_sub_epi32(f_exp, shift); } // Zero -> zero exponent // n.b. "ANDNOT" is ~A & B, not A & ~B!! f_exp = _mm_andnot_si128(zero_mask, f_exp); f32.i = _mm_or_si128( sign, _mm_or_si128(_mm_slli_epi32(f_exp, CL_FLT_MANT_DIG - 1), f_mant)); return f32.f; #endif } #endif uint32_t get_format_type_size(const cl_image_format *format) { return get_channel_data_type_size(format->image_channel_data_type); } uint32_t get_channel_data_type_size(cl_channel_type channelType) { switch (channelType) { case CL_SNORM_INT8: case CL_UNORM_INT8: case CL_SIGNED_INT8: case CL_UNSIGNED_INT8: return 1; case CL_SNORM_INT16: case CL_UNORM_INT16: case CL_SIGNED_INT16: case CL_UNSIGNED_INT16: case CL_HALF_FLOAT: #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: #endif return sizeof(cl_short); case CL_SIGNED_INT32: case CL_UNSIGNED_INT32: return sizeof(cl_int); case CL_UNORM_SHORT_565: case CL_UNORM_SHORT_555: #ifdef OBSOLETE_FORAMT case CL_UNORM_SHORT_565_REV: case CL_UNORM_SHORT_555_REV: #endif return 2; #ifdef OBSOLETE_FORAMT case CL_UNORM_INT_8888: case CL_UNORM_INT_8888_REV: return 4; #endif case CL_UNORM_INT_101010: #ifdef OBSOLETE_FORAMT case CL_UNORM_INT_101010_REV: #endif return 4; case CL_FLOAT: return sizeof(cl_float); default: return 0; } } uint32_t get_format_channel_count(const cl_image_format *format) { return get_channel_order_channel_count(format->image_channel_order); } uint32_t get_channel_order_channel_count(cl_channel_order order) { switch (order) { case CL_R: case CL_A: case CL_Rx: case CL_INTENSITY: case CL_LUMINANCE: case CL_DEPTH: case CL_DEPTH_STENCIL: return 1; case CL_RG: case CL_RA: case CL_RGx: return 2; case CL_RGB: case CL_RGBx: case CL_sRGB: case CL_sRGBx: return 3; case CL_RGBA: case CL_ARGB: case CL_BGRA: case CL_sRGBA: case CL_sBGRA: case CL_ABGR: #ifdef CL_1RGB_APPLE case CL_1RGB_APPLE: #endif #ifdef CL_BGR1_APPLE case CL_BGR1_APPLE: #endif #ifdef CL_ABGR_APPLE case CL_ABGR_APPLE: #endif return 4; default: log_error("%s does not support 0x%x\n", __FUNCTION__, order); return 0; } } cl_channel_type get_channel_type_from_name(const char *name) { struct { cl_channel_type type; const char *name; } typeNames[] = { { CL_SNORM_INT8, "CL_SNORM_INT8" }, { CL_SNORM_INT16, "CL_SNORM_INT16" }, { CL_UNORM_INT8, "CL_UNORM_INT8" }, { CL_UNORM_INT16, "CL_UNORM_INT16" }, { CL_UNORM_INT24, "CL_UNORM_INT24" }, { CL_UNORM_SHORT_565, "CL_UNORM_SHORT_565" }, { CL_UNORM_SHORT_555, "CL_UNORM_SHORT_555" }, { CL_UNORM_INT_101010, "CL_UNORM_INT_101010" }, { CL_SIGNED_INT8, "CL_SIGNED_INT8" }, { CL_SIGNED_INT16, "CL_SIGNED_INT16" }, { CL_SIGNED_INT32, "CL_SIGNED_INT32" }, { CL_UNSIGNED_INT8, "CL_UNSIGNED_INT8" }, { CL_UNSIGNED_INT16, "CL_UNSIGNED_INT16" }, { CL_UNSIGNED_INT32, "CL_UNSIGNED_INT32" }, { CL_HALF_FLOAT, "CL_HALF_FLOAT" }, { CL_FLOAT, "CL_FLOAT" }, #ifdef CL_SFIXED14_APPLE { CL_SFIXED14_APPLE, "CL_SFIXED14_APPLE" } #endif }; for (size_t i = 0; i < sizeof(typeNames) / sizeof(typeNames[0]); i++) { if (strcmp(typeNames[i].name, name) == 0 || strcmp(typeNames[i].name + 3, name) == 0) return typeNames[i].type; } return (cl_channel_type)-1; } cl_channel_order get_channel_order_from_name(const char *name) { const struct { cl_channel_order order; const char *name; } orderNames[] = { { CL_R, "CL_R" }, { CL_A, "CL_A" }, { CL_Rx, "CL_Rx" }, { CL_RG, "CL_RG" }, { CL_RA, "CL_RA" }, { CL_RGx, "CL_RGx" }, { CL_RGB, "CL_RGB" }, { CL_RGBx, "CL_RGBx" }, { CL_RGBA, "CL_RGBA" }, { CL_BGRA, "CL_BGRA" }, { CL_ARGB, "CL_ARGB" }, { CL_INTENSITY, "CL_INTENSITY" }, { CL_LUMINANCE, "CL_LUMINANCE" }, { CL_DEPTH, "CL_DEPTH" }, { CL_DEPTH_STENCIL, "CL_DEPTH_STENCIL" }, { CL_sRGB, "CL_sRGB" }, { CL_sRGBx, "CL_sRGBx" }, { CL_sRGBA, "CL_sRGBA" }, { CL_sBGRA, "CL_sBGRA" }, { CL_ABGR, "CL_ABGR" }, #ifdef CL_1RGB_APPLE { CL_1RGB_APPLE, "CL_1RGB_APPLE" }, #endif #ifdef CL_BGR1_APPLE { CL_BGR1_APPLE, "CL_BGR1_APPLE" }, #endif }; for (size_t i = 0; i < sizeof(orderNames) / sizeof(orderNames[0]); i++) { if (strcmp(orderNames[i].name, name) == 0 || strcmp(orderNames[i].name + 3, name) == 0) return orderNames[i].order; } return (cl_channel_order)-1; } int is_format_signed(const cl_image_format *format) { switch (format->image_channel_data_type) { case CL_SNORM_INT8: case CL_SIGNED_INT8: case CL_SNORM_INT16: case CL_SIGNED_INT16: case CL_SIGNED_INT32: case CL_HALF_FLOAT: case CL_FLOAT: #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: #endif return 1; default: return 0; } } uint32_t get_pixel_size(const cl_image_format *format) { switch (format->image_channel_data_type) { case CL_SNORM_INT8: case CL_UNORM_INT8: case CL_SIGNED_INT8: case CL_UNSIGNED_INT8: return get_format_channel_count(format); case CL_SNORM_INT16: case CL_UNORM_INT16: case CL_SIGNED_INT16: case CL_UNSIGNED_INT16: case CL_HALF_FLOAT: #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: #endif return get_format_channel_count(format) * sizeof(cl_ushort); case CL_SIGNED_INT32: case CL_UNSIGNED_INT32: return get_format_channel_count(format) * sizeof(cl_int); case CL_UNORM_SHORT_565: case CL_UNORM_SHORT_555: #ifdef OBSOLETE_FORAMT case CL_UNORM_SHORT_565_REV: case CL_UNORM_SHORT_555_REV: #endif return 2; #ifdef OBSOLETE_FORAMT case CL_UNORM_INT_8888: case CL_UNORM_INT_8888_REV: return 4; #endif case CL_UNORM_INT_101010: #ifdef OBSOLETE_FORAMT case CL_UNORM_INT_101010_REV: #endif return 4; case CL_FLOAT: return get_format_channel_count(format) * sizeof(cl_float); default: return 0; } } uint32_t next_power_of_two(uint32_t v) { v--; v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16; v++; return v; } uint32_t get_pixel_alignment(const cl_image_format *format) { return next_power_of_two(get_pixel_size(format)); } int get_8_bit_image_format(cl_context context, cl_mem_object_type objType, cl_mem_flags flags, size_t channelCount, cl_image_format *outFormat) { cl_image_format formatList[128]; unsigned int outFormatCount, i; int error; /* Make sure each image format is supported */ if ((error = clGetSupportedImageFormats(context, flags, objType, 128, formatList, &outFormatCount))) return error; /* Look for one that is an 8-bit format */ for (i = 0; i < outFormatCount; i++) { if (formatList[i].image_channel_data_type == CL_SNORM_INT8 || formatList[i].image_channel_data_type == CL_UNORM_INT8 || formatList[i].image_channel_data_type == CL_SIGNED_INT8 || formatList[i].image_channel_data_type == CL_UNSIGNED_INT8) { if (!channelCount || (channelCount && (get_format_channel_count(&formatList[i]) == channelCount))) { *outFormat = formatList[i]; return 0; } } } return -1; } int get_32_bit_image_format(cl_context context, cl_mem_object_type objType, cl_mem_flags flags, size_t channelCount, cl_image_format *outFormat) { cl_image_format formatList[128]; unsigned int outFormatCount, i; int error; /* Make sure each image format is supported */ if ((error = clGetSupportedImageFormats(context, flags, objType, 128, formatList, &outFormatCount))) return error; /* Look for one that is an 8-bit format */ for (i = 0; i < outFormatCount; i++) { if (formatList[i].image_channel_data_type == CL_UNORM_INT_101010 || formatList[i].image_channel_data_type == CL_FLOAT || formatList[i].image_channel_data_type == CL_SIGNED_INT32 || formatList[i].image_channel_data_type == CL_UNSIGNED_INT32) { if (!channelCount || (channelCount && (get_format_channel_count(&formatList[i]) == channelCount))) { *outFormat = formatList[i]; return 0; } } } return -1; } void print_first_pixel_difference_error(size_t where, const char *sourcePixel, const char *destPixel, image_descriptor *imageInfo, size_t y, size_t thirdDim) { size_t pixel_size = get_pixel_size(imageInfo->format); log_error("ERROR: Scanline %d did not verify for image size %d,%d,%d " "pitch %d (extra %d bytes)\n", (int)y, (int)imageInfo->width, (int)imageInfo->height, (int)thirdDim, (int)imageInfo->rowPitch, (int)imageInfo->rowPitch - (int)imageInfo->width * (int)pixel_size); log_error("Failed at column: %zu ", where); switch (pixel_size) { case 1: log_error("*0x%2.2x vs. 0x%2.2x\n", ((cl_uchar *)sourcePixel)[0], ((cl_uchar *)destPixel)[0]); break; case 2: log_error("*0x%4.4x vs. 0x%4.4x\n", ((cl_ushort *)sourcePixel)[0], ((cl_ushort *)destPixel)[0]); break; case 3: log_error("*{0x%2.2x, 0x%2.2x, 0x%2.2x} vs. " "{0x%2.2x, 0x%2.2x, 0x%2.2x}\n", ((cl_uchar *)sourcePixel)[0], ((cl_uchar *)sourcePixel)[1], ((cl_uchar *)sourcePixel)[2], ((cl_uchar *)destPixel)[0], ((cl_uchar *)destPixel)[1], ((cl_uchar *)destPixel)[2]); break; case 4: log_error("*0x%8.8x vs. 0x%8.8x\n", ((cl_uint *)sourcePixel)[0], ((cl_uint *)destPixel)[0]); break; case 6: log_error( "*{0x%4.4x, 0x%4.4x, 0x%4.4x} vs. " "{0x%4.4x, 0x%4.4x, 0x%4.4x}\n", ((cl_ushort *)sourcePixel)[0], ((cl_ushort *)sourcePixel)[1], ((cl_ushort *)sourcePixel)[2], ((cl_ushort *)destPixel)[0], ((cl_ushort *)destPixel)[1], ((cl_ushort *)destPixel)[2]); break; case 8: log_error("*0x%16.16" PRIx64 " vs. 0x%16.16" PRIx64 "\n", ((cl_ulong *)sourcePixel)[0], ((cl_ulong *)destPixel)[0]); break; case 12: log_error("*{0x%8.8x, 0x%8.8x, 0x%8.8x} vs. " "{0x%8.8x, 0x%8.8x, 0x%8.8x}\n", ((cl_uint *)sourcePixel)[0], ((cl_uint *)sourcePixel)[1], ((cl_uint *)sourcePixel)[2], ((cl_uint *)destPixel)[0], ((cl_uint *)destPixel)[1], ((cl_uint *)destPixel)[2]); break; case 16: log_error("*{0x%8.8x, 0x%8.8x, 0x%8.8x, 0x%8.8x} vs. " "{0x%8.8x, 0x%8.8x, 0x%8.8x, 0x%8.8x}\n", ((cl_uint *)sourcePixel)[0], ((cl_uint *)sourcePixel)[1], ((cl_uint *)sourcePixel)[2], ((cl_uint *)sourcePixel)[3], ((cl_uint *)destPixel)[0], ((cl_uint *)destPixel)[1], ((cl_uint *)destPixel)[2], ((cl_uint *)destPixel)[3]); break; default: log_error("Don't know how to print pixel size of %zu\n", pixel_size); break; } } size_t compare_scanlines(const image_descriptor *imageInfo, const char *aPtr, const char *bPtr) { size_t pixel_size = get_pixel_size(imageInfo->format); size_t column; for (column = 0; column < imageInfo->width; column++) { switch (imageInfo->format->image_channel_data_type) { // If the data type is 101010, then ignore bits 31 and 32 when // comparing the row case CL_UNORM_INT_101010: { cl_uint aPixel = *(cl_uint *)aPtr; cl_uint bPixel = *(cl_uint *)bPtr; if ((aPixel & 0x3fffffff) != (bPixel & 0x3fffffff)) return column; } break; // If the data type is 555, ignore bit 15 when comparing the row case CL_UNORM_SHORT_555: { cl_ushort aPixel = *(cl_ushort *)aPtr; cl_ushort bPixel = *(cl_ushort *)bPtr; if ((aPixel & 0x7fff) != (bPixel & 0x7fff)) return column; } break; default: if (memcmp(aPtr, bPtr, pixel_size) != 0) return column; break; } aPtr += pixel_size; bPtr += pixel_size; } // If we didn't find a difference, return the width of the image return column; } int random_log_in_range(int minV, int maxV, MTdata d) { double v = log2(((double)genrand_int32(d) / (double)0xffffffff) + 1); int iv = (int)((float)(maxV - minV) * v); return iv + minV; } #ifdef __SSE2__ static inline __m128i vifloorf(__m128 f) { #ifdef __SSE4_1__ return _mm_cvtps_epi32( _mm_round_ps(f, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); #else // No packed rounding until SSE4... do this the old-fashioned way unsigned int mxcsr = _mm_getcsr(); _mm_setcsr(mxcsr & ~_MM_ROUND_MASK | _MM_ROUND_DOWN); __m128i i = _mm_cvtps_epi32(f); _mm_setcsr(mxcsr); return i; #endif } static inline __m128 vfloorf(__m128 f) { #ifdef __SSE4_1__ return _mm_round_ps(f, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); #else // No packed rounding until SSE4... do this the old-fashioned way unsigned int mxcsr = _mm_getcsr(); _mm_setcsr(mxcsr & ~_MM_ROUND_MASK | _MM_ROUND_DOWN); f = _mm_cvtepi32_ps(_mm_cvtps_epi32(f)); _mm_setcsr(mxcsr); return f; #endif } static inline __m128 frac(__m128 a) { return _mm_sub_ps(a, vfloorf(a)); } #else static inline float frac(float a) { return a - floorf(a); } #endif // Define the addressing functions #ifdef __SSE2__ typedef __m128i (*AddressFn)(__m128i value, __m128i maxValue); __m128i NoAddressFn(__m128i value, __m128i maxValue) { return value; } __m128i RepeatAddressFn(__m128i value, __m128i maxValue) { __m128i minMask = _mm_cmplt_epi32(value, _mm_setzero_si128()); __m128i maxMask = _mm_cmpgt_epi32(value, _mm_add_epi32(maxValue, _mm_setmone_si128())); return SELECT_I(minMask, _mm_add_epi32(value, maxValue), SELECT_I(maxMask, _mm_sub_epi32(value, maxValue), value)); } __m128i MirroredRepeatAddressFn(__m128i value, __m128i maxValue) { #ifdef __SSE4_1__ return _mm_max_epi32( _mm_min_epi32(value, _mm_add_epi32(maxValue, _mm_setmone_si128())), _mm_setzero_si128()); #else __m128i zero = _mm_setzero_si128(); maxValue = _mm_add_epi32(maxValue, _mm_setmone_si128()); __m128i minMask = _mm_cmplt_epi32(value, zero); __m128i maxMask = _mm_cmpgt_epi32(value, maxValue); return SELECT_I(minMask, zero, SELECT_I(maxMask, maxValue, value)); #endif } __m128i ClampAddressFn(__m128i value, __m128i maxValue) { __m128i negOne = _mm_cmpgt_epi32(maxValue, _mm_setzero_si128()); #ifdef __SSE4_1__ return _mm_max_epi32(_mm_min_epi32(value, maxValue), negOne); #else __m128i minMask = _mm_cmplt_epi32(value, negOne); __m128i maxMask = _mm_cmpgt_epi32(value, maxValue); return SELECT_I(minMask, negOne, SELECT_I(maxMask, maxValue, value)); #endif } __m128i ClampToEdgeNearestFn(__m128i value, __m128i maxValue) { #ifdef __SSE4_1__ return _mm_max_epi32( _mm_min_epi32(value, _mm_add_epi32(maxValue, _mm_setmone_si128())), _mm_setzero_si128()); #else __m128i zero = _mm_setzero_si128(); maxValue = _mm_add_epi32(maxValue, _mm_setmone_si128()); __m128i minMask = _mm_cmplt_epi32(value, zero); __m128i maxMask = _mm_cmpgt_epi32(value, maxValue); return SELECT_I(minMask, zero, SELECT_I(maxMask, maxValue, value)); #endif } AddressFn ClampToEdgeLinearFn = ClampToEdgeNearestFn; // Note: normalized coords get repeated in normalized space, not unnormalized // space! hence the special case here __m128 RepeatNormalizedAddressFn(__m128 fValue, __m128i maxValue) { return _mm_mul_ps(frac(fValue), _mm_cvtepi32_ps(maxValue)); } static inline __m128 vrintf(__m128 f) { #ifdef __SSE4_1__ return _mm_round_ps(f, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); #else // No packed rounding until SSE4... do this the old-fashioned way return _mm_cvtepi32_ps(_mm_cvtps_epi32(f)); #endif } static inline __m128 vfabsf(__m128 f) { return _mm_andnot_ps(_mm_castsi128_ps(_mm_slli_epi32(_mm_setmone_si128(), 31) /* 0x80000000 */), f); } __m128 MirroredRepeatNormalizedAddressFn(__m128 fValue, __m128i maxValue) { // Round to nearest multiple of two. // Note halfway values flip flop here due to rte, but they both end up // pointing the same place at the end of the day. __m128 s_prime = vrintf(_mm_mul_ps(fValue, _mm_set1_ps(0.5f))); s_prime = _mm_add_ps(s_prime, s_prime); // Reduce to [-1, 1], Apply mirroring -> [0, 1] s_prime = vfabsf(_mm_sub_ps(fValue, s_prime)); // un-normalize return _mm_mul_ps(s_prime, _mm_cvtepi32_ps(maxValue)); } #else typedef int (*AddressFn)(int value, size_t maxValue); int NoAddressFn(int value, size_t maxValue) { return value; } int RepeatAddressFn(int value, size_t maxValue) { if (value < 0) value += (int)maxValue; else if (value >= (int)maxValue) value -= (int)maxValue; return value; } int MirroredRepeatAddressFn(int value, size_t maxValue) { if (value < 0) value = 0; else if ((size_t)value >= maxValue) value = (int)(maxValue - 1); return value; } int ClampAddressFn(int value, size_t maxValue) { return (value < -1) ? -1 : ((value > (cl_long)maxValue) ? (int)maxValue : value); } int ClampToEdgeNearestFn(int value, size_t maxValue) { return (value < 0) ? 0 : (((size_t)value > maxValue - 1) ? (int)maxValue - 1 : value); } AddressFn ClampToEdgeLinearFn = ClampToEdgeNearestFn; // Note: normalized coords get repeated in normalized space, not unnormalized // space! hence the special case here volatile float gFloatHome; float RepeatNormalizedAddressFn(float fValue, size_t maxValue) { #ifndef _MSC_VER // Use original if not the VS compiler. // General computation for repeat return frac(fValue) * (float)maxValue; // Reduce to [0, 1.f] #else // Otherwise, use this instead: // Home the subtraction to a float to break up the sequence of x87 // instructions emitted by the VS compiler. gFloatHome = fValue - floorf(fValue); return gFloatHome * (float)maxValue; #endif } float MirroredRepeatNormalizedAddressFn(float fValue, size_t maxValue) { // Round to nearest multiple of two. // Note halfway values flip flop here due to rte, but they both end up // pointing the same place at the end of the day. float s_prime = 2.0f * rintf(fValue * 0.5f); // Reduce to [-1, 1], Apply mirroring -> [0, 1] s_prime = fabsf(fValue - s_prime); // un-normalize return s_prime * (float)maxValue; } #endif struct AddressingTable { AddressingTable() { static_assert(CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE < 6, ""); static_assert(CL_FILTER_NEAREST - CL_FILTER_LINEAR < 2, ""); mTable[CL_ADDRESS_NONE - CL_ADDRESS_NONE] [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = NoAddressFn; mTable[CL_ADDRESS_NONE - CL_ADDRESS_NONE] [CL_FILTER_LINEAR - CL_FILTER_NEAREST] = NoAddressFn; mTable[CL_ADDRESS_REPEAT - CL_ADDRESS_NONE] [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = RepeatAddressFn; mTable[CL_ADDRESS_REPEAT - CL_ADDRESS_NONE] [CL_FILTER_LINEAR - CL_FILTER_NEAREST] = RepeatAddressFn; mTable[CL_ADDRESS_CLAMP_TO_EDGE - CL_ADDRESS_NONE] [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = ClampToEdgeNearestFn; mTable[CL_ADDRESS_CLAMP_TO_EDGE - CL_ADDRESS_NONE] [CL_FILTER_LINEAR - CL_FILTER_NEAREST] = ClampToEdgeLinearFn; mTable[CL_ADDRESS_CLAMP - CL_ADDRESS_NONE] [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = ClampAddressFn; mTable[CL_ADDRESS_CLAMP - CL_ADDRESS_NONE] [CL_FILTER_LINEAR - CL_FILTER_NEAREST] = ClampAddressFn; mTable[CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE] [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = MirroredRepeatAddressFn; mTable[CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE] [CL_FILTER_LINEAR - CL_FILTER_NEAREST] = MirroredRepeatAddressFn; } AddressFn operator[](image_sampler_data *sampler) { return mTable[(int)sampler->addressing_mode - CL_ADDRESS_NONE] [(int)sampler->filter_mode - CL_FILTER_NEAREST]; } AddressFn mTable[6][2]; }; static AddressingTable sAddressingTable; bool is_sRGBA_order(cl_channel_order image_channel_order) { switch (image_channel_order) { case CL_sRGB: case CL_sRGBx: case CL_sRGBA: case CL_sBGRA: return true; default: return false; } } // Format helpers int has_alpha(const cl_image_format *format) { switch (format->image_channel_order) { case CL_R: return 0; case CL_A: return 1; case CL_Rx: return 0; case CL_RG: return 0; case CL_RA: return 1; case CL_RGx: return 0; case CL_RGB: case CL_sRGB: return 0; case CL_RGBx: case CL_sRGBx: return 0; case CL_RGBA: return 1; case CL_BGRA: return 1; case CL_ARGB: return 1; case CL_ABGR: return 1; case CL_INTENSITY: return 1; case CL_LUMINANCE: return 0; #ifdef CL_BGR1_APPLE case CL_BGR1_APPLE: return 1; #endif #ifdef CL_1RGB_APPLE case CL_1RGB_APPLE: return 1; #endif case CL_sRGBA: case CL_sBGRA: return 1; case CL_DEPTH: return 0; default: log_error("Invalid image channel order: %d\n", format->image_channel_order); return 0; } } #define PRINT_MAX_SIZE_LOGIC 0 #define SWAP(_a, _b) \ do \ { \ _a ^= _b; \ _b ^= _a; \ _a ^= _b; \ } while (0) void get_max_sizes( size_t *numberOfSizes, const int maxNumberOfSizes, size_t sizes[][3], size_t maxWidth, size_t maxHeight, size_t maxDepth, size_t maxArraySize, const cl_ulong maxIndividualAllocSize, // CL_DEVICE_MAX_MEM_ALLOC_SIZE const cl_ulong maxTotalAllocSize, // CL_DEVICE_GLOBAL_MEM_SIZE cl_mem_object_type image_type, const cl_image_format *format, int usingMaxPixelSizeBuffer) { bool is3D = (image_type == CL_MEM_OBJECT_IMAGE3D); bool isArray = (image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY); // Validate we have a reasonable max depth for 3D if (is3D && maxDepth < 2) { log_error("ERROR: Requesting max image sizes for 3D images when max " "depth is < 2.\n"); *numberOfSizes = 0; return; } // Validate we have a reasonable max array size for 1D & 2D image arrays if (isArray && maxArraySize < 2) { log_error("ERROR: Requesting max image sizes for an image array when " "max array size is < 1.\n"); *numberOfSizes = 0; return; } // Reduce the maximum because we are trying to test the max image // dimensions, not the memory allocation cl_ulong adjustedMaxTotalAllocSize = maxTotalAllocSize / 4; cl_ulong adjustedMaxIndividualAllocSize = maxIndividualAllocSize / 4; log_info("Note: max individual allocation adjusted down from %gMB to %gMB " "and max total allocation adjusted down from %gMB to %gMB.\n", maxIndividualAllocSize / (1024.0 * 1024.0), adjustedMaxIndividualAllocSize / (1024.0 * 1024.0), maxTotalAllocSize / (1024.0 * 1024.0), adjustedMaxTotalAllocSize / (1024.0 * 1024.0)); // Cap our max allocation to 1.0GB. // FIXME -- why? In the interest of not taking a long time? We should // still test this stuff... if (adjustedMaxTotalAllocSize > (cl_ulong)1024 * 1024 * 1024) { adjustedMaxTotalAllocSize = (cl_ulong)1024 * 1024 * 1024; log_info("Limiting max total allocation size to %gMB (down from %gMB) " "for test.\n", adjustedMaxTotalAllocSize / (1024.0 * 1024.0), maxTotalAllocSize / (1024.0 * 1024.0)); } cl_ulong maxAllocSize = adjustedMaxIndividualAllocSize; if (adjustedMaxTotalAllocSize < adjustedMaxIndividualAllocSize * 2) maxAllocSize = adjustedMaxTotalAllocSize / 2; size_t raw_pixel_size = get_pixel_size(format); // If the test will be creating input (src) buffer of type int4 or float4, // number of pixels will be governed by sizeof(int4 or float4) and not // sizeof(dest fomat) Also if pixel size is 12 bytes i.e. RGB or RGBx, we // adjust it to 16 bytes as GPUs has no concept of 3 channel images. GPUs // expand these to four channel RGBA. if (usingMaxPixelSizeBuffer || raw_pixel_size == 12) raw_pixel_size = 16; size_t max_pixels = (size_t)maxAllocSize / raw_pixel_size; log_info("Maximums: [%zu x %zu x %zu], raw pixel size %zu bytes, " "per-allocation limit %gMB.\n", maxWidth, maxHeight, isArray ? maxArraySize : maxDepth, raw_pixel_size, (maxAllocSize / (1024.0 * 1024.0))); // Keep track of the maximum sizes for each dimension size_t maximum_sizes[] = { maxWidth, maxHeight, maxDepth }; switch (image_type) { case CL_MEM_OBJECT_IMAGE1D_ARRAY: maximum_sizes[1] = maxArraySize; maximum_sizes[2] = 1; break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: maximum_sizes[2] = maxArraySize; break; } // Given one fixed sized dimension, this code finds one or two other // dimensions, both with very small size, such that the size does not // exceed the maximum passed to this function #if defined(__x86_64) || defined(__arm64__) || defined(__ppc64__) size_t other_sizes[] = { 2, 3, 5, 6, 7, 9, 10, 11, 13, 15 }; #else size_t other_sizes[] = { 2, 3, 5, 6, 7, 9, 11, 13 }; #endif static size_t other_size = 0; enum { num_other_sizes = sizeof(other_sizes) / sizeof(size_t) }; (*numberOfSizes) = 0; if (image_type == CL_MEM_OBJECT_IMAGE1D) { size_t M = maximum_sizes[0]; // Store the size sizes[(*numberOfSizes)][0] = M; sizes[(*numberOfSizes)][1] = 1; sizes[(*numberOfSizes)][2] = 1; ++(*numberOfSizes); } else if (image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || image_type == CL_MEM_OBJECT_IMAGE2D) { for (int fixed_dim = 0; fixed_dim < 2; ++fixed_dim) { // Determine the size of the fixed dimension size_t M = maximum_sizes[fixed_dim]; size_t A = max_pixels; int x0_dim = !fixed_dim; size_t x0 = static_cast( fmin(fmin(other_sizes[(other_size++) % num_other_sizes], A / M), maximum_sizes[x0_dim])); // Store the size sizes[(*numberOfSizes)][fixed_dim] = M; sizes[(*numberOfSizes)][x0_dim] = x0; sizes[(*numberOfSizes)][2] = 1; ++(*numberOfSizes); } } else if (image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY || image_type == CL_MEM_OBJECT_IMAGE3D) { // Iterate over dimensions, finding sizes for the non-fixed dimension for (int fixed_dim = 0; fixed_dim < 3; ++fixed_dim) { // Determine the size of the fixed dimension size_t M = maximum_sizes[fixed_dim]; size_t A = max_pixels; // Find two other dimensions, x0 and x1 int x0_dim = (fixed_dim == 0) ? 1 : 0; int x1_dim = (fixed_dim == 2) ? 1 : 2; // Choose two other sizes for these dimensions size_t x0 = static_cast( fmin(fmin(A / M, maximum_sizes[x0_dim]), other_sizes[(other_size++) % num_other_sizes])); // GPUs have certain restrictions on minimum width (row alignment) // of images which has given us issues testing small widths in this // test (say we set width to 3 for testing, and compute size based // on this width and decide it fits within vram ... but GPU driver // decides that, due to row alignment requirements, it has to use // width of 16 which doesnt fit in vram). For this purpose we are // not testing width < 16 for this test. if (x0_dim == 0 && x0 < 16) x0 = 16; size_t x1 = static_cast( fmin(fmin(A / M / x0, maximum_sizes[x1_dim]), other_sizes[(other_size++) % num_other_sizes])); // Valid image sizes cannot be below 1. Due to the workaround for // the xo_dim where x0 is overidden to 16 there might not be enough // space left for x1 dimension. This could be a fractional 0.x size // that when cast to integer would result in a value 0. In these // cases we clamp the size to a minimum of 1. if (x1 < 1) x1 = 1; // M and x0 cannot be '0' as they derive from clDeviceInfo calls assert(x0 > 0 && M > 0); // Store the size sizes[(*numberOfSizes)][fixed_dim] = M; sizes[(*numberOfSizes)][x0_dim] = x0; sizes[(*numberOfSizes)][x1_dim] = x1; ++(*numberOfSizes); } } // Log the results for (int j = 0; j < (int)(*numberOfSizes); j++) { switch (image_type) { case CL_MEM_OBJECT_IMAGE1D: log_info(" size[%d] = [%zu] (%g MB image)\n", j, sizes[j][0], raw_pixel_size * sizes[j][0] * sizes[j][1] * sizes[j][2] / (1024.0 * 1024.0)); break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: case CL_MEM_OBJECT_IMAGE2D: log_info(" size[%d] = [%zu %zu] (%g MB image)\n", j, sizes[j][0], sizes[j][1], raw_pixel_size * sizes[j][0] * sizes[j][1] * sizes[j][2] / (1024.0 * 1024.0)); break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: case CL_MEM_OBJECT_IMAGE3D: log_info(" size[%d] = [%zu %zu %zu] (%g MB image)\n", j, sizes[j][0], sizes[j][1], sizes[j][2], raw_pixel_size * sizes[j][0] * sizes[j][1] * sizes[j][2] / (1024.0 * 1024.0)); break; } } } float get_max_absolute_error(const cl_image_format *format, image_sampler_data *sampler) { if (sampler->filter_mode == CL_FILTER_NEAREST) return 0.0f; switch (format->image_channel_data_type) { case CL_SNORM_INT8: return 1.0f / 127.0f; case CL_UNORM_INT8: return 1.0f / 255.0f; case CL_UNORM_INT16: return 1.0f / 65535.0f; case CL_SNORM_INT16: return 1.0f / 32767.0f; case CL_FLOAT: return CL_FLT_MIN; #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: return 0x1.0p-14f; #endif case CL_UNORM_SHORT_555: case CL_UNORM_SHORT_565: return 1.0f / 31.0f; default: return 0.0f; } } float get_max_relative_error(const cl_image_format *format, image_sampler_data *sampler, int is3D, int isLinearFilter) { float maxError = 0.0f; float sampleCount = 1.0f; if (isLinearFilter) sampleCount = is3D ? 8.0f : 4.0f; // Note that the ULP is defined here as the unit in the last place of the // maximum magnitude sample used for filtering. // Section 8.3 switch (format->image_channel_data_type) { // The spec allows 2 ulps of error for normalized formats case CL_SNORM_INT8: case CL_UNORM_INT8: case CL_SNORM_INT16: case CL_UNORM_INT16: case CL_UNORM_SHORT_565: case CL_UNORM_SHORT_555: case CL_UNORM_INT_101010: // Maximum sampling error for round to zero normalization based on // multiplication by reciprocal (using reciprocal generated in // round to +inf mode, so that 1.0 matches spec) maxError = 2 * FLT_EPSILON * sampleCount; break; // If the implementation supports these formats then it will have to // allow rounding error here too, because not all 32-bit ints are // exactly representable in float case CL_SIGNED_INT32: case CL_UNSIGNED_INT32: maxError = 1 * FLT_EPSILON; break; } // Section 8.2 if (sampler->addressing_mode == CL_ADDRESS_REPEAT || sampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT || sampler->filter_mode != CL_FILTER_NEAREST || sampler->normalized_coords) #if defined(__APPLE__) { if (sampler->filter_mode != CL_FILTER_NEAREST) { // The maximum if (gDeviceType == CL_DEVICE_TYPE_GPU) // Some GPUs ain't so accurate maxError += MAKE_HEX_FLOAT(0x1.0p-4f, 0x1L, -4); else // The standard method of 2d linear filtering delivers 4.0 ulps // of error in round to nearest (8 in rtz). maxError += 4.0f * FLT_EPSILON; } else // normalized coordinates will introduce some error into the // fractional part of the address, affecting results maxError += 4.0f * FLT_EPSILON; } #else { #if !defined(_WIN32) #warning Implementations will likely wish to pick a max allowable sampling error policy here that is better than the spec #endif // The spec allows linear filters to return any result most of the time. // That's fine for implementations but a problem for testing. After all // users aren't going to like garbage images. We have "picked a number" // here that we are going to attempt to conform to. Implementations are // free to pick another number, like infinity, if they like. // We picked a number for you, to provide /some/ sanity maxError = MAKE_HEX_FLOAT(0x1.0p-7f, 0x1L, -7); // ...but this is what the spec allows: // maxError = INFINITY; // Please feel free to pick any positive number. (NaN wont work.) } #endif // The error calculation itself can introduce error maxError += FLT_EPSILON * 2; return maxError; } size_t get_format_max_int(const cl_image_format *format) { switch (format->image_channel_data_type) { case CL_SNORM_INT8: case CL_SIGNED_INT8: return 127; case CL_UNORM_INT8: case CL_UNSIGNED_INT8: return 255; case CL_SNORM_INT16: case CL_SIGNED_INT16: return 32767; case CL_UNORM_INT16: case CL_UNSIGNED_INT16: return 65535; case CL_SIGNED_INT32: return 2147483647L; case CL_UNSIGNED_INT32: return 4294967295LL; case CL_UNORM_SHORT_565: case CL_UNORM_SHORT_555: return 31; case CL_UNORM_INT_101010: return 1023; case CL_HALF_FLOAT: return 1 << 10; #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: return 16384; #endif default: return 0; } } int get_format_min_int(const cl_image_format *format) { switch (format->image_channel_data_type) { case CL_SNORM_INT8: case CL_SIGNED_INT8: return -128; case CL_UNORM_INT8: case CL_UNSIGNED_INT8: return 0; case CL_SNORM_INT16: case CL_SIGNED_INT16: return -32768; case CL_UNORM_INT16: case CL_UNSIGNED_INT16: return 0; case CL_SIGNED_INT32: return -2147483648LL; case CL_UNSIGNED_INT32: return 0; case CL_UNORM_SHORT_565: case CL_UNORM_SHORT_555: case CL_UNORM_INT_101010: return 0; case CL_HALF_FLOAT: return -(1 << 10); #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: return -16384; #endif default: return 0; } } cl_half convert_float_to_half(float f) { switch (gFloatToHalfRoundingMode) { case kRoundToNearestEven: return cl_half_from_float(f, CL_HALF_RTE); case kRoundTowardZero: return cl_half_from_float(f, CL_HALF_RTZ); default: log_error("ERROR: Test internal error -- unhandled or unknown " "float->half rounding mode.\n"); exit(-1); return 0xffff; } } cl_ulong get_image_size(image_descriptor const *imageInfo) { cl_ulong imageSize; // Assumes rowPitch and slicePitch are always correctly defined if (/*gTestMipmaps*/ imageInfo->num_mip_levels > 1) { imageSize = (size_t)compute_mipmapped_image_size(*imageInfo); } else { switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE1D: imageSize = imageInfo->rowPitch; break; case CL_MEM_OBJECT_IMAGE2D: imageSize = imageInfo->height * imageInfo->rowPitch; break; case CL_MEM_OBJECT_IMAGE3D: imageSize = imageInfo->depth * imageInfo->slicePitch; break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: imageSize = imageInfo->arraySize * imageInfo->slicePitch; break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: imageSize = imageInfo->arraySize * imageInfo->slicePitch; break; default: log_error("ERROR: Cannot identify image type %x\n", imageInfo->type); abort(); } } return imageSize; } // Calculate image size in megabytes (strictly, mebibytes). Result is rounded // up. cl_ulong get_image_size_mb(image_descriptor const *imageInfo) { cl_ulong imageSize = get_image_size(imageInfo); cl_ulong mb = imageSize / (1024 * 1024); if (imageSize % (1024 * 1024) > 0) { mb += 1; } return mb; } uint64_t gRoundingStartValue = 0; void escape_inf_nan_values(char *data, size_t allocSize) { // filter values with 8 not-quite-highest bits unsigned int *intPtr = (unsigned int *)data; for (size_t i = 0; i> 2; i++) { if ((intPtr[i] & 0x7F800000) == 0x7F800000) intPtr[i] ^= 0x40000000; } // Ditto with half floats (16-bit numbers with the 5 not-quite-highest bits // = 0x7C00 are special) unsigned short *shortPtr = (unsigned short *)data; for (size_t i = 0; i> 1; i++) { if ((shortPtr[i] & 0x7C00) == 0x7C00) shortPtr[i] ^= 0x4000; } } char *generate_random_image_data(image_descriptor *imageInfo, BufferOwningPtr &P, MTdata d) { size_t allocSize = static_cast(get_image_size(imageInfo)); size_t pixelRowBytes = imageInfo->width * get_pixel_size(imageInfo->format); size_t i; if (imageInfo->num_mip_levels > 1) allocSize = static_cast(compute_mipmapped_image_size(*imageInfo)); #if defined(__APPLE__) char *data = NULL; if (gDeviceType == CL_DEVICE_TYPE_CPU) { size_t mapSize = ((allocSize + 4095L) & -4096L) + 8192; void *map = mmap(0, mapSize, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, 0, 0); intptr_t data_end = (intptr_t)map + mapSize - 4096; data = (char *)(data_end - (intptr_t)allocSize); mprotect(map, 4096, PROT_NONE); mprotect((void *)((char *)map + mapSize - 4096), 4096, PROT_NONE); P.reset(data, map, mapSize, allocSize); } else { data = (char *)malloc(allocSize); P.reset(data, NULL, 0, allocSize); } #else P.reset(NULL); // Free already allocated memory first, then try to allocate // new block. char *data = (char *)align_malloc(allocSize, get_pixel_alignment(imageInfo->format)); P.reset(data, NULL, 0, allocSize, true); #endif if (data == NULL) { log_error("ERROR: Unable to malloc %zu bytes for " "generate_random_image_data\n", allocSize); return 0; } if (gTestRounding) { // Special case: fill with a ramp from 0 to the size of the type size_t typeSize = get_format_type_size(imageInfo->format); switch (typeSize) { case 1: { char *ptr = data; for (i = 0; i < allocSize; i++) ptr[i] = (cl_char)(i + gRoundingStartValue); } break; case 2: { cl_short *ptr = (cl_short *)data; for (i = 0; i < allocSize / 2; i++) ptr[i] = (cl_short)(i + gRoundingStartValue); } break; case 4: { cl_int *ptr = (cl_int *)data; for (i = 0; i < allocSize / 4; i++) ptr[i] = (cl_int)(i + gRoundingStartValue); } break; } // Note: inf or nan float values would cause problems, although we don't // know this will actually be a float, so we just know what to look for escape_inf_nan_values(data, allocSize); return data; } // Otherwise, we should be able to just fill with random bits no matter what cl_uint *p = (cl_uint *)data; for (i = 0; i + 4 <= allocSize; i += 4) p[i / 4] = genrand_int32(d); for (; i < allocSize; i++) data[i] = genrand_int32(d); // Note: inf or nan float values would cause problems, although we don't // know this will actually be a float, so we just know what to look for escape_inf_nan_values(data, allocSize); if (/*!gTestMipmaps*/ imageInfo->num_mip_levels < 2) { // Fill unused edges with -1, NaN for float if (imageInfo->rowPitch > pixelRowBytes) { size_t height = 0; switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE2D: case CL_MEM_OBJECT_IMAGE3D: case CL_MEM_OBJECT_IMAGE2D_ARRAY: height = imageInfo->height; break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: height = imageInfo->arraySize; break; } // Fill in the row padding regions for (i = 0; i < height; i++) { size_t offset = i * imageInfo->rowPitch + pixelRowBytes; size_t length = imageInfo->rowPitch - pixelRowBytes; memset(data + offset, 0xff, length); } } // Fill in the slice padding regions, if necessary: size_t slice_dimension = imageInfo->height; if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) { slice_dimension = imageInfo->arraySize; } if (imageInfo->slicePitch > slice_dimension * imageInfo->rowPitch) { size_t depth = 0; switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE2D: case CL_MEM_OBJECT_IMAGE3D: depth = imageInfo->depth; break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: case CL_MEM_OBJECT_IMAGE2D_ARRAY: depth = imageInfo->arraySize; break; } for (i = 0; i < depth; i++) { size_t offset = i * imageInfo->slicePitch + slice_dimension * imageInfo->rowPitch; size_t length = imageInfo->slicePitch - slice_dimension * imageInfo->rowPitch; memset(data + offset, 0xff, length); } } } return data; } #ifdef __SSE2__ #define CLAMP_FLOAT_V(v) \ (_mm_max_ps(_mm_min_ps(v, _mm_set1_ps(1.f)), _mm_set1_ps(-1.f))) #define CLAMP_FLOAT(v) \ (_mm_max_ss(_mm_min_ss(v, _mm_set_ss(1.f)), _mm_set_ss(-1.f))) #ifdef __SSE4_1__ #define SET_ALPHA_1(v) \ (_mm_insert_ps(v, _mm_set_ss(1.f), _MM_MK_INSERTPS_NDX(0, 3, 0))) #define SELECT_F(cond, a, b) _mm_blendv_ps(b, a, cond) #define EXTRACT_I(v, i) _mm_extract_epi32(v, i) #ifdef __x86_64 #define EXTRACT_I64(v, i) _mm_extract_epi64(v, i) #endif #else #define SET_ALPHA_1(v) (_mm_movelh_ps(v, _mm_set_ps(0.f, 0.f, 1.f, 0.f))) // n.b. "ANDNOT" is ~A & B, not A & ~B!! #define SELECT_F(cond, a, b) \ _mm_or_ps(_mm_and_ps(cond, a), _mm_andnot_ps(cond, b)) #define EXTRACT_I(v, i) \ _mm_cvtsi128_si32(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, i))) #ifdef __x86_64 #define EXTRACT_I64(v, i) \ _mm_cvtsi128_si64(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 2 * i + 1, 2 * i))) #endif #endif static __m128 read_image_pixel_float(void *imageData, image_descriptor *imageInfo, __m128i coord, int lod) { size_t width_lod = imageInfo->width, height_lod = imageInfo->height, depth_lod = imageInfo->depth; size_t slice_pitch_lod = 0, row_pitch_lod = 0; if (imageInfo->num_mip_levels > 1) { switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE3D: depth_lod = (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1; case CL_MEM_OBJECT_IMAGE2D: case CL_MEM_OBJECT_IMAGE2D_ARRAY: height_lod = (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1; default: width_lod = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1; } row_pitch_lod = width_lod * get_pixel_size(imageInfo->format); if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) slice_pitch_lod = row_pitch_lod; else if (imageInfo->type == CL_MEM_OBJECT_IMAGE3D || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY) slice_pitch_lod = row_pitch_lod * height_lod; } else { row_pitch_lod = imageInfo->rowPitch; slice_pitch_lod = imageInfo->slicePitch; } __m128i extent = _mm_set_epi32( 0, imageInfo->arraySize != 0 ? (int)imageInfo->arraySize : (int)depth_lod, (int)height_lod, (int)width_lod); __m128i zero = _mm_setzero_si128(); __m128i minMask = _mm_cmplt_epi32(coord, zero); __m128i maxMask = _mm_cmpgt_epi32(coord, _mm_add_epi32(extent, _mm_setmone_si128())); __m128i extentMask = _mm_cmpeq_epi32(extent, zero); __m128i boundsMask = _mm_or_si128(_mm_andnot_si128(extentMask, maxMask), minMask); if (TEST_NONZERO(boundsMask)) return has_alpha(imageInfo->format) ? _mm_setzero_ps() : _mm_set_ps(1.f, 0.f, 0.f, 0.f); const cl_image_format *format = imageInfo->format; __m128 tempData; // Predeclare a bunch of reciprocal constants so GCC doesn't use expensive // divisions to compute them in our code static const float recip_31 = 1.0f / 31.0f; static const float recip_63 = 1.0f / 63.0f; static const float recip_127 = 1.0f / 127.0f; static const float recip_255 = 1.0f / 255.0f; static const float recip_1023 = 1.0f / 1023.0f; static const float recip_32767 = 1.0f / 32767.0f; static const float recip_65535 = 1.0f / 65535.0f; // Advance to the right spot char *ptr = (char *)imageData; size_t pixelSize = get_pixel_size(format); __m128i pitch_lod = _mm_set_epi32(0, (int)slice_pitch_lod, (int)row_pitch_lod, (int)pixelSize); __m128i offsetA = _mm_mul_epu32(coord, pitch_lod); __m128i offsetB = _mm_mul_epu32(_mm_shuffle_epi32(coord, _MM_SHUFFLE(2, 3, 0, 1)), _mm_shuffle_epi32(pitch_lod, _MM_SHUFFLE(2, 3, 0, 1))); #ifdef __x86_64 ptr += EXTRACT_I64(offsetB, 0) + EXTRACT_I64(offsetA, 1) + EXTRACT_I64(offsetA, 0); #else // Using PHADDD doesn't gain us much... ptr += EXTRACT_I(offsetB, 0) + EXTRACT_I(offsetA, 2) + EXTRACT_I(offsetA, 0); #endif // OpenCL only supports reading floats from certain formats size_t channelCount = get_format_channel_count(format); switch (format->image_channel_data_type) { case CL_SNORM_INT8: { cl_char *dPtr = (cl_char *)ptr; __m128i pixel; switch (channelCount) { case 1: tempData = SET_ALPHA_1(CLAMP_FLOAT( _mm_mul_ss(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0]), _mm_set_ss(recip_127)))); break; case 2: pixel = _mm_insert_epi16(_mm_loadu_si16(ptr), 0x7F00, 1); break; case 3: pixel = _mm_insert_epi16(_mm_loadu_si16(ptr), 0x7F00 | dPtr[2], 1); break; case 4: pixel = _mm_loadu_si32(ptr); break; } if (channelCount != 1) { #ifdef __SSE4_1__ tempData = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(pixel)); #else __m128i signMask; signMask = _mm_cmpgt_epi8(_mm_setzero_si128(), pixel); pixel = _mm_unpacklo_epi8(pixel, signMask); signMask = _mm_unpacklo_epi8(signMask, signMask); tempData = _mm_cvtepi32_ps(_mm_unpacklo_epi16(pixel, signMask)); #endif tempData = CLAMP_FLOAT_V(_mm_mul_ps(tempData, _mm_set1_ps(recip_127))); } break; } case CL_UNORM_INT8: { unsigned char *dPtr = (unsigned char *)ptr; __m128i pixel; switch (channelCount) { case 1: if ((is_sRGBA_order( imageInfo->format->image_channel_order))) tempData = SET_ALPHA_1(_mm_set_ss(sRGBunmap(dPtr[0]))); else tempData = SET_ALPHA_1(_mm_mul_ss( _mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0]), _mm_set_ss(recip_255))); break; case 2: pixel = _mm_insert_epi16(_mm_loadu_si16(ptr), 0xFF00, 1); break; case 3: pixel = _mm_insert_epi16(_mm_loadu_si16(ptr), 0xFF00 | dPtr[2], 1); break; case 4: pixel = _mm_loadu_si32(ptr); #ifdef CL_1RGB_APPLE if (format->image_channel_order == CL_1RGB_APPLE) #ifdef __SSE4_1__ pixel = _mm_insert_epi8(pixel, 0xFF, 0); #else pixel = _mm_or_si128(pixel, _mm_bsrli_si128(_mm_setmone_si128(), 15) /* 0x000000FF */); #endif #endif #ifdef CL_BGR1_APPLE if (format->image_channel_order == CL_BGR1_APPLE) #ifdef __SSE4_1__ pixel = _mm_insert_epi8(pixel, 0xFF, 3); #else pixel = _mm_or_si128( pixel, _mm_bslli_si128( _mm_bsrli_si128(_mm_setmone_si128(), 15), 3) /* 0xFF000000 */); #endif #endif break; } if (channelCount != 1) { if (is_sRGBA_order(imageInfo->format->image_channel_order)) tempData = sRGBunmap(pixel); else { #ifdef __SSE4_1__ tempData = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(pixel)); #else __m128i zero = _mm_setzero_si128(); tempData = _mm_cvtepi32_ps(_mm_unpacklo_epi16( _mm_unpacklo_epi8(pixel, zero), zero)); #endif tempData = _mm_mul_ps(tempData, _mm_set1_ps(recip_255)); } } break; } case CL_SIGNED_INT8: { cl_char *dPtr = (cl_char *)ptr; __m128i pixel; switch (channelCount) { case 1: tempData = SET_ALPHA_1(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0])); break; case 2: pixel = _mm_insert_epi16(_mm_loadu_si16(ptr), 0x0100, 1); break; case 3: pixel = _mm_insert_epi16(_mm_loadu_si16(ptr), 0x0100 | dPtr[2], 1); break; case 4: pixel = _mm_loadu_si32(ptr); break; } if (channelCount != 1) { #ifdef __SSE4_1__ tempData = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(pixel)); #else __m128i signMask; signMask = _mm_cmpgt_epi8(_mm_setzero_si128(), pixel); pixel = _mm_unpacklo_epi8(pixel, signMask); signMask = _mm_unpacklo_epi8(signMask, signMask); tempData = _mm_cvtepi32_ps(_mm_unpacklo_epi16(pixel, signMask)); #endif } break; } case CL_UNSIGNED_INT8: { cl_uchar *dPtr = (cl_uchar *)ptr; __m128i pixel; switch (channelCount) { case 1: tempData = SET_ALPHA_1(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0])); break; case 2: pixel = _mm_insert_epi16(_mm_loadu_si16(ptr), 0x0100, 1); break; case 3: pixel = _mm_insert_epi16(_mm_loadu_si16(ptr), 0x0100 | dPtr[2], 1); break; case 4: pixel = _mm_loadu_si32(ptr); break; } if (channelCount != 1) { #ifdef __SSE4_1__ tempData = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(pixel)); #else __m128i zero = _mm_setzero_si128(); tempData = _mm_cvtepi32_ps( _mm_unpacklo_epi16(_mm_unpacklo_epi8(pixel, zero), zero)); #endif } break; } case CL_SNORM_INT16: { cl_short *dPtr = (cl_short *)ptr; __m128i pixel; switch (channelCount) { case 1: tempData = SET_ALPHA_1(CLAMP_FLOAT( _mm_mul_ss(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0]), _mm_set_ss(recip_32767)))); break; case 2: pixel = _mm_insert_epi16(_mm_loadu_si32(ptr), 0x7FFF, 3); break; case 3: pixel = _mm_insert_epi16( _mm_insert_epi16(_mm_loadu_si32(ptr), dPtr[2], 2), 0x7FFF, 3); break; case 4: pixel = _mm_loadu_si64(ptr); break; } if (channelCount != 1) { #ifdef __SSE4_1__ tempData = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(pixel)); #else tempData = _mm_cvtepi32_ps(_mm_unpacklo_epi16( pixel, _mm_cmpgt_epi16(_mm_setzero_si128(), pixel))); #endif tempData = CLAMP_FLOAT_V( _mm_mul_ps(tempData, _mm_set1_ps(recip_32767))); } break; } case CL_UNORM_INT16: { cl_ushort *dPtr = (cl_ushort *)ptr; __m128i pixel; switch (channelCount) { case 1: tempData = SET_ALPHA_1( _mm_mul_ss(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0]), _mm_set_ss(recip_65535))); break; case 2: pixel = _mm_insert_epi16(_mm_loadu_si32(ptr), 0xFFFF, 3); break; case 3: pixel = _mm_insert_epi16( _mm_insert_epi16(_mm_loadu_si32(ptr), dPtr[2], 2), 0xFFFF, 3); break; case 4: pixel = _mm_loadu_si64(ptr); break; } if (channelCount != 1) { #ifdef __SSE4_1__ tempData = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(pixel)); #else tempData = _mm_cvtepi32_ps( _mm_unpacklo_epi16(pixel, _mm_setzero_si128())); #endif tempData = _mm_mul_ps(tempData, _mm_set1_ps(recip_65535)); } break; } case CL_SIGNED_INT16: { cl_short *dPtr = (cl_short *)ptr; __m128i pixel; switch (channelCount) { case 1: tempData = SET_ALPHA_1(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0])); break; case 2: pixel = _mm_insert_epi16(_mm_loadu_si32(ptr), 1, 3); break; case 3: pixel = _mm_insert_epi16( _mm_insert_epi16(_mm_loadu_si32(ptr), dPtr[2], 2), 1, 3); break; case 4: pixel = _mm_loadu_si64(ptr); break; } if (channelCount != 1) #ifdef __SSE4_1__ tempData = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(pixel)); #else tempData = _mm_cvtepi32_ps(_mm_unpacklo_epi16( pixel, _mm_cmpgt_epi16(_mm_setzero_si128(), pixel))); #endif break; } case CL_UNSIGNED_INT16: { cl_ushort *dPtr = (cl_ushort *)ptr; __m128i pixel; switch (channelCount) { case 1: tempData = SET_ALPHA_1(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0])); break; case 2: pixel = _mm_insert_epi16(_mm_loadu_si32(ptr), 1, 3); break; case 3: pixel = _mm_insert_epi16( _mm_insert_epi16(_mm_loadu_si32(ptr), dPtr[2], 2), 1, 3); break; case 4: pixel = _mm_loadu_si64(ptr); break; } if (channelCount != 1) #ifdef __SSE4_1__ tempData = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(pixel)); #else tempData = _mm_cvtepi32_ps( _mm_unpacklo_epi16(pixel, _mm_setzero_si128())); #endif break; } case CL_HALF_FLOAT: { cl_half *dPtr = (cl_half *)ptr; __m128i h; switch (channelCount) { case 1: #ifdef __F16C__ tempData = SET_ALPHA_1(_mm_set_ss(_cvtsh_ss(dPtr[0]))); #else tempData = SET_ALPHA_1(_mm_set_ss(cl_half_to_float(dPtr[0]))); #endif break; case 2: h = _mm_insert_epi16(_mm_loadu_si32(ptr), 0x3C00, 3); break; case 3: h = _mm_insert_epi16( _mm_insert_epi16(_mm_loadu_si32(ptr), dPtr[2], 2), 0x3C00, 3); break; case 4: h = _mm_loadu_si64(ptr); break; } if (channelCount == 1) break; tempData = cl_half_to_float(h); break; } case CL_SIGNED_INT32: { cl_int *dPtr = (cl_int *)ptr; __m128i pixel; switch (channelCount) { case 1: tempData = SET_ALPHA_1(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0])); break; case 2: #ifdef __SSE4_1__ pixel = _mm_insert_epi32(_mm_loadu_si64(ptr), 1, 3); #else pixel = _mm_insert_epi16(_mm_loadu_si64(ptr), 1, 6); #endif break; case 3: #ifdef __SSE4_1__ pixel = _mm_insert_epi32( _mm_insert_epi32(_mm_loadu_si64(ptr), dPtr[2], 2), 1, 3); #else pixel = _mm_or_si128(_mm_loadu_si64(ptr), _mm_set_epi32(1, dPtr[2], 0, 0)); #endif break; case 4: pixel = _mm_loadu_si128((__m128i_u *)ptr); break; } if (channelCount != 1) tempData = _mm_cvtepi32_ps(pixel); break; } case CL_UNSIGNED_INT32: { cl_uint *dPtr = (cl_uint *)ptr; __m128i pixel; switch (channelCount) { case 1: #ifdef __x86_64 tempData = SET_ALPHA_1(_mm_cvtsi64_ss(_mm_setzero_ps(), dPtr[0])); #else tempData = SET_ALPHA_1(_mm_set_ss((float)dPtr[0])); #endif break; case 2: #ifdef __SSE4_1__ pixel = _mm_insert_epi32(_mm_loadu_si64(ptr), 1, 3); #else pixel = _mm_insert_epi16(_mm_loadu_si64(ptr), 1, 6); #endif break; case 3: #ifdef __SSE4_1__ pixel = _mm_insert_epi32( _mm_insert_epi32(_mm_loadu_si64(ptr), dPtr[2], 2), 1, 3); #else pixel = _mm_or_si128(_mm_loadu_si64(ptr), _mm_set_epi32(1, dPtr[2], 0, 0)); #endif break; case 4: pixel = _mm_loadu_si128((__m128i_u *)ptr); break; } if (channelCount != 1) { // Unfortunately, no instruction for converting unsigned 32-bit // integers to float exists until AVX-512; nor is there an // instruction for converting packed 64-bit integers to float // until same #ifdef __AVX512VL__ tempData = _mm_cvtepu32_ps(pixel); #elif defined(__SSE4_1__) // The following is based on the unoptimized output of GCC for // scalars __m128i negOne = _mm_setmone_si128(); if (!_mm_testz_si128( pixel, _mm_slli_epi32(negOne, 31) /* 0x80000000 */)) { __m128i one = _mm_srli_epi32(negOne, 31); // = 1; __m128i reducedPixel = _mm_or_si128( _mm_srli_epi32(pixel, 1), _mm_and_si128(pixel, one)); tempData = _mm_cvtepi32_ps(reducedPixel); tempData = _mm_add_ps(tempData, tempData); tempData = SELECT_F(_mm_castsi128_ps(pixel), tempData, _mm_cvtepi32_ps(pixel)); } else tempData = _mm_cvtepi32_ps(pixel); #else // Testing contents of vectors is unwieldy without SSE4.1 __m128i negOne = _mm_setmone_si128(); __m128i one = _mm_srli_epi32(negOne, 31); // = 1; __m128i reducedPixel = _mm_or_si128(_mm_srli_epi32(pixel, 1), _mm_and_si128(pixel, one)); tempData = _mm_cvtepi32_ps(reducedPixel); tempData = _mm_add_ps(tempData, tempData); __m128 tempData2 = _mm_cvtepi32_ps(pixel); __m128 mask = _mm_cmpgt_ps(_mm_setzero_ps(), tempData2); tempData = SELECT_F(mask, tempData, tempData2); #endif } break; } case CL_UNORM_SHORT_565: { cl_ushort *dPtr = (cl_ushort *)ptr; #ifdef __AVX2__ __m128i pixel = _mm_broadcastd_epi32(_mm_loadu_si16(dPtr)); pixel = _mm_insert_epi32( _mm_srlv_epi32(pixel, _mm_set_epi32(0, 0, 5, 11)), 1, 3); #else // Shifts may as well be scalar, since before AVX2 there are no // vector shift amounts __m128i pixel = _mm_set_epi32(1, dPtr[0], dPtr[0] >> 5, dPtr[0] >> 11); #endif pixel = _mm_and_si128(pixel, _mm_set_epi32(1, 0x1f, 0x3f, 0x1f)); tempData = _mm_mul_ps(_mm_cvtepi32_ps(pixel), _mm_set_ps(1.0f, recip_31, recip_63, recip_31)); break; } case CL_UNORM_SHORT_555: { cl_ushort *dPtr = (cl_ushort *)ptr; #ifdef __AVX2__ __m128i pixel = _mm_broadcastd_epi32(_mm_loadu_si16(dPtr)); pixel = _mm_insert_epi32( _mm_srlv_epi32(pixel, _mm_set_epi32(0, 0, 5, 10)), 0x1F, 3); #else __m128i pixel = _mm_set_epi32(0x1F, dPtr[0], dPtr[0] >> 5, dPtr[0] >> 10); #endif pixel = _mm_and_si128(pixel, _mm_set1_epi16(0x1F)); tempData = _mm_mul_ps(_mm_cvtepi32_ps(pixel), _mm_set1_ps(recip_31)); break; } case CL_UNORM_INT_101010: { cl_uint *dPtr = (cl_uint *)ptr; #ifdef __AVX2__ __m128i pixel = _mm_broadcastd_epi32(_mm_loadu_si32(dPtr)); pixel = _mm_insert_epi32( _mm_srlv_epi32(pixel, _mm_set_epi32(0, 0, 10, 20)), 0x3ff, 3); #else __m128i pixel = _mm_set_epi32(0x3ff, dPtr[0], dPtr[0] >> 10, dPtr[0] >> 20); #endif pixel = _mm_and_si128(pixel, _mm_set1_epi16(0x3ff)); tempData = _mm_mul_ps(_mm_cvtepi32_ps(pixel), _mm_set1_ps(recip_1023)); break; } case CL_FLOAT: { float *dPtr = (float *)ptr; switch (channelCount) { case 1: tempData = SET_ALPHA_1(_mm_load_ss(dPtr)); break; case 2: tempData = _mm_loadl_pi(_mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f), (__m64 *)ptr); break; case 3: tempData = _mm_loadl_pi( _mm_set_ps(1.0f, dPtr[2], 0.0f, 0.0f), (__m64 *)ptr); break; case 4: tempData = _mm_loadu_ps(dPtr); break; } break; } #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: { cl_ushort *dPtr = (cl_ushort *)ptr; __m128i pixel; switch (channelCount) { case 1: tempData = SET_ALPHA_1( _mm_mul_ss(_mm_cvtsi32_ss((int)dPtr[0] - 16384), _mm_set_ss(0x1.0p-14f))); break; case 2: pixel = _mm_insert_epi16(_mm_loadu_si32(ptr), (1 << 14) + 16384, 3); break; case 3: pixel = _mm_insert_epi16( _mm_insert_epi16(_mm_loadu_si32(ptr), dPtr[2], 2), (1 << 14) + 16384, 3); break; case 4: pixel = _mm_loadu_si64(ptr); break; } if (channelCount != 1) { #ifdef __SSE4_1__ pixel = _mm_cvtepu16_epi32(pixel); #else pixel = _mm_unpacklo_epi16(pixel, _mm_setzero_si128()); #endif tempData = _mm_mul_ps(_mm_cvtepi32_ps(_mm_add_epi32(pixel, _mm_slli_epi32(_mm_setmone_si128(), 14) /* -16384 */, _mm_set1_ps(0x1.0p-14f)); } break; } #endif } switch (format->image_channel_order) { case CL_R: case CL_Rx: case CL_RG: case CL_RGx: case CL_RGB: case CL_RGBx: case CL_sRGB: case CL_sRGBx: case CL_RGBA: case CL_sRGBA: case CL_DEPTH: /* Already correct */ return tempData; case CL_A: return _mm_shuffle_ps(tempData, tempData, _MM_SHUFFLE(0, 1, 1, 1)); case CL_RA: return _mm_shuffle_ps(tempData, tempData, _MM_SHUFFLE(1, 2, 2, 0)); case CL_ARGB: #ifdef CL_1RGB_APPLE case CL_1RGB_APPLE: #endif return _mm_shuffle_ps(tempData, tempData, _MM_SHUFFLE(0, 3, 2, 1)); case CL_ABGR: return _mm_shuffle_ps(tempData, tempData, _MM_SHUFFLE(0, 1, 2, 3)); case CL_BGRA: case CL_sBGRA: #ifdef CL_BGR1_APPLE case CL_BGR1_APPLE: #endif return _mm_shuffle_ps(tempData, tempData, _MM_SHUFFLE(3, 0, 1, 2)); case CL_INTENSITY: return _mm_shuffle_ps(tempData, tempData, _MM_SHUFFLE(0, 0, 0, 0)); case CL_LUMINANCE: return _mm_shuffle_ps(tempData, tempData, _MM_SHUFFLE(3, 0, 0, 0)); default: log_error("Invalid format:"); print_header(format, true); break; } return tempData; } void read_image_pixel_float(void *imageData, image_descriptor *imageInfo, int x, int y, int z, float *outData, int lod) { _mm_storeu_ps(outData, read_image_pixel_float(imageData, imageInfo, _mm_set_epi32(0, z, y, x), lod)); } #else #define CLAMP_FLOAT(v) (fmaxf(fminf(v, 1.f), -1.f)) void read_image_pixel_float(void *imageData, image_descriptor *imageInfo, int x, int y, int z, float *outData, int lod) { size_t width_lod = imageInfo->width, height_lod = imageInfo->height, depth_lod = imageInfo->depth; size_t slice_pitch_lod = 0, row_pitch_lod = 0; if (imageInfo->num_mip_levels > 1) { switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE3D: depth_lod = (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1; case CL_MEM_OBJECT_IMAGE2D: case CL_MEM_OBJECT_IMAGE2D_ARRAY: height_lod = (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1; default: width_lod = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1; } row_pitch_lod = width_lod * get_pixel_size(imageInfo->format); if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) slice_pitch_lod = row_pitch_lod; else if (imageInfo->type == CL_MEM_OBJECT_IMAGE3D || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY) slice_pitch_lod = row_pitch_lod * height_lod; } else { row_pitch_lod = imageInfo->rowPitch; slice_pitch_lod = imageInfo->slicePitch; } if (x < 0 || y < 0 || z < 0 || x >= (int)width_lod || (height_lod != 0 && y >= (int)height_lod) || (depth_lod != 0 && z >= (int)depth_lod) || (imageInfo->arraySize != 0 && z >= (int)imageInfo->arraySize)) { outData[0] = outData[1] = outData[2] = outData[3] = 0; if (!has_alpha(imageInfo->format)) outData[3] = 1; return; } const cl_image_format *format = imageInfo->format; unsigned int i; float tempData[4]; // Advance to the right spot char *ptr = (char *)imageData; size_t pixelSize = get_pixel_size(format); ptr += z * slice_pitch_lod + y * row_pitch_lod + x * pixelSize; // OpenCL only supports reading floats from certain formats size_t channelCount = get_format_channel_count(format); switch (format->image_channel_data_type) { case CL_SNORM_INT8: { cl_char *dPtr = (cl_char *)ptr; for (i = 0; i < channelCount; i++) tempData[i] = CLAMP_FLOAT((float)dPtr[i] / 127.0f); break; } case CL_UNORM_INT8: { unsigned char *dPtr = (unsigned char *)ptr; for (i = 0; i < channelCount; i++) { if ((is_sRGBA_order(imageInfo->format->image_channel_order)) && i < 3) // only RGB need to be converted for sRGBA tempData[i] = sRGBunmap(dPtr[i]); else tempData[i] = (float)dPtr[i] / 255.0f; } break; } case CL_SIGNED_INT8: { cl_char *dPtr = (cl_char *)ptr; for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i]; break; } case CL_UNSIGNED_INT8: { cl_uchar *dPtr = (cl_uchar *)ptr; for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i]; break; } case CL_SNORM_INT16: { cl_short *dPtr = (cl_short *)ptr; for (i = 0; i < channelCount; i++) tempData[i] = CLAMP_FLOAT((float)dPtr[i] / 32767.0f); break; } case CL_UNORM_INT16: { cl_ushort *dPtr = (cl_ushort *)ptr; for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i] / 65535.0f; break; } case CL_SIGNED_INT16: { cl_short *dPtr = (cl_short *)ptr; for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i]; break; } case CL_UNSIGNED_INT16: { cl_ushort *dPtr = (cl_ushort *)ptr; for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i]; break; } case CL_HALF_FLOAT: { cl_half *dPtr = (cl_half *)ptr; for (i = 0; i < channelCount; i++) tempData[i] = cl_half_to_float(dPtr[i]); break; } case CL_SIGNED_INT32: { cl_int *dPtr = (cl_int *)ptr; for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i]; break; } case CL_UNSIGNED_INT32: { cl_uint *dPtr = (cl_uint *)ptr; for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i]; break; } case CL_UNORM_SHORT_565: { cl_ushort *dPtr = (cl_ushort *)ptr; tempData[0] = (float)(dPtr[0] >> 11) / (float)31; tempData[1] = (float)((dPtr[0] >> 5) & 63) / (float)63; tempData[2] = (float)(dPtr[0] & 31) / (float)31; break; } case CL_UNORM_SHORT_555: { cl_ushort *dPtr = (cl_ushort *)ptr; tempData[0] = (float)((dPtr[0] >> 10) & 31) / (float)31; tempData[1] = (float)((dPtr[0] >> 5) & 31) / (float)31; tempData[2] = (float)(dPtr[0] & 31) / (float)31; break; } case CL_UNORM_INT_101010: { cl_uint *dPtr = (cl_uint *)ptr; tempData[0] = (float)((dPtr[0] >> 20) & 0x3ff) / (float)1023; tempData[1] = (float)((dPtr[0] >> 10) & 0x3ff) / (float)1023; tempData[2] = (float)(dPtr[0] & 0x3ff) / (float)1023; break; } case CL_FLOAT: { float *dPtr = (float *)ptr; for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i]; break; } #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: { cl_ushort *dPtr = (cl_ushort *)ptr; for (i = 0; i < channelCount; i++) tempData[i] = ((int)dPtr[i] - 16384) * 0x1.0p-14f; break; } #endif } outData[0] = outData[1] = outData[2] = 0; outData[3] = 1; switch (format->image_channel_order) { case CL_A: outData[3] = tempData[0]; break; case CL_R: case CL_Rx: outData[0] = tempData[0]; break; case CL_RA: outData[0] = tempData[0]; outData[3] = tempData[1]; break; case CL_RG: case CL_RGx: outData[0] = tempData[0]; outData[1] = tempData[1]; break; case CL_RGB: case CL_RGBx: case CL_sRGB: case CL_sRGBx: outData[0] = tempData[0]; outData[1] = tempData[1]; outData[2] = tempData[2]; break; case CL_RGBA: outData[0] = tempData[0]; outData[1] = tempData[1]; outData[2] = tempData[2]; outData[3] = tempData[3]; break; case CL_ARGB: outData[0] = tempData[1]; outData[1] = tempData[2]; outData[2] = tempData[3]; outData[3] = tempData[0]; break; case CL_ABGR: outData[0] = tempData[3]; outData[1] = tempData[2]; outData[2] = tempData[1]; outData[3] = tempData[0]; break; case CL_BGRA: case CL_sBGRA: outData[0] = tempData[2]; outData[1] = tempData[1]; outData[2] = tempData[0]; outData[3] = tempData[3]; break; case CL_INTENSITY: outData[0] = tempData[0]; outData[1] = tempData[0]; outData[2] = tempData[0]; outData[3] = tempData[0]; break; case CL_LUMINANCE: outData[0] = tempData[0]; outData[1] = tempData[0]; outData[2] = tempData[0]; break; #ifdef CL_1RGB_APPLE case CL_1RGB_APPLE: outData[0] = tempData[1]; outData[1] = tempData[2]; outData[2] = tempData[3]; outData[3] = 1.0f; break; #endif #ifdef CL_BGR1_APPLE case CL_BGR1_APPLE: outData[0] = tempData[2]; outData[1] = tempData[1]; outData[2] = tempData[0]; outData[3] = 1.0f; break; #endif case CL_sRGBA: outData[0] = tempData[0]; outData[1] = tempData[1]; outData[2] = tempData[2]; outData[3] = tempData[3]; break; case CL_DEPTH: outData[0] = tempData[0]; break; default: log_error("Invalid format:"); print_header(format, true); break; } } #endif void read_image_pixel_float(void *imageData, image_descriptor *imageInfo, int x, int y, int z, float *outData) { read_image_pixel_float(imageData, imageInfo, x, y, z, outData, 0); } bool get_integer_coords(float x, float y, float z, size_t width, size_t height, size_t depth, image_sampler_data *imageSampler, image_descriptor *imageInfo, int &outX, int &outY, int &outZ) { return get_integer_coords_offset(x, y, z, 0.0f, 0.0f, 0.0f, width, height, depth, imageSampler, imageInfo, outX, outY, outZ); } bool get_integer_coords_offset(float x, float y, float z, float xAddressOffset, float yAddressOffset, float zAddressOffset, size_t width, size_t height, size_t depth, image_sampler_data *imageSampler, image_descriptor *imageInfo, int &outX, int &outY, int &outZ) { AddressFn adFn = sAddressingTable[imageSampler]; #ifdef __SSE2__ __m128 coord = _mm_set_ps(0.f, z, y, x); __m128 addressOffset = _mm_set_ps(0.f, zAddressOffset, yAddressOffset, xAddressOffset); __m128i extent = _mm_set_epi32(0, depth, height, width); __m128 extentf = _mm_cvtepi32_ps(extent); __m128i ref = vifloorf(coord); __m128 arrayMask; switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE1D: case CL_MEM_OBJECT_IMAGE1D_BUFFER: case CL_MEM_OBJECT_IMAGE1D_ARRAY: arrayMask = _mm_castsi128_ps( _mm_bsrli_si128(_mm_setmone_si128(), 12)); // = 0, 0, 0, -1 break; case CL_MEM_OBJECT_IMAGE2D: case CL_MEM_OBJECT_IMAGE2D_ARRAY: arrayMask = _mm_castsi128_ps( _mm_bsrli_si128(_mm_setmone_si128(), 8)); // = 0, 0, -1, -1 break; default: arrayMask = _mm_castsi128_ps( _mm_bsrli_si128(_mm_setmone_si128(), 4)); // = 0, -1, -1, -1 } __m128 locMask = _mm_andnot_ps(_mm_cmpeq_ps(extentf, _mm_setzero_ps()), arrayMask); __m128 offsetMask = _mm_andnot_ps(_mm_cmpeq_ps(addressOffset, _mm_setzero_ps()), arrayMask); // Handle sampler-directed coordinate normalization + clamping. Note that // the array coordinate for image array types is expected to be // unnormalized, and is clamped to 0..arraySize-1. if (imageSampler->normalized_coords) { __m128 minMask, maxMask, temp; switch (imageSampler->addressing_mode) { case CL_ADDRESS_REPEAT: coord = SELECT_F( locMask, RepeatNormalizedAddressFn(coord, extent), coord); // Add in the offset coord = _mm_add_ps(coord, addressOffset); // Handle wrapping minMask = _mm_andnot_ps(_mm_cmplt_ps(coord, _mm_setzero_ps()), offsetMask); maxMask = _mm_andnot_ps(_mm_cmpgt_ps(coord, extentf), offsetMask); coord = SELECT_F( minMask, _mm_add_ps(coord, extentf), SELECT_F(maxMask, _mm_sub_ps(coord, extentf), coord)); break; case CL_ADDRESS_MIRRORED_REPEAT: coord = SELECT_F( locMask, MirroredRepeatNormalizedAddressFn(coord, extent), coord); temp = _mm_add_ps(coord, addressOffset); maxMask = _mm_cmpgt_ps(temp, extentf); coord = SELECT_F( offsetMask, vfabsf(SELECT_F( maxMask, _mm_sub_ps(extentf, _mm_sub_ps(temp, extentf)), temp)), coord); break; default: // Also, remultiply to the original coords. This simulates any // truncation in the pass to OpenCL #ifdef __FMA4__ coord = SELECT_F(arrayMask, _mm_macc_ps(coord, extentf, addressOffset), coord); #elif defined(__FMA__) coord = SELECT_F(arrayMask, _mm_fmadd_ps(coord, extentf, addressOffset), coord); #else coord = SELECT_F( arrayMask, _mm_add_ps(_mm_mul_ps(coord, extentf), addressOffset), coord); #endif break; } } // At this point, we're dealing with non-normalized coordinates. __m128i out = SELECT_I(_mm_castps_si128(locMask), adFn(vifloorf(coord), extent), _mm_cvtps_epi32(coord)); outX = _mm_cvtsi128_si32(out); // 1D and 2D arrays require special care for the index coordinate: switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE1D_ARRAY: outY = static_cast( calculate_array_index(y, (float)imageInfo->arraySize - 1.0f)); outZ = 0; /* don't care! */ break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: outY = EXTRACT_I(out, 1); outZ = static_cast( calculate_array_index(z, (float)imageInfo->arraySize - 1.0f)); break; default: // legacy path: outY = EXTRACT_I(out, 1); outZ = EXTRACT_I(out, 2); } out = _mm_set_epi32(0, outZ, outY, outX); __m128i refEqual = _mm_cmpeq_epi32(ref, out); return TEST_ANY_ZERO(refEqual); #else float refX = floorf(x), refY = floorf(y), refZ = floorf(z); // Handle sampler-directed coordinate normalization + clamping. Note that // the array coordinate for image array types is expected to be // unnormalized, and is clamped to 0..arraySize-1. if (imageSampler->normalized_coords) { switch (imageSampler->addressing_mode) { case CL_ADDRESS_REPEAT: x = RepeatNormalizedAddressFn(x, width); if (height != 0) { if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) y = RepeatNormalizedAddressFn(y, height); } if (depth != 0) { if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY) z = RepeatNormalizedAddressFn(z, depth); } if (xAddressOffset != 0.0) { // Add in the offset x += xAddressOffset; // Handle wrapping if (x > width) x -= (float)width; if (x < 0) x += (float)width; } if ((yAddressOffset != 0.0) && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)) { // Add in the offset y += yAddressOffset; // Handle wrapping if (y > height) y -= (float)height; if (y < 0) y += (float)height; } if ((zAddressOffset != 0.0) && (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)) { // Add in the offset z += zAddressOffset; // Handle wrapping if (z > depth) z -= (float)depth; if (z < 0) z += (float)depth; } break; case CL_ADDRESS_MIRRORED_REPEAT: x = MirroredRepeatNormalizedAddressFn(x, width); if (height != 0) { if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) y = MirroredRepeatNormalizedAddressFn(y, height); } if (depth != 0) { if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY) z = MirroredRepeatNormalizedAddressFn(z, depth); } if (xAddressOffset != 0.0) { float temp = x + xAddressOffset; if (temp > (float)width) temp = (float)width - (temp - (float)width); x = fabsf(temp); } if ((yAddressOffset != 0.0) && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)) { float temp = y + yAddressOffset; if (temp > (float)height) temp = (float)height - (temp - (float)height); y = fabsf(temp); } if ((zAddressOffset != 0.0) && (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)) { float temp = z + zAddressOffset; if (temp > (float)depth) temp = (float)depth - (temp - (float)depth); z = fabsf(temp); } break; default: // Also, remultiply to the original coords. This simulates any // truncation in the pass to OpenCL x *= (float)width; x += xAddressOffset; if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) { y *= (float)height; y += yAddressOffset; } if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY) { z *= (float)depth; z += zAddressOffset; } break; } } // At this point, we're dealing with non-normalized coordinates. outX = adFn(static_cast(floorf(x)), width); // 1D and 2D arrays require special care for the index coordinate: switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE1D_ARRAY: outY = static_cast( calculate_array_index(y, (float)imageInfo->arraySize - 1.0f)); outZ = 0; /* don't care! */ break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: outY = adFn(static_cast(floorf(y)), height); outZ = static_cast( calculate_array_index(z, (float)imageInfo->arraySize - 1.0f)); break; default: // legacy path: if (height != 0) outY = adFn(static_cast(floorf(y)), height); if (depth != 0) outZ = adFn(static_cast(floorf(z)), depth); } return !((int)refX == outX && (int)refY == outY && (int)refZ == outZ); #endif } #ifdef __SSE2__ static inline __m128 pixelMax(__m128 a, __m128 b) { // n.b. Operands must be reversed, because if either one is NaN, the second // operand is returned return _mm_max_ps(vfabsf(b), vfabsf(a)); } static inline int IsFloatSubnormal(__m128 x) { // No fpclass until AVX-512 (what took them so long?!!) union { __m128 f; __m128i u; } u; u.f = vfabsf(x); __m128i negOne = _mm_setmone_si128(); return TEST_NONZERO( _mm_cmplt_epi32(_mm_add_epi32(u.u, negOne), _mm_srli_epi32(negOne, 9) /* 0x007fffff */)); } // If containsDenorms is NULL, flush denorms to zero // if containsDenorms is not NULL, record whether there are any denorms static inline __m128 check_for_denorms(__m128 a, int *containsDenorms) { if (NULL != containsDenorms && IsFloatSubnormal(a)) *containsDenorms = 1; return a; } #else static inline void pixelMax(const float a[4], const float b[4], float *results); static inline void pixelMax(const float a[4], const float b[4], float *results) { for (int i = 0; i < 4; i++) results[i] = errMax(fabsf(a[i]), fabsf(b[i])); } // If containsDenorms is NULL, flush denorms to zero // if containsDenorms is not NULL, record whether there are any denorms static inline void check_for_denorms(float a[4], int *containsDenorms); static inline void check_for_denorms(float a[4], int *containsDenorms) { if (NULL == containsDenorms) { for (int i = 0; i < 4; i++) { if (IsFloatSubnormal(a[i])) a[i] = copysignf(0.0f, a[i]); } } else { for (int i = 0; i < 4; i++) { if (IsFloatSubnormal(a[i])) { *containsDenorms = 1; break; } } } } #endif inline float calculate_array_index(float coord, float extent) { // from Section 8.4 of the 1.2 Spec 'Selecting an Image from an Image Array' // // given coordinate 'w' that represents an index: // layer_index = clamp( rint(w), 0, image_array_size - 1) float ret = rintf(coord); ret = ret > extent ? extent : ret; ret = ret < 0.0f ? 0.0f : ret; return ret; } #ifdef __SSE2__ #define EXTRACT_F(v, i) \ _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 2, 1, i))) #endif /* * Utility function to unnormalized a coordinate given a particular sampler. * * name - the name of the coordinate, used for verbose debugging only * coord - the coordinate requiring unnormalization * offset - an addressing offset to be added to the coordinate * extent - the max value for this coordinate (e.g. width for x) */ #ifdef __SSE2__ #ifdef __AVX__ #define TEST_NONZERO_F(v) !_mm_testz_ps(v, v) #else #define TEST_NONZERO_F(v) TEST_NONZERO(_mm_castps_si128(v)) #endif static __m128 unnormalize_coordinate(const char *name, __m128 coord, __m128 offset, __m128 extent, cl_addressing_mode addressing_mode, int verbose) { __m128 zero = _mm_setzero_ps(); __m128 ret = zero; __m128 offsetMask = _mm_cmpneq_ps(offset, zero); __m128 minMask, maxMask, temp; switch (addressing_mode) { case CL_ADDRESS_REPEAT: ret = RepeatNormalizedAddressFn(coord, _mm_cvtps_epi32(extent)); if (verbose) { log_info("\tRepeat filter denormalizes %s (%f, %f, %f) to %f, " "%f, %f\n", name, EXTRACT_F(coord, 0), EXTRACT_F(coord, 1), EXTRACT_F(coord, 2), EXTRACT_F(ret, 0), EXTRACT_F(ret, 1), EXTRACT_F(ret, 2)); } // Add in the offset, and handle wrapping. ret = _mm_add_ps(ret, offset); maxMask = _mm_and_ps(offsetMask, _mm_cmpgt_ps(ret, extent)); minMask = _mm_and_ps(offsetMask, _mm_cmplt_ps(ret, zero)); ret = SELECT_F(minMask, _mm_add_ps(ret, extent), SELECT_F(maxMask, _mm_sub_ps(ret, extent), ret)); if (verbose && TEST_NONZERO_F(offsetMask)) { log_info( "\tAddress offset of %f, %f, %f added to get %f, %f, %f\n", EXTRACT_F(offset, 0), EXTRACT_F(offset, 1), EXTRACT_F(offset, 2), EXTRACT_F(ret, 0), EXTRACT_F(ret, 1), EXTRACT_F(ret, 2)); } break; case CL_ADDRESS_MIRRORED_REPEAT: ret = MirroredRepeatNormalizedAddressFn(coord, _mm_cvtps_epi32(extent)); if (verbose) { log_info("\tMirrored repeat filter denormalizes %s (%f, %f, " "%f) to %f, %f, %f\n", name, EXTRACT_F(coord, 0), EXTRACT_F(coord, 1), EXTRACT_F(coord, 2), EXTRACT_F(ret, 0), EXTRACT_F(ret, 1), EXTRACT_F(ret, 2)); } temp = _mm_add_ps(ret, offset); maxMask = _mm_cmpgt_ps(temp, extent); ret = SELECT_F( offsetMask, vfabsf(SELECT_F(maxMask, _mm_sub_ps(extent, _mm_sub_ps(temp, extent)), temp)), ret); if (verbose && TEST_NONZERO_F(offsetMask)) { log_info( "\tAddress offset of %f, %f, %f added to get %f, %f, %f\n", EXTRACT_F(offset, 0), EXTRACT_F(offset, 1), EXTRACT_F(offset, 2), EXTRACT_F(ret, 0), EXTRACT_F(ret, 1), EXTRACT_F(ret, 2)); } break; default: if (verbose) { ret = _mm_mul_ps(coord, extent); log_info("\tFilter denormalizes %s to %f, %f, %f (<%f, %f, %f> " "* <%f, %f, %f>)\n", name, EXTRACT_F(ret, 0), EXTRACT_F(ret, 1), EXTRACT_F(ret, 2), EXTRACT_F(coord, 0), EXTRACT_F(coord, 1), EXTRACT_F(coord, 2), EXTRACT_F(extent, 0), EXTRACT_F(extent, 1), EXTRACT_F(extent, 2)); if (TEST_NONZERO_F(offsetMask)) { ret = _mm_add_ps(ret, offset); log_info("\tAddress offset of %f, %f, %f added to get %f, " "%f, %f\n", EXTRACT_F(offset, 0), EXTRACT_F(offset, 1), EXTRACT_F(offset, 2), EXTRACT_F(ret, 0), EXTRACT_F(ret, 1), EXTRACT_F(ret, 2)); } } else { #ifdef __FMA4__ ret = _mm_macc_ps(coord, extent, offset); #elif defined(__FMA__) ret = _mm_fmadd_ps(coord, extent, offset); #else ret = _mm_add_ps(_mm_mul_ps(coord, extent), offset); #endif } } return ret; } #else static float unnormalize_coordinate(const char *name, float coord, float offset, float extent, cl_addressing_mode addressing_mode, int verbose) { float ret = 0.0f; switch (addressing_mode) { case CL_ADDRESS_REPEAT: ret = RepeatNormalizedAddressFn(coord, static_cast(extent)); if (verbose) { log_info("\tRepeat filter denormalizes %s (%f) to %f\n", name, coord, ret); } if (offset != 0.0) { // Add in the offset, and handle wrapping. ret += offset; if (ret > extent) ret -= extent; if (ret < 0.0) ret += extent; } if (verbose && offset != 0.0f) { log_info("\tAddress offset of %f added to get %f\n", offset, ret); } break; case CL_ADDRESS_MIRRORED_REPEAT: ret = MirroredRepeatNormalizedAddressFn( coord, static_cast(extent)); if (verbose) { log_info( "\tMirrored repeat filter denormalizes %s (%f) to %f\n", name, coord, ret); } if (offset != 0.0) { float temp = ret + offset; if (temp > extent) temp = extent - (temp - extent); ret = fabsf(temp); } if (verbose && offset != 0.0f) { log_info("\tAddress offset of %f added to get %f\n", offset, ret); } break; default: ret = coord * extent; if (verbose) { log_info("\tFilter denormalizes %s to %f (%f * %f)\n", name, ret, coord, extent); } ret += offset; if (verbose && offset != 0.0f) { log_info("\tAddress offset of %f added to get %f\n", offset, ret); } } return ret; } #endif FloatPixel sample_image_pixel_float(void *imageData, image_descriptor *imageInfo, float x, float y, float z, image_sampler_data *imageSampler, float *outData, int verbose, int *containsDenorms) { return sample_image_pixel_float_offset(imageData, imageInfo, x, y, z, 0.0f, 0.0f, 0.0f, imageSampler, outData, verbose, containsDenorms); } // returns max pixel value of the pixels touched FloatPixel sample_image_pixel_float(void *imageData, image_descriptor *imageInfo, float x, float y, float z, image_sampler_data *imageSampler, float *outData, int verbose, int *containsDenorms, int lod) { return sample_image_pixel_float_offset(imageData, imageInfo, x, y, z, 0.0f, 0.0f, 0.0f, imageSampler, outData, verbose, containsDenorms, lod); } #ifdef __SSE2__ FloatPixel sample_image_pixel_float_offset( void *imageData, image_descriptor *imageInfo, float x, float y, float z, float xAddressOffset, float yAddressOffset, float zAddressOffset, image_sampler_data *imageSampler, float *outData, int verbose, int *containsDenorms, int lod) { AddressFn adFn = sAddressingTable[imageSampler]; FloatPixel returnVal; size_t width_lod = imageInfo->width, height_lod = imageInfo->height, depth_lod = imageInfo->depth; size_t slice_pitch_lod = 0, row_pitch_lod = 0; if (imageInfo->num_mip_levels > 1) { switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE3D: depth_lod = (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1; case CL_MEM_OBJECT_IMAGE2D: case CL_MEM_OBJECT_IMAGE2D_ARRAY: height_lod = (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1; default: width_lod = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1; } row_pitch_lod = width_lod * get_pixel_size(imageInfo->format); if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) slice_pitch_lod = row_pitch_lod; else if (imageInfo->type == CL_MEM_OBJECT_IMAGE3D || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY) slice_pitch_lod = row_pitch_lod * height_lod; } else { slice_pitch_lod = imageInfo->slicePitch; row_pitch_lod = imageInfo->rowPitch; } if (containsDenorms) *containsDenorms = 0; __m128 coord, addressOffset, extentf; __m128i extent, addressMask; switch (imageInfo->type) { // The image array types require special care: case CL_MEM_OBJECT_IMAGE1D: case CL_MEM_OBJECT_IMAGE1D_BUFFER: case CL_MEM_OBJECT_IMAGE1D_ARRAY: coord = _mm_set_ss(x); addressOffset = _mm_set_ss(xAddressOffset); extent = _mm_set_epi32(0, 1, 1, width_lod); addressMask = _mm_bsrli_si128(_mm_setmone_si128(), 12); // = 0, 0, 0, -1 break; case CL_MEM_OBJECT_IMAGE2D: case CL_MEM_OBJECT_IMAGE2D_ARRAY: coord = _mm_set_ps(0.f, 0.f, y, x); addressOffset = _mm_set_ps(0.f, 0.f, yAddressOffset, xAddressOffset); extent = _mm_set_epi32(0, 1, height_lod, width_lod); addressMask = _mm_bsrli_si128(_mm_setmone_si128(), 8); // = 0, 0, -1, -1 break; // Everybody else: default: coord = _mm_set_ps(0.f, z, y, x); addressOffset = _mm_set_ps(0.f, zAddressOffset, yAddressOffset, xAddressOffset); extent = _mm_set_epi32(0, depth_lod, height_lod, width_lod); addressMask = _mm_bsrli_si128(_mm_setmone_si128(), 4); // = 0, -1, -1, -1 } extentf = _mm_cvtepi32_ps(extent); if (imageSampler->normalized_coords) { // We need to unnormalize our coordinates differently depending on // the image type, but 'x' is always processed the same way. const char *name = NULL; switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE1D: case CL_MEM_OBJECT_IMAGE1D_BUFFER: case CL_MEM_OBJECT_IMAGE1D_ARRAY: name = "x"; break; case CL_MEM_OBJECT_IMAGE2D: case CL_MEM_OBJECT_IMAGE2D_ARRAY: name = "x, y"; break; default: name = "x, y, z"; } coord = unnormalize_coordinate(name, coord, addressOffset, extentf, imageSampler->addressing_mode, verbose); } else if (verbose) { switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE1D_ARRAY: log_info("Starting coordinate: %f, array index %f\n", x, y); break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: log_info("Starting coordinate: %f, %f, array index %f\n", x, y, z); break; case CL_MEM_OBJECT_IMAGE1D: case CL_MEM_OBJECT_IMAGE1D_BUFFER: log_info("Starting coordinate: %f\b", x); break; case CL_MEM_OBJECT_IMAGE2D: log_info("Starting coordinate: %f, %f\n", x, y); break; case CL_MEM_OBJECT_IMAGE3D: default: log_info("Starting coordinate: %f, %f, %f\n", x, y, z); } } // At this point, we have unnormalized coordinates. if (imageSampler->filter_mode == CL_FILTER_NEAREST) { __m128i icoord; int arrayIndex; // We apply the addressing function to the now-unnormalized // coordinates. Note that the array cases again require special // care, per section 8.4 in the OpenCL 1.2 Specification. icoord = _mm_and_si128(addressMask, adFn(vifloorf(coord), extent)); switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE1D_ARRAY: arrayIndex = static_cast(calculate_array_index( y, (float)(imageInfo->arraySize - 1))); if (verbose) log_info("\tArray index %f evaluates to %d\n", y, arrayIndex); #ifdef __SSE4_1__ icoord = _mm_insert_epi32(icoord, arrayIndex, 1); #else icoord = _mm_insert_epi16( _mm_insert_epi16(icoord, (short)arrayIndex, 2), (short)(arrayIndex >> 16), 3); #endif break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: arrayIndex = static_cast(calculate_array_index( z, (float)(imageInfo->arraySize - 1))); if (verbose) log_info("\tArray index %f evaluates to %d\n", z, arrayIndex); #ifdef __SSE4_1__ icoord = _mm_insert_epi32(icoord, arrayIndex, 2); #else icoord = _mm_insert_epi16( _mm_insert_epi16(icoord, (short)arrayIndex, 4), (short)(arrayIndex >> 16), 5); #endif break; default: break; } if (verbose) { if (depth_lod) log_info( "\tReference integer coords calculated: { %d, %d, %d }\n", EXTRACT_I(icoord, 0), EXTRACT_I(icoord, 1), EXTRACT_I(icoord, 2)); else log_info("\tReference integer coords calculated: { %d, %d }\n", EXTRACT_I(icoord, 0), EXTRACT_I(icoord, 1)); } // SSE has an FTZ mode that will be useful here unsigned int mxcsr = 0; if (NULL == containsDenorms) { mxcsr = _mm_getcsr(); _mm_setcsr(mxcsr & ~_MM_FLUSH_ZERO_MASK | _MM_FLUSH_ZERO_ON); } __m128 outPixel = check_for_denorms( read_image_pixel_float(imageData, imageInfo, icoord, lod), containsDenorms); if (NULL == containsDenorms) _mm_setcsr(mxcsr); _mm_storeu_ps(outData, outPixel); _mm_storeu_ps(returnVal.p, vfabsf(outPixel)); return returnVal; } else { // Linear filtering cases. // Image arrays can use 2D filtering, but require us to walk into the // image a certain number of slices before reading. if (depth_lod == 0 || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY || imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) { float array_index = 0; size_t layer_offset = 0; if (imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY) { array_index = calculate_array_index(z, (float)(imageInfo->arraySize - 1)); layer_offset = slice_pitch_lod * (size_t)array_index; } else if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) { array_index = calculate_array_index(y, (float)(imageInfo->arraySize - 1)); layer_offset = slice_pitch_lod * (size_t)array_index; } __m128i icoord = vifloorf(_mm_sub_ps(coord, _mm_set_ps(0.f, 0.f, 0.5f, 0.5f))); __m128i coord00 = _mm_and_si128(addressMask, adFn(icoord, extent)); __m128i coord11 = _mm_and_si128( addressMask, adFn(_mm_sub_epi32(icoord, _mm_setmone_si128()), extent)); if (verbose) { log_info("\tActual integer coords used (i = floor(x-.5)): i0:{ " "%d, %d } and i1:{ %d, %d }\n", EXTRACT_I(coord00, 0), EXTRACT_I(coord00, 1), EXTRACT_I(coord11, 0), EXTRACT_I(coord11, 1)); log_info("\tArray coordinate is %f\n", array_index); } // Walk to beginning of the 'correct' slice, if needed. char *imgPtr = ((char *)imageData) + layer_offset; // flush subnormal results to zero if necessary // SSE has an FTZ mode that will be useful here unsigned int mxcsr; if (NULL == containsDenorms) { mxcsr = _mm_getcsr(); _mm_setcsr(mxcsr & ~_MM_FLUSH_ZERO_MASK | _MM_FLUSH_ZERO_ON); } // Make coordinate vectors for pixels 01 and 10 #ifdef __AVX2__ __m128i coord01 = _mm_blend_epi32(coord00, coord11, 0x01); __m128i coord10 = _mm_blend_epi32(coord00, coord11, 0x02); #elif defined(__SSE4_1__) __m128i coord01 = _mm_blend_epi16(coord00, coord11, 0x03); __m128i coord10 = _mm_blend_epi16(coord00, coord11, 0x0C); #else __m128i coordMask = _mm_bsrli_si128(_mm_setmone_si128(), 8); // = 0, 0, -1, -1 __m128i interleavedCoord = _mm_unpacklo_epi32(coord00, coord11); // y2 y1 x2 x1 __m128i coord01 = _mm_and_si128( coordMask, _mm_shuffle_epi32(interleavedCoord, _MM_SHUFFLE(3, 0, 2, 1))); __m128i coord10 = _mm_and_si128( coordMask, _mm_shuffle_epi32(interleavedCoord, _MM_SHUFFLE(2, 1, 3, 0))); #endif __m128 upLeft, upRight, lowLeft, lowRight; __m128 maxUp, maxLow; upLeft = check_for_denorms( read_image_pixel_float(imgPtr, imageInfo, coord00, lod), containsDenorms); upRight = check_for_denorms( read_image_pixel_float(imgPtr, imageInfo, coord01, lod), containsDenorms); maxUp = pixelMax(upLeft, upRight); lowLeft = check_for_denorms( read_image_pixel_float(imgPtr, imageInfo, coord10, lod), containsDenorms); lowRight = check_for_denorms( read_image_pixel_float(imgPtr, imageInfo, coord11, lod), containsDenorms); maxLow = pixelMax(lowLeft, lowRight); _mm_storeu_ps(returnVal.p, pixelMax(maxUp, maxLow)); if (verbose) { if (NULL == containsDenorms) log_info("\tSampled pixels (rgba order, denorms flushed to " "zero):\n"); else log_info("\tSampled pixels (rgba order):\n"); log_info("\t\tp00: %f, %f, %f, %f\n", EXTRACT_F(upLeft, 0), EXTRACT_F(upLeft, 1), EXTRACT_F(upLeft, 2), EXTRACT_F(upLeft, 3)); log_info("\t\tp01: %f, %f, %f, %f\n", EXTRACT_F(upRight, 0), EXTRACT_F(upRight, 1), EXTRACT_F(upRight, 2), EXTRACT_F(upRight, 3)); log_info("\t\tp10: %f, %f, %f, %f\n", EXTRACT_F(lowLeft, 0), EXTRACT_F(lowLeft, 1), EXTRACT_F(lowLeft, 2), EXTRACT_F(lowLeft, 3)); log_info("\t\tp11: %f, %f, %f, %f\n", EXTRACT_F(lowRight, 0), EXTRACT_F(lowRight, 1), EXTRACT_F(lowRight, 2), EXTRACT_F(lowRight, 3)); } __m128 fracCoord = frac(_mm_sub_ps(coord, _mm_set1_ps(0.5f))); if (verbose) log_info("\tfrac( x - 0.5f ) = %f, frac( y - 0.5f ) = %f\n", EXTRACT_F(fracCoord, 0), EXTRACT_F(fracCoord, 1)); #ifdef __AVX__ __m256d alphaBeta = _mm256_cvtps_pd(fracCoord); // x x b a alphaBeta = _mm256_insertf128_pd( alphaBeta, _mm_sub_pd(_mm_set1_pd(1.0), _mm256_castpd256_pd128(alphaBeta)), 1); // 1-b 1-a b a // 1-b 1-a b a // 1-a b a 1-b (2 1 0 3) //(1-a)(1-b) (1-a)b a*b a(1-b) // 00 10 11 01 #ifdef __AVX2__ __m256d weights = _mm256_mul_pd( alphaBeta, _mm256_permute4x64_pd(alphaBeta, _MM_SHUFFLE(2, 1, 0, 3))); __m256d weight01 = _mm256_broadcastsd_pd(_mm256_castpd256_pd128(weights)); __m256d weight11 = _mm256_permute4x64_pd(weights, _MM_SHUFFLE(1, 1, 1, 1)); __m256d weight10 = _mm256_permute4x64_pd(weights, _MM_SHUFFLE(2, 2, 2, 2)); __m256d weight00 = _mm256_permute4x64_pd(weights, _MM_SHUFFLE(3, 3, 3, 3)); #else // This is now more complicated... // Swap the two halves of the vector (1 0 3 2): // 0x01 = 0b00000001 // ~~ ~~ // +-----+ | // +---*----------+ // | + // v v // +-+ +-+ // 3 2 1 0 // ...then shuffle the elements as follows: // 0x05 = 0b00000101 // +----------+||| // | +---------+|| // | | +----+| // | | | +---+ // v v v v // 3 2 1 0 1 0 3 2 __m256d weights = _mm256_mul_pd( alphaBeta, _mm256_shuffle_pd( alphaBeta, _mm256_permute2f128_pd(alphaBeta, alphaBeta, 0x01), 0x05)); // Duplicate the even and odd elements... __m256d weight1001 = _mm256_movedup_pd(weights); __m256d weight0011 = _mm256_permute_pd(weights, 0x0F); // ...then duplicate the low and high halves of the results __m256d weight01 = _mm256_permute2f128_pd(_mm256_undefined_pd(), weight1001, 0x22); __m256d weight11 = _mm256_permute2f128_pd(_mm256_undefined_pd(), weight0011, 0x22); __m256d weight10 = _mm256_permute2f128_pd(_mm256_undefined_pd(), weight1001, 0x33); __m256d weight00 = _mm256_permute2f128_pd(_mm256_undefined_pd(), weight0011, 0x33); #endif #ifdef __FMA4__ // Doing it this way instead of using a chain of FMAs avoids stalls _mm_storeu_ps( outData, _mm256_cvtpd_ps(_mm256_add_pd( _mm256_macc_pd( _mm256_cvtps_pd(upLeft), weight00, _mm256_mul_pd(_mm256_cvtps_pd(upRight), weight01)), _mm256_macc_pd( _mm256_cvtps_pd(lowLeft), weight10, _mm256_mul_pd(_mm256_cvtps_pd(lowRight), weight11))))); #elif defined(__FMA__) _mm_storeu_ps( outData, _mm256_cvtpd_ps(_mm256_add_pd( _mm256_fmadd_pd( _mm256_cvtps_pd(upLeft), weight00, _mm256_mul_pd(_mm256_cvtps_pd(upRight), weight01)), _mm256_fmadd_pd( _mm256_cvtps_pd(lowLeft), weight10, _mm256_mul_pd(_mm256_cvtps_pd(lowRight), weight11))))); #else // No VDPPD for 256-bit vectors... :/ _mm_storeu_ps( outData, _mm256_cvtpd_ps(_mm256_add_pd( _mm256_add_pd( _mm256_mul_pd(_mm256_cvtps_pd(upLeft), weight00), _mm256_mul_pd(_mm256_cvtps_pd(upRight), weight01)), _mm256_add_pd( _mm256_mul_pd(_mm256_cvtps_pd(lowLeft), weight10), _mm256_mul_pd(_mm256_cvtps_pd(lowRight), weight11))))); #endif #else __m128d alphaBeta = _mm_cvtps_pd(fracCoord); // b a __m128d invAlphaBeta = _mm_sub_pd(_mm_set1_pd(1.0), alphaBeta); // 1-b 1-a __m128d weights[2]; // <1-a b> * <1-b 1-a> = <(1-a)(1-b) (1-a)b> <00 10> weights[0] = _mm_mul_pd( _mm_shuffle_pd(invAlphaBeta, alphaBeta, _MM_SHUFFLE2(1, 0)), invAlphaBeta); // * = <11 01> weights[1] = _mm_mul_pd( alphaBeta, _mm_shuffle_pd(invAlphaBeta, alphaBeta, _MM_SHUFFLE2(0, 1))); __m128d upLeftL = _mm_cvtps_pd(upLeft); __m128d upLeftH = _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), upLeft)); __m128d upRightL = _mm_cvtps_pd(upRight); __m128d upRightH = _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), upRight)); __m128d lowLeftL = _mm_cvtps_pd(lowLeft); __m128d lowLeftH = _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), lowLeft)); __m128d lowRightL = _mm_cvtps_pd(lowRight); __m128d lowRightH = _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), lowRight)); #ifdef __SSE4_1__ // In the immediate bytes, the high nibble determines which // multiplies take place--in this case, both of them--and the low // nibble determines which lines receive the sum--bit 0 for the low // lane, bit 1 for the high. __m128d rg = _mm_or_pd( _mm_dp_pd(weights[0], _mm_unpacklo_pd(lowLeftL, upLeftL), 0x31), _mm_dp_pd(weights[0], _mm_unpackhi_pd(lowLeftL, upLeftL), 0x32)); __m128d ba = _mm_or_pd( _mm_dp_pd(weights[0], _mm_unpacklo_pd(lowLeftH, upLeftH), 0x31), _mm_dp_pd(weights[0], _mm_unpackhi_pd(lowLeftH, upLeftH), 0x32)); rg = _mm_add_pd( rg, _mm_or_pd(_mm_dp_pd(weights[1], _mm_unpacklo_pd(upRightL, lowRightL), 0x31), _mm_dp_pd(weights[1], _mm_unpackhi_pd(upRightL, lowRightL), 0x32))); ba = _mm_add_pd( ba, _mm_or_pd(_mm_dp_pd(weights[1], _mm_unpacklo_pd(upRightH, lowRightH), 0x31), _mm_dp_pd(weights[1], _mm_unpackhi_pd(upRightH, lowRightH), 0x32))); #else #ifdef __SSE3__ __m128d weight10 = _mm_movedup_pd(weights[0]); __m128d weight01 = _mm_movedup_pd(weights[1]); #else __m128d weight10 = _mm_unpacklo_pd(weights[0], weights[0]); __m128d weight01 = _mm_unpacklo_pd(weights[1], weights[1]); #endif __m128d rg = _mm_add_pd(_mm_mul_pd(weight01, upRightL), _mm_mul_pd(weight10, lowLeftL)); __m128d ba = _mm_add_pd(_mm_mul_pd(weight01, upRightH), _mm_mul_pd(weight10, lowLeftH)); __m128d weight00 = _mm_unpackhi_pd(weights[0], weights[0]); __m128d weight11 = _mm_unpackhi_pd(weights[1], weights[1]); rg = _mm_add_pd(rg, _mm_add_pd(_mm_mul_pd(weight00, upLeftL), _mm_mul_pd(weight11, lowRightL))); ba = _mm_add_pd(ba, _mm_add_pd(_mm_mul_pd(weight00, upLeftH), _mm_mul_pd(weight11, lowRightH))); #endif _mm_storeu_ps(outData, _mm_movelh_ps(_mm_cvtpd_ps(rg), _mm_cvtpd_ps(ba))); #endif if (NULL == containsDenorms) _mm_setcsr(mxcsr); } else { // 3D linear filtering __m128i icoord = vifloorf(_mm_sub_ps(coord, _mm_set_ps(0.f, 0.5f, 0.5f, 0.5f))); __m128i coord000 = _mm_and_si128(addressMask, adFn(icoord, extent)); __m128i coord111 = _mm_and_si128( addressMask, adFn(_mm_sub_epi32(icoord, _mm_setmone_si128()), extent)); if (verbose) log_info("\tActual integer coords used (i = floor(x-.5)): " "i0:{%d, %d, %d} and i1:{%d, %d, %d}\n", EXTRACT_I(coord000, 0), EXTRACT_I(coord000, 1), EXTRACT_I(coord000, 2), EXTRACT_I(coord111, 0), EXTRACT_I(coord111, 1), EXTRACT_I(coord111, 2)); // flush subnormal results to zero if necessary // SSE has an FTZ mode that will be useful here unsigned int mxcsr; if (NULL == containsDenorms) { mxcsr = _mm_getcsr(); _mm_setcsr(mxcsr & ~_MM_FLUSH_ZERO_MASK | _MM_FLUSH_ZERO_ON); } #ifdef __AVX2__ __m128i coord001 = _mm_blend_epi32(coord000, coord111, 0x01); __m128i coord010 = _mm_blend_epi32(coord000, coord111, 0x02); __m128i coord011 = _mm_blend_epi32(coord000, coord111, 0x03); __m128i coord100 = _mm_blend_epi32(coord000, coord111, 0x04); __m128i coord101 = _mm_blend_epi32(coord000, coord111, 0x05); __m128i coord110 = _mm_blend_epi32(coord000, coord111, 0x06); #elif defined(__SSE4_1__) __m128i coord001 = _mm_blend_epi16(coord000, coord111, 0x03); __m128i coord010 = _mm_blend_epi16(coord000, coord111, 0x0C); __m128i coord011 = _mm_blend_epi16(coord000, coord111, 0x0F); __m128i coord100 = _mm_blend_epi16(coord000, coord111, 0x30); __m128i coord101 = _mm_blend_epi16(coord000, coord111, 0x33); __m128i coord110 = _mm_blend_epi16(coord000, coord111, 0x3C); #else // XXX This is horrible without PBLEND... __m128i negOne = _mm_setmone_si128(); __m128i coordMask = _mm_bsrli_si128(negOne, 8); // = 0, 0, -1, -1 __m128i coord011 = SELECT_I(coordMask, coord000, coord111); coordMask = _mm_bsrli_si128(coordMask, 4); // = 0, 0, 0, -1 __m128i coord001 = SELECT_I(coordMask, coord000, coord111); coordMask = _mm_slli_epi64(coordMask, 32); // = 0, 0, -1, 0 __m128i coord010 = SELECT_I(coordMask, coord000, coord111); coordMask = _mm_srli_epi64(negOne, 32); // = 0, -1, 0, -1 __m128i coord101 = SELECT_I(coordMask, coord000, coord111); coordMask = _mm_bslli_si128(coordMask, 8); // = 0, -1, 0, 0 __m128i coord100 = SELECT_I(coordMask, coord000, coord111); coordMask = _mm_bslli_si128(_mm_bsrli_si128(negOne, 4), 4); // = 0, -1, -1, 0 __m128i coord110 = SELECT_I(coordMask, coord000, coord111); #endif __m128 upLeftA, upRightA, lowLeftA, lowRightA; __m128 upLeftB, upRightB, lowLeftB, lowRightB; __m128 pixelMaxA, pixelMaxB, pixelMaxC; upLeftA = check_for_denorms( read_image_pixel_float(imageData, imageInfo, coord000, lod), containsDenorms); upRightA = check_for_denorms( read_image_pixel_float(imageData, imageInfo, coord001, lod), containsDenorms); pixelMaxA = pixelMax(upLeftA, upRightA); lowLeftA = check_for_denorms( read_image_pixel_float(imageData, imageInfo, coord010, lod), containsDenorms); lowRightA = check_for_denorms( read_image_pixel_float(imageData, imageInfo, coord011, lod), containsDenorms); pixelMaxB = pixelMax(lowLeftA, lowRightA); pixelMaxC = pixelMax(pixelMaxA, pixelMaxB); upLeftB = check_for_denorms( read_image_pixel_float(imageData, imageInfo, coord100, lod), containsDenorms); upRightB = check_for_denorms( read_image_pixel_float(imageData, imageInfo, coord101, lod), containsDenorms); pixelMaxA = pixelMax(upLeftB, upRightB); lowLeftB = check_for_denorms( read_image_pixel_float(imageData, imageInfo, coord110, lod), containsDenorms); lowRightB = check_for_denorms( read_image_pixel_float(imageData, imageInfo, coord111, lod), containsDenorms); pixelMaxB = pixelMax(lowLeftB, lowRightB); pixelMaxA = pixelMax(pixelMaxA, pixelMaxB); _mm_storeu_ps(returnVal.p, pixelMax(pixelMaxA, pixelMaxC)); if (verbose) { if (NULL == containsDenorms) log_info("\tSampled pixels (rgba order, denorms flushed to " "zero):\n"); else log_info("\tSampled pixels (rgba order):\n"); log_info("\t\tp000: %f, %f, %f, %f\n", EXTRACT_F(upLeftA, 0), EXTRACT_F(upLeftA, 1), EXTRACT_F(upLeftA, 2), EXTRACT_F(upLeftA, 3)); log_info("\t\tp001: %f, %f, %f, %f\n", EXTRACT_F(upRightA, 0), EXTRACT_F(upRightA, 1), EXTRACT_F(upRightA, 2), EXTRACT_F(upRightA, 3)); log_info("\t\tp010: %f, %f, %f, %f\n", EXTRACT_F(lowLeftA, 0), EXTRACT_F(lowLeftA, 1), EXTRACT_F(lowLeftA, 2), EXTRACT_F(lowLeftA, 3)); log_info("\t\tp011: %f, %f, %f, %f\n\n", EXTRACT_F(lowRightA, 0), EXTRACT_F(lowRightA, 1), EXTRACT_F(lowRightA, 2), EXTRACT_F(lowRightA, 3)); log_info("\t\tp100: %f, %f, %f, %f\n", EXTRACT_F(upLeftB, 0), EXTRACT_F(upLeftB, 1), EXTRACT_F(upLeftB, 2), EXTRACT_F(upLeftB, 3)); log_info("\t\tp101: %f, %f, %f, %f\n", EXTRACT_F(upRightB, 0), EXTRACT_F(upRightB, 1), EXTRACT_F(upRightB, 2), EXTRACT_F(upRightB, 3)); log_info("\t\tp110: %f, %f, %f, %f\n", EXTRACT_F(lowLeftB, 0), EXTRACT_F(lowLeftB, 1), EXTRACT_F(lowLeftB, 2), EXTRACT_F(lowLeftB, 3)); log_info("\t\tp111: %f, %f, %f, %f\n", EXTRACT_F(lowRightB, 0), EXTRACT_F(lowRightB, 1), EXTRACT_F(lowRightB, 2), EXTRACT_F(lowRightB, 3)); } __m128 fracCoord = frac(_mm_sub_ps(coord, _mm_set_ps(0.f, 0.5f, 0.5f, 0.5f))); if (verbose) log_info("\tfrac( x - 0.5f ) = %f, frac( y - 0.5f ) = %f, " "frac( z - 0.5f ) = %f\n", EXTRACT_F(fracCoord, 0), EXTRACT_F(fracCoord, 1), EXTRACT_F(fracCoord, 2)); #ifdef __AVX__ __m256d alphaBetaGamma = _mm256_cvtps_pd(fracCoord); // x g b a __m256d invABG = _mm256_sub_pd(_mm256_set1_pd(1.0), alphaBetaGamma); // x 1-g 1-b 1-a __m256d alphaBeta = _mm256_permute2f128_pd(alphaBetaGamma, invABG, 0x20); __m256d weights[2][2][2]; // 1-g 1-g 1-g 1-g // 1-b 1-a b a // 1-a b a 1-b // a(1-b)(1-g) (1-a)b(1-g) a*b(1-g) a(1-b)(1-g) // 000 010 011 001 // g g g g // 1-b 1-a b a // 1-a b a 1-b // (1-a)(1-b)g (1-a)b*g a*b*g a(1-b)g // 100 110 111 101 #ifdef __AVX2__ __m256d invGamma = _mm256_permute4x64_pd(invABG, _MM_SHUFFLE(2, 2, 2, 2)); weights[1][0][0] = _mm256_mul_pd( alphaBeta, _mm256_permute4x64_pd(alphaBeta, _MM_SHUFFLE(2, 1, 0, 3))); weights[0][0][0] = _mm256_mul_pd(weights[1][0][0], invGamma); __m256d gamma = _mm256_permute4x64_pd(alphaBetaGamma, _MM_SHUFFLE(2, 2, 2, 2)); weights[1][0][0] = _mm256_mul_pd(weights[1][0][0], gamma); weights[0][0][1] = _mm256_broadcastsd_pd(_mm256_castpd256_pd128(weights[0][0][0])); weights[0][1][1] = _mm256_permute4x64_pd(weights[0][0][0], _MM_SHUFFLE(1, 1, 1, 1)); weights[1][0][1] = _mm256_broadcastsd_pd(_mm256_castpd256_pd128(weights[1][0][0])); weights[1][1][1] = _mm256_permute4x64_pd(weights[1][0][0], _MM_SHUFFLE(1, 1, 1, 1)); weights[0][1][0] = _mm256_permute4x64_pd(weights[0][0][0], _MM_SHUFFLE(2, 2, 2, 2)); weights[0][0][0] = _mm256_permute4x64_pd(weights[0][0][0], _MM_SHUFFLE(3, 3, 3, 3)); weights[1][1][0] = _mm256_permute4x64_pd(weights[1][0][0], _MM_SHUFFLE(2, 2, 2, 2)); weights[1][0][0] = _mm256_permute4x64_pd(weights[1][0][0], _MM_SHUFFLE(3, 3, 3, 3)); #else // Much like before, we must permute the elements of alphaBeta to // get them in the form we need __m256d invGamma = _mm256_permute2f128( _mm256_undefined_pd(), _mm256_movedup_pd(invABG), 0x33); weights[1][0][0] = _mm_mul_pd( alphaBeta, _mm256_shuffle_pd( alphaBeta, _mm256_permute2f128_pd(alphaBeta, alphaBeta, 0x01), 0x05)); weights[0][0][0] = _mm_mul_pd(weights[1], invGamma); __m256d gamma = _mm256_permute2f128( _mm256_undefined_pd(), _mm256_movedup_pd(alphaBetaGamma), 0x33); weights[1][0][0] = _mm_mul_pd(weights[1], gamma); weights[0][1][0] = _mm256_movedup_pd(weights[0][0][0]); weights[0][0][0] = _mm256_permute_pd(weights[0][0][0], 0x0F); weights[1][1][0] = _mm256_movedup_pd(weights[1][0][0]); weights[1][0][0] = _mm256_permute_pd(weights[1][0][0], 0x0F); weights[0][0][1] = _mm256_permute2f128_pd(_mm256_undefined_pd(), weights[0][1][0], 0x22); weights[0][1][1] = _mm256_permute2f128_pd(_mm256_undefined_pd(), weights[0][0][0], 0x22); weights[1][0][1] = _mm256_permute2f128_pd(_mm256_undefined_pd(), weights[1][1][0], 0x22); weights[1][1][1] = _mm256_permute2f128_pd(_mm256_undefined_pd(), weights[1][0][0], 0x22); weights[0][1][0] = _mm256_permute2f128_pd(_mm256_undefined_pd(), weights[0][1][0], 0x33); weights[0][0][0] = _mm256_permute2f128_pd(_mm256_undefined_pd(), weights[0][0][0], 0x33); weights[1][1][0] = _mm256_permute2f128_pd(_mm256_undefined_pd(), weights[1][1][0], 0x33); weights[1][0][0] = _mm256_permute2f128_pd(_mm256_undefined_pd(), weights[1][0][0], 0x33); #endif #ifdef __FMA4__ _mm_storeu_ps( outData, _mm256_cvtpd_ps(_mm256_add_pd( _mm256_add_pd( _mm256_macc_pd(_mm256_cvtps_pd(upLeftA), weights[0][0][0], _mm256_mul_pd(_mm256_cvtps_pd(upRightA), weights[0][0][1])), _mm256_macc_pd(_mm256_cvtps_pd(lowLeftA), weights[0][1][0], _mm256_mul_pd(_mm256_cvtps_pd(lowRightA), weights[0][1][1]))), _mm256_add_pd( _mm256_macc_pd(_mm256_cvtps_pd(upLeftB), weights[1][0][0], _mm256_mul_pd(_mm256_cvtps_pd(upRightB), weights[1][0][1])), _mm256_macc_pd(_mm256_cvtps_pd(lowLeftB), weights[1][1][0], _mm256_mul_pd(_mm256_cvtps_pd(lowRightB), weights[1][1][1])))))); #elif defined(__FMA__) _mm_storeu_ps( outData, _mm256_cvtpd_ps(_mm256_add_pd( _mm256_add_pd( _mm256_fmadd_pd(_mm256_cvtps_pd(upLeftA), weights[0][0][0], _mm256_mul_pd(_mm256_cvtps_pd(upRightA), weights[0][0][1])), _mm256_fmadd_pd( _mm256_cvtps_pd(lowLeftA), weights[0][1][0], _mm256_mul_pd(_mm256_cvtps_pd(lowRightA), weights[0][1][1]))), _mm256_add_pd( _mm256_fmadd_pd(_mm256_cvtps_pd(upLeftB), weights[1][0][0], _mm256_mul_pd(_mm256_cvtps_pd(upRightB), weights[1][0][1])), _mm256_fmadd_pd( _mm256_cvtps_pd(lowLeftB), weights[1][1][0], _mm256_mul_pd(_mm256_cvtps_pd(lowRightB), weights[1][1][1])))))); #else _mm_storeu_ps( outData, _mm256_cvtpd_ps(_mm256_add_pd( _mm256_add_pd( _mm256_add_pd(_mm256_mul_pd(_mm256_cvtps_pd(upLeftA), weights[0][0][0]), _mm256_mul_pd(_mm256_cvtps_pd(upRightA), weights[0][0][1])), _mm256_add_pd(_mm256_mul_pd(_mm256_cvtps_pd(lowLeftA), weights[0][1][0]), _mm256_mul_pd(_mm256_cvtps_pd(lowRightA), weights[0][1][1]))), _mm256_add_pd( _mm256_add_pd(_mm256_mul_pd(_mm256_cvtps_pd(upLeftB), weights[1][0][0]), _mm256_mul_pd(_mm256_cvtps_pd(upRightB), weights[1][0][1])), _mm256_add_pd(_mm256_mul_pd(_mm256_cvtps_pd(lowLeftB), weights[1][1][0]), _mm256_mul_pd(_mm256_cvtps_pd(lowRightB), weights[1][1][1])))))); #endif #else __m128d alphaBeta = _mm_cvtps_pd(fracCoord); // b a __m128d invAlphaBeta = _mm_sub_pd(_mm_set1_pd(1.0), alphaBeta); // 1-b 1-a __m128d gamma = _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), fracCoord)); #ifdef __SSE3__ gamma = _mm_movedup_pd(gamma); #else gamma = _mm_unpacklo_pd(gamma, gamma); #endif __m128d invGamma = _mm_sub_pd(_mm_set1_pd(1.0), gamma); __m128d weights[4]; // <1-a b> * <1-b 1-a> = <(1-a)(1-b) (1-a)b> <00 10> weights[0] = _mm_mul_pd( _mm_shuffle_pd(invAlphaBeta, alphaBeta, _MM_SHUFFLE2(1, 0)), invAlphaBeta); // * = <11 01> weights[1] = _mm_mul_pd( alphaBeta, _mm_shuffle_pd(invAlphaBeta, alphaBeta, _MM_SHUFFLE2(0, 1))); weights[2] = _mm_mul_pd(weights[0], gamma); weights[3] = _mm_mul_pd(weights[1], gamma); weights[0] = _mm_mul_pd(weights[0], invGamma); weights[1] = _mm_mul_pd(weights[1], invGamma); __m128d upLeftAL = _mm_cvtps_pd(upLeftA); __m128d upLeftAH = _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), upLeftA)); __m128d upLeftBL = _mm_cvtps_pd(upLeftB); __m128d upLeftBH = _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), upLeftB)); __m128d upRightAL = _mm_cvtps_pd(upRightA); __m128d upRightAH = _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), upRightA)); __m128d upRightBL = _mm_cvtps_pd(upRightB); __m128d upRightBH = _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), upRightB)); __m128d lowLeftAL = _mm_cvtps_pd(lowLeftA); __m128d lowLeftAH = _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), lowLeftA)); __m128d lowLeftBL = _mm_cvtps_pd(lowLeftB); __m128d lowLeftBH = _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), lowLeftB)); __m128d lowRightAL = _mm_cvtps_pd(lowRightA); __m128d lowRightAH = _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), lowRightA)); __m128d lowRightBL = _mm_cvtps_pd(lowRightB); __m128d lowRightBH = _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), lowRightB)); #ifdef __SSE4_1__ __m128d rg = _mm_or_pd( _mm_dp_pd(weights[0], _mm_unpacklo_pd(lowLeftAL, upLeftAL), 0x31), _mm_dp_pd(weights[0], _mm_unpackhi_pd(lowLeftAL, upLeftAL), 0x32)); __m128d ba = _mm_or_pd( _mm_dp_pd(weights[0], _mm_unpacklo_pd(lowLeftAH, upLeftAH), 0x31), _mm_dp_pd(weights[0], _mm_unpackhi_pd(lowLeftAH, upLeftAH), 0x32)); rg = _mm_add_pd( rg, _mm_or_pd( _mm_dp_pd(weights[1], _mm_unpacklo_pd(upRightAL, lowRightAL), 0x31), _mm_dp_pd(weights[1], _mm_unpackhi_pd(upRightAL, lowRightAL), 0x32))); ba = _mm_add_pd( ba, _mm_or_pd( _mm_dp_pd(weights[1], _mm_unpacklo_pd(upRightAH, lowRightAH), 0x31), _mm_dp_pd(weights[1], _mm_unpackhi_pd(upRightAH, lowRightAH), 0x32))); rg = _mm_add_pd( rg, _mm_or_pd(_mm_dp_pd(weights[2], _mm_unpacklo_pd(lowLeftBL, upLeftBL), 0x31), _mm_dp_pd(weights[2], _mm_unpackhi_pd(lowLeftBL, upLeftBL), 0x32))); ba = _mm_add_pd( ba, _mm_or_pd(_mm_dp_pd(weights[2], _mm_unpacklo_pd(lowLeftBH, upLeftBH), 0x31), _mm_dp_pd(weights[2], _mm_unpackhi_pd(lowLeftBH, upLeftBH), 0x32))); rg = _mm_add_pd( rg, _mm_or_pd( _mm_dp_pd(weights[3], _mm_unpacklo_pd(upRightBL, lowRightBL), 0x31), _mm_dp_pd(weights[3], _mm_unpackhi_pd(upRightBL, lowRightBL), 0x32))); ba = _mm_add_pd( ba, _mm_or_pd( _mm_dp_pd(weights[3], _mm_unpacklo_pd(upRightBH, lowRightBH), 0x31), _mm_dp_pd(weights[3], _mm_unpackhi_pd(upRightBH, lowRightBH), 0x32))); #else #ifdef __SSE3__ __m128d weight010 = _mm_movedup_pd(weights[0]); __m128d weight001 = _mm_movedup_pd(weights[1]); #else __m128d weight010 = _mm_unpacklo_pd(weights[0], weights[0]); __m128d weight001 = _mm_unpacklo_pd(weights[1], weights[1]); #endif __m128d rg = _mm_add_pd(_mm_mul_pd(weight001, upRightAL), _mm_mul_pd(weight010, lowLeftAL)); __m128d ba = _mm_add_pd(_mm_mul_pd(weight001, upRightAH), _mm_mul_pd(weight010, lowLeftAH)); __m128d weight000 = _mm_unpackhi_pd(weights[0], weights[0]); __m128d weight011 = _mm_unpackhi_pd(weights[1], weights[1]); rg = _mm_add_pd(rg, _mm_add_pd(_mm_mul_pd(weight000, upLeftAL), _mm_mul_pd(weight011, lowRightAL))); ba = _mm_add_pd(ba, _mm_add_pd(_mm_mul_pd(weight000, upLeftAH), _mm_mul_pd(weight011, lowRightAH))); #ifdef __SSE3__ __m128d weight110 = _mm_movedup_pd(weights[2]); __m128d weight101 = _mm_movedup_pd(weights[3]); #else __m128d weight110 = _mm_unpacklo_pd(weights[2], weights[2]); __m128d weight101 = _mm_unpacklo_pd(weights[3], weights[3]); #endif rg = _mm_add_pd(rg, _mm_add_pd(_mm_mul_pd(weight101, upRightBL), _mm_mul_pd(weight110, lowLeftBL))); ba = _mm_add_pd(rg, _mm_add_pd(_mm_mul_pd(weight101, upRightBH), _mm_mul_pd(weight110, lowLeftBH))); __m128d weight100 = _mm_unpackhi_pd(weights[2], weights[2]); __m128d weight111 = _mm_unpackhi_pd(weights[3], weights[3]); rg = _mm_add_pd(rg, _mm_add_pd(_mm_mul_pd(weight100, upLeftBL), _mm_mul_pd(weight111, lowRightBL))); ba = _mm_add_pd(ba, _mm_add_pd(_mm_mul_pd(weight100, upLeftBH), _mm_mul_pd(weight111, lowRightBH))); #endif _mm_storeu_ps(outData, _mm_movelh_ps(_mm_cvtpd_ps(rg), _mm_cvtpd_ps(ba))); #endif if (NULL == containsDenorms) _mm_setcsr(mxcsr); } return returnVal; } } #else FloatPixel sample_image_pixel_float_offset( void *imageData, image_descriptor *imageInfo, float x, float y, float z, float xAddressOffset, float yAddressOffset, float zAddressOffset, image_sampler_data *imageSampler, float *outData, int verbose, int *containsDenorms, int lod) { AddressFn adFn = sAddressingTable[imageSampler]; FloatPixel returnVal; size_t width_lod = imageInfo->width, height_lod = imageInfo->height, depth_lod = imageInfo->depth; size_t slice_pitch_lod = 0, row_pitch_lod = 0; if (imageInfo->num_mip_levels > 1) { switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE3D: depth_lod = (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1; case CL_MEM_OBJECT_IMAGE2D: case CL_MEM_OBJECT_IMAGE2D_ARRAY: height_lod = (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1; default: width_lod = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1; } row_pitch_lod = width_lod * get_pixel_size(imageInfo->format); if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) slice_pitch_lod = row_pitch_lod; else if (imageInfo->type == CL_MEM_OBJECT_IMAGE3D || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY) slice_pitch_lod = row_pitch_lod * height_lod; } else { slice_pitch_lod = imageInfo->slicePitch; row_pitch_lod = imageInfo->rowPitch; } if (containsDenorms) *containsDenorms = 0; if (imageSampler->normalized_coords) { // We need to unnormalize our coordinates differently depending on // the image type, but 'x' is always processed the same way. x = unnormalize_coordinate("x", x, xAddressOffset, (float)width_lod, imageSampler->addressing_mode, verbose); switch (imageInfo->type) { // The image array types require special care: case CL_MEM_OBJECT_IMAGE1D_ARRAY: z = 0; // don't care -- unused for 1D arrays break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: y = unnormalize_coordinate( "y", y, yAddressOffset, (float)height_lod, imageSampler->addressing_mode, verbose); break; // Everybody else: default: y = unnormalize_coordinate( "y", y, yAddressOffset, (float)height_lod, imageSampler->addressing_mode, verbose); z = unnormalize_coordinate( "z", z, zAddressOffset, (float)depth_lod, imageSampler->addressing_mode, verbose); } } else if (verbose) { switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE1D_ARRAY: log_info("Starting coordinate: %f, array index %f\n", x, y); break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: log_info("Starting coordinate: %f, %f, array index %f\n", x, y, z); break; case CL_MEM_OBJECT_IMAGE1D: case CL_MEM_OBJECT_IMAGE1D_BUFFER: log_info("Starting coordinate: %f\n", x); break; case CL_MEM_OBJECT_IMAGE2D: log_info("Starting coordinate: %f, %f\n", x, y); break; case CL_MEM_OBJECT_IMAGE3D: default: log_info("Starting coordinate: %f, %f, %f\n", x, y, z); } } // At this point, we have unnormalized coordinates. if (imageSampler->filter_mode == CL_FILTER_NEAREST) { int ix, iy, iz; // We apply the addressing function to the now-unnormalized // coordinates. Note that the array cases again require special // care, per section 8.4 in the OpenCL 1.2 Specification. ix = adFn(static_cast(floorf(x)), width_lod); switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE1D_ARRAY: iy = static_cast(calculate_array_index( y, (float)(imageInfo->arraySize - 1))); iz = 0; if (verbose) { log_info("\tArray index %f evaluates to %d\n", y, iy); } break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: iy = adFn(static_cast(floorf(y)), height_lod); iz = static_cast(calculate_array_index( z, (float)(imageInfo->arraySize - 1))); if (verbose) { log_info("\tArray index %f evaluates to %d\n", z, iz); } break; default: iy = adFn(static_cast(floorf(y)), height_lod); if (depth_lod != 0) iz = adFn(static_cast(floorf(z)), depth_lod); else iz = 0; } if (verbose) { if (iz) log_info( "\tReference integer coords calculated: { %d, %d, %d }\n", ix, iy, iz); else log_info("\tReference integer coords calculated: { %d, %d }\n", ix, iy); } read_image_pixel_float(imageData, imageInfo, ix, iy, iz, outData, lod); check_for_denorms(outData, containsDenorms); for (int i = 0; i < 4; i++) returnVal.p[i] = fabsf(outData[i]); return returnVal; } else { // Linear filtering cases. size_t width = width_lod, height = height_lod, depth = depth_lod; // Image arrays can use 2D filtering, but require us to walk into the // image a certain number of slices before reading. if (depth == 0 || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY || imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) { float array_index = 0; size_t layer_offset = 0; if (imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY) { array_index = calculate_array_index(z, (float)(imageInfo->arraySize - 1)); layer_offset = slice_pitch_lod * (size_t)array_index; } else if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) { array_index = calculate_array_index(y, (float)(imageInfo->arraySize - 1)); layer_offset = slice_pitch_lod * (size_t)array_index; // Set up y and height so that the filtering below is correct // 1D filtering on a single slice. height = 1; } int x1 = adFn(static_cast(floorf(x - 0.5f)), width); int y1 = 0; int x2 = adFn(static_cast(floorf(x - 0.5f) + 1), width); int y2 = 0; if ((imageInfo->type != CL_MEM_OBJECT_IMAGE1D) && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_BUFFER)) { y1 = adFn(static_cast(floorf(y - 0.5f)), height); y2 = adFn(static_cast(floorf(y - 0.5f) + 1), height); } else { y = 0.5f; } if (verbose) { log_info("\tActual integer coords used (i = floor(x-.5)): i0:{ " "%d, %d } and i1:{ %d, %d }\n", x1, y1, x2, y2); log_info("\tArray coordinate is %f\n", array_index); } // Walk to beginning of the 'correct' slice, if needed. char *imgPtr = ((char *)imageData) + layer_offset; float upLeft[4], upRight[4], lowLeft[4], lowRight[4]; float maxUp[4], maxLow[4]; read_image_pixel_float(imgPtr, imageInfo, x1, y1, 0, upLeft, lod); read_image_pixel_float(imgPtr, imageInfo, x2, y1, 0, upRight, lod); check_for_denorms(upLeft, containsDenorms); check_for_denorms(upRight, containsDenorms); pixelMax(upLeft, upRight, maxUp); read_image_pixel_float(imgPtr, imageInfo, x1, y2, 0, lowLeft, lod); read_image_pixel_float(imgPtr, imageInfo, x2, y2, 0, lowRight, lod); check_for_denorms(lowLeft, containsDenorms); check_for_denorms(lowRight, containsDenorms); pixelMax(lowLeft, lowRight, maxLow); pixelMax(maxUp, maxLow, returnVal.p); if (verbose) { if (NULL == containsDenorms) log_info("\tSampled pixels (rgba order, denorms flushed to " "zero):\n"); else log_info("\tSampled pixels (rgba order):\n"); log_info("\t\tp00: %f, %f, %f, %f\n", upLeft[0], upLeft[1], upLeft[2], upLeft[3]); log_info("\t\tp01: %f, %f, %f, %f\n", upRight[0], upRight[1], upRight[2], upRight[3]); log_info("\t\tp10: %f, %f, %f, %f\n", lowLeft[0], lowLeft[1], lowLeft[2], lowLeft[3]); log_info("\t\tp11: %f, %f, %f, %f\n", lowRight[0], lowRight[1], lowRight[2], lowRight[3]); } bool printMe = false; if (x1 <= 0 || x2 <= 0 || x1 >= (int)width - 1 || x2 >= (int)width - 1) printMe = true; if (y1 <= 0 || y2 <= 0 || y1 >= (int)height - 1 || y2 >= (int)height - 1) printMe = true; double weights[2][2]; weights[0][0] = weights[0][1] = 1.0 - frac(x - 0.5f); weights[1][0] = weights[1][1] = frac(x - 0.5f); weights[0][0] *= 1.0 - frac(y - 0.5f); weights[1][0] *= 1.0 - frac(y - 0.5f); weights[0][1] *= frac(y - 0.5f); weights[1][1] *= frac(y - 0.5f); if (verbose) log_info("\tfrac( x - 0.5f ) = %f, frac( y - 0.5f ) = %f\n", frac(x - 0.5f), frac(y - 0.5f)); for (int i = 0; i < 3; i++) { outData[i] = (float)((upLeft[i] * weights[0][0]) + (upRight[i] * weights[1][0]) + (lowLeft[i] * weights[0][1]) + (lowRight[i] * weights[1][1])); // flush subnormal results to zero if necessary if (NULL == containsDenorms && fabs(outData[i]) < FLT_MIN) outData[i] = copysignf(0.0f, outData[i]); } outData[3] = (float)((upLeft[3] * weights[0][0]) + (upRight[3] * weights[1][0]) + (lowLeft[3] * weights[0][1]) + (lowRight[3] * weights[1][1])); // flush subnormal results to zero if necessary if (NULL == containsDenorms && fabs(outData[3]) < FLT_MIN) outData[3] = copysignf(0.0f, outData[3]); } else { // 3D linear filtering int x1 = adFn(static_cast(floorf(x - 0.5f)), width_lod); int y1 = adFn(static_cast(floorf(y - 0.5f)), height_lod); int z1 = adFn(static_cast(floorf(z - 0.5f)), depth_lod); int x2 = adFn(static_cast(floorf(x - 0.5f) + 1), width_lod); int y2 = adFn(static_cast(floorf(y - 0.5f) + 1), height_lod); int z2 = adFn(static_cast(floorf(z - 0.5f) + 1), depth_lod); if (verbose) log_info("\tActual integer coords used (i = floor(x-.5)): " "i0:{%d, %d, %d} and i1:{%d, %d, %d}\n", x1, y1, z1, x2, y2, z2); float upLeftA[4], upRightA[4], lowLeftA[4], lowRightA[4]; float upLeftB[4], upRightB[4], lowLeftB[4], lowRightB[4]; float pixelMaxA[4], pixelMaxB[4]; read_image_pixel_float(imageData, imageInfo, x1, y1, z1, upLeftA, lod); read_image_pixel_float(imageData, imageInfo, x2, y1, z1, upRightA, lod); check_for_denorms(upLeftA, containsDenorms); check_for_denorms(upRightA, containsDenorms); pixelMax(upLeftA, upRightA, pixelMaxA); read_image_pixel_float(imageData, imageInfo, x1, y2, z1, lowLeftA, lod); read_image_pixel_float(imageData, imageInfo, x2, y2, z1, lowRightA, lod); check_for_denorms(lowLeftA, containsDenorms); check_for_denorms(lowRightA, containsDenorms); pixelMax(lowLeftA, lowRightA, pixelMaxB); pixelMax(pixelMaxA, pixelMaxB, returnVal.p); read_image_pixel_float(imageData, imageInfo, x1, y1, z2, upLeftB, lod); read_image_pixel_float(imageData, imageInfo, x2, y1, z2, upRightB, lod); check_for_denorms(upLeftB, containsDenorms); check_for_denorms(upRightB, containsDenorms); pixelMax(upLeftB, upRightB, pixelMaxA); read_image_pixel_float(imageData, imageInfo, x1, y2, z2, lowLeftB, lod); read_image_pixel_float(imageData, imageInfo, x2, y2, z2, lowRightB, lod); check_for_denorms(lowLeftB, containsDenorms); check_for_denorms(lowRightB, containsDenorms); pixelMax(lowLeftB, lowRightB, pixelMaxB); pixelMax(pixelMaxA, pixelMaxB, pixelMaxA); pixelMax(pixelMaxA, returnVal.p, returnVal.p); if (verbose) { if (NULL == containsDenorms) log_info("\tSampled pixels (rgba order, denorms flushed to " "zero):\n"); else log_info("\tSampled pixels (rgba order):\n"); log_info("\t\tp000: %f, %f, %f, %f\n", upLeftA[0], upLeftA[1], upLeftA[2], upLeftA[3]); log_info("\t\tp001: %f, %f, %f, %f\n", upRightA[0], upRightA[1], upRightA[2], upRightA[3]); log_info("\t\tp010: %f, %f, %f, %f\n", lowLeftA[0], lowLeftA[1], lowLeftA[2], lowLeftA[3]); log_info("\t\tp011: %f, %f, %f, %f\n\n", lowRightA[0], lowRightA[1], lowRightA[2], lowRightA[3]); log_info("\t\tp100: %f, %f, %f, %f\n", upLeftB[0], upLeftB[1], upLeftB[2], upLeftB[3]); log_info("\t\tp101: %f, %f, %f, %f\n", upRightB[0], upRightB[1], upRightB[2], upRightB[3]); log_info("\t\tp110: %f, %f, %f, %f\n", lowLeftB[0], lowLeftB[1], lowLeftB[2], lowLeftB[3]); log_info("\t\tp111: %f, %f, %f, %f\n", lowRightB[0], lowRightB[1], lowRightB[2], lowRightB[3]); } double weights[2][2][2]; float a = frac(x - 0.5f), b = frac(y - 0.5f), c = frac(z - 0.5f); weights[0][0][0] = weights[0][1][0] = weights[0][0][1] = weights[0][1][1] = 1.f - a; weights[1][0][0] = weights[1][1][0] = weights[1][0][1] = weights[1][1][1] = a; weights[0][0][0] *= 1.f - b; weights[1][0][0] *= 1.f - b; weights[0][0][1] *= 1.f - b; weights[1][0][1] *= 1.f - b; weights[0][1][0] *= b; weights[1][1][0] *= b; weights[0][1][1] *= b; weights[1][1][1] *= b; weights[0][0][0] *= 1.f - c; weights[0][1][0] *= 1.f - c; weights[1][0][0] *= 1.f - c; weights[1][1][0] *= 1.f - c; weights[0][0][1] *= c; weights[0][1][1] *= c; weights[1][0][1] *= c; weights[1][1][1] *= c; if (verbose) log_info("\tfrac( x - 0.5f ) = %f, frac( y - 0.5f ) = %f, " "frac( z - 0.5f ) = %f\n", frac(x - 0.5f), frac(y - 0.5f), frac(z - 0.5f)); for (int i = 0; i < 3; i++) { outData[i] = (float)((upLeftA[i] * weights[0][0][0]) + (upRightA[i] * weights[1][0][0]) + (lowLeftA[i] * weights[0][1][0]) + (lowRightA[i] * weights[1][1][0]) + (upLeftB[i] * weights[0][0][1]) + (upRightB[i] * weights[1][0][1]) + (lowLeftB[i] * weights[0][1][1]) + (lowRightB[i] * weights[1][1][1])); // flush subnormal results to zero if necessary if (NULL == containsDenorms && fabs(outData[i]) < FLT_MIN) outData[i] = copysignf(0.0f, outData[i]); } outData[3] = (float)((upLeftA[3] * weights[0][0][0]) + (upRightA[3] * weights[1][0][0]) + (lowLeftA[3] * weights[0][1][0]) + (lowRightA[3] * weights[1][1][0]) + (upLeftB[3] * weights[0][0][1]) + (upRightB[3] * weights[1][0][1]) + (lowLeftB[3] * weights[0][1][1]) + (lowRightB[3] * weights[1][1][1])); // flush subnormal results to zero if necessary if (NULL == containsDenorms && fabs(outData[3]) < FLT_MIN) outData[3] = copysignf(0.0f, outData[3]); } return returnVal; } } #endif FloatPixel sample_image_pixel_float_offset( void *imageData, image_descriptor *imageInfo, float x, float y, float z, float xAddressOffset, float yAddressOffset, float zAddressOffset, image_sampler_data *imageSampler, float *outData, int verbose, int *containsDenorms) { return sample_image_pixel_float_offset( imageData, imageInfo, x, y, z, xAddressOffset, yAddressOffset, zAddressOffset, imageSampler, outData, verbose, containsDenorms, 0); } int debug_find_vector_in_image(void *imagePtr, image_descriptor *imageInfo, void *vectorToFind, size_t vectorSize, int *outX, int *outY, int *outZ, size_t lod) { int foundCount = 0; char *iPtr = (char *)imagePtr; size_t width; size_t depth; size_t height; size_t row_pitch; size_t slice_pitch; switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE1D: width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1; height = 1; depth = 1; break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1; height = 1; depth = imageInfo->arraySize; break; case CL_MEM_OBJECT_IMAGE2D: width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1; height = (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1; depth = 1; break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1; height = (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1; depth = imageInfo->arraySize; break; case CL_MEM_OBJECT_IMAGE3D: width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1; height = (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1; depth = (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1; break; } row_pitch = width * get_pixel_size(imageInfo->format); slice_pitch = row_pitch * height; for (size_t z = 0; z < depth; z++) { for (size_t y = 0; y < height; y++) { for (size_t x = 0; x < width; x++) { if (memcmp(iPtr, vectorToFind, vectorSize) == 0) { if (foundCount == 0) { *outX = (int)x; if (outY != NULL) *outY = (int)y; if (outZ != NULL) *outZ = (int)z; } foundCount++; } iPtr += vectorSize; } iPtr += row_pitch - (width * vectorSize); } iPtr += slice_pitch - (height * row_pitch); } return foundCount; } int debug_find_pixel_in_image(void *imagePtr, image_descriptor *imageInfo, unsigned int *valuesToFind, int *outX, int *outY, int *outZ, int lod) { char vectorToFind[4 * 4]; size_t vectorSize = get_format_channel_count(imageInfo->format); if (imageInfo->format->image_channel_data_type == CL_UNSIGNED_INT8) { unsigned char *p = (unsigned char *)vectorToFind; for (unsigned int i = 0; i < vectorSize; i++) p[i] = (unsigned char)valuesToFind[i]; } else if (imageInfo->format->image_channel_data_type == CL_UNSIGNED_INT16) { unsigned short *p = (unsigned short *)vectorToFind; for (unsigned int i = 0; i < vectorSize; i++) p[i] = (unsigned short)valuesToFind[i]; vectorSize *= 2; } else if (imageInfo->format->image_channel_data_type == CL_UNSIGNED_INT32) { unsigned int *p = (unsigned int *)vectorToFind; for (unsigned int i = 0; i < vectorSize; i++) p[i] = (unsigned int)valuesToFind[i]; vectorSize *= 4; } else { log_info("WARNING: Unable to search for debug pixel: invalid image " "format\n"); return false; } return debug_find_vector_in_image(imagePtr, imageInfo, vectorToFind, vectorSize, outX, outY, outZ, lod); } int debug_find_pixel_in_image(void *imagePtr, image_descriptor *imageInfo, int *valuesToFind, int *outX, int *outY, int *outZ, int lod) { char vectorToFind[4 * 4]; size_t vectorSize = get_format_channel_count(imageInfo->format); if (imageInfo->format->image_channel_data_type == CL_SIGNED_INT8) { char *p = (char *)vectorToFind; for (unsigned int i = 0; i < vectorSize; i++) p[i] = (char)valuesToFind[i]; } else if (imageInfo->format->image_channel_data_type == CL_SIGNED_INT16) { short *p = (short *)vectorToFind; for (unsigned int i = 0; i < vectorSize; i++) p[i] = (short)valuesToFind[i]; vectorSize *= 2; } else if (imageInfo->format->image_channel_data_type == CL_SIGNED_INT32) { int *p = (int *)vectorToFind; for (unsigned int i = 0; i < vectorSize; i++) p[i] = (int)valuesToFind[i]; vectorSize *= 4; } else { log_info("WARNING: Unable to search for debug pixel: invalid image " "format\n"); return false; } return debug_find_vector_in_image(imagePtr, imageInfo, vectorToFind, vectorSize, outX, outY, outZ, lod); } int debug_find_pixel_in_image(void *imagePtr, image_descriptor *imageInfo, float *valuesToFind, int *outX, int *outY, int *outZ, int lod) { char vectorToFind[4 * 4]; float swizzled[4]; memcpy(swizzled, valuesToFind, sizeof(swizzled)); size_t vectorSize = get_pixel_size(imageInfo->format); pack_image_pixel(swizzled, imageInfo->format, vectorToFind); return debug_find_vector_in_image(imagePtr, imageInfo, vectorToFind, vectorSize, outX, outY, outZ, lod); } template void swizzle_vector_for_image(T *srcVector, const cl_image_format *imageFormat) { T temp; switch (imageFormat->image_channel_order) { case CL_A: srcVector[0] = srcVector[3]; break; case CL_R: case CL_Rx: case CL_RG: case CL_RGx: case CL_RGB: case CL_RGBx: case CL_RGBA: case CL_sRGB: case CL_sRGBx: case CL_sRGBA: break; case CL_RA: srcVector[1] = srcVector[3]; break; case CL_ARGB: temp = srcVector[3]; srcVector[3] = srcVector[2]; srcVector[2] = srcVector[1]; srcVector[1] = srcVector[0]; srcVector[0] = temp; break; case CL_ABGR: temp = srcVector[3]; srcVector[3] = srcVector[0]; srcVector[0] = temp; temp = srcVector[2]; srcVector[2] = srcVector[1]; srcVector[1] = temp; break; case CL_BGRA: case CL_sBGRA: temp = srcVector[0]; srcVector[0] = srcVector[2]; srcVector[2] = temp; break; case CL_INTENSITY: srcVector[3] = srcVector[0]; srcVector[2] = srcVector[0]; srcVector[1] = srcVector[0]; break; case CL_LUMINANCE: srcVector[2] = srcVector[0]; srcVector[1] = srcVector[0]; break; #ifdef CL_1RGB_APPLE case CL_1RGB_APPLE: temp = srcVector[3]; srcVector[3] = srcVector[2]; srcVector[2] = srcVector[1]; srcVector[1] = srcVector[0]; srcVector[0] = temp; break; #endif #ifdef CL_BGR1_APPLE case CL_BGR1_APPLE: temp = srcVector[0]; srcVector[0] = srcVector[2]; srcVector[2] = temp; break; #endif } } #define SATURATE(v, min, max) (v < min ? min : (v > max ? max : v)) void pack_image_pixel(unsigned int *srcVector, const cl_image_format *imageFormat, void *outData) { swizzle_vector_for_image(srcVector, imageFormat); size_t channelCount = get_format_channel_count(imageFormat); switch (imageFormat->image_channel_data_type) { case CL_UNSIGNED_INT8: { unsigned char *ptr = (unsigned char *)outData; for (unsigned int i = 0; i < channelCount; i++) ptr[i] = (unsigned char)SATURATE(srcVector[i], 0, 255); break; } case CL_UNSIGNED_INT16: { unsigned short *ptr = (unsigned short *)outData; for (unsigned int i = 0; i < channelCount; i++) ptr[i] = (unsigned short)SATURATE(srcVector[i], 0, 65535); break; } case CL_UNSIGNED_INT32: { unsigned int *ptr = (unsigned int *)outData; for (unsigned int i = 0; i < channelCount; i++) ptr[i] = (unsigned int)srcVector[i]; break; } default: break; } } void pack_image_pixel(int *srcVector, const cl_image_format *imageFormat, void *outData) { swizzle_vector_for_image(srcVector, imageFormat); size_t chanelCount = get_format_channel_count(imageFormat); switch (imageFormat->image_channel_data_type) { case CL_SIGNED_INT8: { char *ptr = (char *)outData; for (unsigned int i = 0; i < chanelCount; i++) ptr[i] = (char)SATURATE(srcVector[i], -128, 127); break; } case CL_SIGNED_INT16: { short *ptr = (short *)outData; for (unsigned int i = 0; i < chanelCount; i++) ptr[i] = (short)SATURATE(srcVector[i], -32768, 32767); break; } case CL_SIGNED_INT32: { int *ptr = (int *)outData; for (unsigned int i = 0; i < chanelCount; i++) ptr[i] = (int)srcVector[i]; break; } default: break; } } cl_int round_to_even(float v) { // clamp overflow if (v >= -(float)CL_INT_MIN) return CL_INT_MAX; if (v <= (float)CL_INT_MIN) return CL_INT_MIN; // round fractional values to integer value if (fabsf(v) < MAKE_HEX_FLOAT(0x1.0p23f, 0x1L, 23)) { static const float magic[2] = { MAKE_HEX_FLOAT(0x1.0p23f, 0x1L, 23), MAKE_HEX_FLOAT(-0x1.0p23f, -0x1L, 23) }; float magicVal = magic[v < 0.0f]; v += magicVal; v -= magicVal; } return (cl_int)v; } void pack_image_pixel(float *srcVector, const cl_image_format *imageFormat, void *outData) { swizzle_vector_for_image(srcVector, imageFormat); size_t channelCount = get_format_channel_count(imageFormat); switch (imageFormat->image_channel_data_type) { case CL_HALF_FLOAT: { cl_half *ptr = (cl_half *)outData; switch (gFloatToHalfRoundingMode) { case kRoundToNearestEven: for (unsigned int i = 0; i < channelCount; i++) ptr[i] = cl_half_from_float(srcVector[i], CL_HALF_RTE); break; case kRoundTowardZero: for (unsigned int i = 0; i < channelCount; i++) ptr[i] = cl_half_from_float(srcVector[i], CL_HALF_RTZ); break; default: log_error("ERROR: Test internal error -- unhandled or " "unknown float->half rounding mode.\n"); exit(-1); break; } break; } case CL_FLOAT: { cl_float *ptr = (cl_float *)outData; for (unsigned int i = 0; i < channelCount; i++) ptr[i] = srcVector[i]; break; } case CL_SNORM_INT8: { cl_char *ptr = (cl_char *)outData; for (unsigned int i = 0; i < channelCount; i++) ptr[i] = (cl_char)NORMALIZE_SIGNED(srcVector[i], -127.0f, 127.f); break; } case CL_SNORM_INT16: { cl_short *ptr = (cl_short *)outData; for (unsigned int i = 0; i < channelCount; i++) ptr[i] = (short)NORMALIZE_SIGNED(srcVector[i], -32767.f, 32767.f); break; } case CL_UNORM_INT8: { cl_uchar *ptr = (cl_uchar *)outData; if (is_sRGBA_order(imageFormat->image_channel_order)) { ptr[0] = (unsigned char)(sRGBmap(srcVector[0]) + 0.5); ptr[1] = (unsigned char)(sRGBmap(srcVector[1]) + 0.5); ptr[2] = (unsigned char)(sRGBmap(srcVector[2]) + 0.5); if (channelCount == 4) ptr[3] = (unsigned char)NORMALIZE(srcVector[3], 255.f); } else { for (unsigned int i = 0; i < channelCount; i++) ptr[i] = (unsigned char)NORMALIZE(srcVector[i], 255.f); } #ifdef CL_1RGB_APPLE if (imageFormat->image_channel_order == CL_1RGB_APPLE) ptr[0] = 255.0f; #endif #ifdef CL_BGR1_APPLE if (imageFormat->image_channel_order == CL_BGR1_APPLE) ptr[3] = 255.0f; #endif break; } case CL_UNORM_INT16: { cl_ushort *ptr = (cl_ushort *)outData; for (unsigned int i = 0; i < channelCount; i++) ptr[i] = (unsigned short)NORMALIZE(srcVector[i], 65535.f); break; } case CL_UNORM_SHORT_555: { cl_ushort *ptr = (cl_ushort *)outData; ptr[0] = (((unsigned short)NORMALIZE(srcVector[0], 31.f) & 31) << 10) | (((unsigned short)NORMALIZE(srcVector[1], 31.f) & 31) << 5) | (((unsigned short)NORMALIZE(srcVector[2], 31.f) & 31) << 0); break; } case CL_UNORM_SHORT_565: { cl_ushort *ptr = (cl_ushort *)outData; ptr[0] = (((unsigned short)NORMALIZE(srcVector[0], 31.f) & 31) << 11) | (((unsigned short)NORMALIZE(srcVector[1], 63.f) & 63) << 5) | (((unsigned short)NORMALIZE(srcVector[2], 31.f) & 31) << 0); break; } case CL_UNORM_INT_101010: { cl_uint *ptr = (cl_uint *)outData; ptr[0] = (((unsigned int)NORMALIZE(srcVector[0], 1023.f) & 1023) << 20) | (((unsigned int)NORMALIZE(srcVector[1], 1023.f) & 1023) << 10) | (((unsigned int)NORMALIZE(srcVector[2], 1023.f) & 1023) << 0); break; } case CL_SIGNED_INT8: { cl_char *ptr = (cl_char *)outData; for (unsigned int i = 0; i < channelCount; i++) ptr[i] = (cl_char)CONVERT_INT(srcVector[i], -127.0f, 127.f, 127); break; } case CL_SIGNED_INT16: { cl_short *ptr = (cl_short *)outData; for (unsigned int i = 0; i < channelCount; i++) ptr[i] = (short)CONVERT_INT(srcVector[i], -32767.f, 32767.f, 32767); break; } case CL_SIGNED_INT32: { cl_int *ptr = (cl_int *)outData; for (unsigned int i = 0; i < channelCount; i++) ptr[i] = round_to_even(srcVector[i]); break; } case CL_UNSIGNED_INT8: { cl_uchar *ptr = (cl_uchar *)outData; for (unsigned int i = 0; i < channelCount; i++) ptr[i] = (cl_uchar)CONVERT_UINT(srcVector[i], 255.f, CL_UCHAR_MAX); break; } case CL_UNSIGNED_INT16: { cl_ushort *ptr = (cl_ushort *)outData; for (unsigned int i = 0; i < channelCount; i++) ptr[i] = (cl_ushort)CONVERT_UINT(srcVector[i], 32767.f, CL_USHRT_MAX); break; } case CL_UNSIGNED_INT32: { cl_uint *ptr = (cl_uint *)outData; for (unsigned int i = 0; i < channelCount; i++) ptr[i] = (cl_uint)CONVERT_UINT( srcVector[i], MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffe, 31 - 23), CL_UINT_MAX); break; } #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: { cl_ushort *ptr = (cl_ushort *)outData; for (unsigned int i = 0; i < channelCount; i++) { cl_float f = fmaxf(srcVector[i], -1.0f); f = fminf(f, 3.0f); cl_int d = rintf(f * 0x1.0p14f); d += 16384; if (d > CL_USHRT_MAX) d = CL_USHRT_MAX; ptr[i] = d; } break; } #endif default: log_error("INTERNAL ERROR: unknown format (%d)\n", imageFormat->image_channel_data_type); exit(-1); break; } } void pack_image_pixel_error(const float *srcVector, const cl_image_format *imageFormat, const void *results, float *errors) { size_t channelCount = get_format_channel_count(imageFormat); switch (imageFormat->image_channel_data_type) { case CL_HALF_FLOAT: { const cl_half *ptr = (const cl_half *)results; for (unsigned int i = 0; i < channelCount; i++) errors[i] = Ulp_Error_Half(ptr[i], srcVector[i]); break; } case CL_FLOAT: { const cl_ushort *ptr = (const cl_ushort *)results; for (unsigned int i = 0; i < channelCount; i++) errors[i] = Ulp_Error(ptr[i], srcVector[i]); break; } case CL_SNORM_INT8: { const cl_char *ptr = (const cl_char *)results; for (unsigned int i = 0; i < channelCount; i++) errors[i] = ptr[i] - NORMALIZE_SIGNED_UNROUNDED(srcVector[i], -127.0f, 127.f); break; } case CL_SNORM_INT16: { const cl_short *ptr = (const cl_short *)results; for (unsigned int i = 0; i < channelCount; i++) errors[i] = ptr[i] - NORMALIZE_SIGNED_UNROUNDED(srcVector[i], -32767.f, 32767.f); break; } case CL_UNORM_INT8: { const cl_uchar *ptr = (const cl_uchar *)results; for (unsigned int i = 0; i < channelCount; i++) errors[i] = ptr[i] - NORMALIZE_UNROUNDED(srcVector[i], 255.f); break; } case CL_UNORM_INT16: { const cl_ushort *ptr = (const cl_ushort *)results; for (unsigned int i = 0; i < channelCount; i++) errors[i] = ptr[i] - NORMALIZE_UNROUNDED(srcVector[i], 65535.f); break; } case CL_UNORM_SHORT_555: { const cl_ushort *ptr = (const cl_ushort *)results; errors[0] = ((ptr[0] >> 10) & 31) - NORMALIZE_UNROUNDED(srcVector[0], 31.f); errors[1] = ((ptr[0] >> 5) & 31) - NORMALIZE_UNROUNDED(srcVector[1], 31.f); errors[2] = ((ptr[0] >> 0) & 31) - NORMALIZE_UNROUNDED(srcVector[2], 31.f); break; } case CL_UNORM_SHORT_565: { const cl_ushort *ptr = (const cl_ushort *)results; errors[0] = ((ptr[0] >> 11) & 31) - NORMALIZE_UNROUNDED(srcVector[0], 31.f); errors[1] = ((ptr[0] >> 5) & 63) - NORMALIZE_UNROUNDED(srcVector[1], 63.f); errors[2] = ((ptr[0] >> 0) & 31) - NORMALIZE_UNROUNDED(srcVector[2], 31.f); break; } case CL_UNORM_INT_101010: { const cl_uint *ptr = (const cl_uint *)results; errors[0] = ((ptr[0] >> 20) & 1023) - NORMALIZE_UNROUNDED(srcVector[0], 1023.f); errors[1] = ((ptr[0] >> 10) & 1023) - NORMALIZE_UNROUNDED(srcVector[1], 1023.f); errors[2] = ((ptr[0] >> 0) & 1023) - NORMALIZE_UNROUNDED(srcVector[2], 1023.f); break; } case CL_SIGNED_INT8: { const cl_char *ptr = (const cl_char *)results; for (unsigned int i = 0; i < channelCount; i++) errors[i] = ptr[i] - CONVERT_INT(srcVector[i], -127.0f, 127.f, 127); break; } case CL_SIGNED_INT16: { const cl_short *ptr = (const cl_short *)results; for (unsigned int i = 0; i < channelCount; i++) errors[i] = ptr[i] - CONVERT_INT(srcVector[i], -32767.f, 32767.f, 32767); break; } case CL_SIGNED_INT32: { const cl_int *ptr = (const cl_int *)results; for (unsigned int i = 0; i < channelCount; i++) errors[i] = (cl_float)((cl_long)ptr[i] - (cl_long)round_to_even(srcVector[i])); break; } case CL_UNSIGNED_INT8: { const cl_uchar *ptr = (const cl_uchar *)results; for (unsigned int i = 0; i < channelCount; i++) errors[i] = static_cast( (cl_int)ptr[i] - (cl_int)CONVERT_UINT(srcVector[i], 255.f, CL_UCHAR_MAX)); break; } case CL_UNSIGNED_INT16: { const cl_ushort *ptr = (const cl_ushort *)results; for (unsigned int i = 0; i < channelCount; i++) errors[i] = static_cast( (cl_int)ptr[i] - (cl_int)CONVERT_UINT(srcVector[i], 32767.f, CL_USHRT_MAX)); break; } case CL_UNSIGNED_INT32: { const cl_uint *ptr = (const cl_uint *)results; for (unsigned int i = 0; i < channelCount; i++) errors[i] = (cl_float)( (cl_long)ptr[i] - (cl_long)CONVERT_UINT( srcVector[i], MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffe, 31 - 23), CL_UINT_MAX)); break; } #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: { const cl_ushort *ptr = (const cl_ushort *)results; for (unsigned int i = 0; i < channelCount; i++) errors[i] = ptr[i] - NORMALIZE_SIGNED_UNROUNDED(((int)srcVector[i] - 16384), -16384.f, 49151.f); break; } #endif default: log_error("INTERNAL ERROR: unknown format (%d)\n", imageFormat->image_channel_data_type); exit(-1); break; } } // // Autodetect which rounding mode is used for image writes to CL_HALF_FLOAT // This should be called lazily before attempting to verify image writes, // otherwise an error will occur. // int DetectFloatToHalfRoundingMode( cl_command_queue q) // Returns CL_SUCCESS on success { cl_int err = CL_SUCCESS; if (gFloatToHalfRoundingMode == kDefaultRoundingMode) { // Some numbers near 0.5f, that we look at to see how the values are // rounded. static const cl_uint inData[4 * 4] = { 0x3f000fffU, 0x3f001000U, 0x3f001001U, 0U, 0x3f001fffU, 0x3f002000U, 0x3f002001U, 0U, 0x3f002fffU, 0x3f003000U, 0x3f003001U, 0U, 0x3f003fffU, 0x3f004000U, 0x3f004001U, 0U }; static const size_t count = sizeof(inData) / (4 * sizeof(inData[0])); const float *inp = (const float *)inData; cl_context context = NULL; // Create an input buffer err = clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(context), &context, NULL); if (err) { log_error("Error: could not get context from command queue in " "DetectFloatToHalfRoundingMode (%d)", err); return err; } cl_mem inBuf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR, sizeof(inData), (void *)inData, &err); if (NULL == inBuf || err) { log_error("Error: could not create input buffer in " "DetectFloatToHalfRoundingMode (err: %d)", err); return err; } // Create a small output image cl_image_format fmt = { CL_RGBA, CL_HALF_FLOAT }; cl_mem outImage = create_image_2d(context, CL_MEM_WRITE_ONLY, &fmt, count, 1, 0, NULL, &err); if (NULL == outImage || err) { log_error("Error: could not create half float out image in " "DetectFloatToHalfRoundingMode (err: %d)", err); clReleaseMemObject(inBuf); return err; } // Create our program, and a kernel const char *kernelSource[1] = { "kernel void detect_round( global float4 *in, write_only image2d_t " "out )\n" "{\n" " write_imagef( out, (int2)(get_global_id(0),0), " "in[get_global_id(0)] );\n" "}\n" }; clProgramWrapper program; clKernelWrapper kernel; err = create_single_kernel_helper(context, &program, &kernel, 1, kernelSource, "detect_round"); if (NULL == program || err) { log_error("Error: could not create program in " "DetectFloatToHalfRoundingMode (err: %d)", err); clReleaseMemObject(inBuf); clReleaseMemObject(outImage); return err; } cl_device_id device = NULL; err = clGetCommandQueueInfo(q, CL_QUEUE_DEVICE, sizeof(device), &device, NULL); if (err) { log_error("Error: could not get device from command queue in " "DetectFloatToHalfRoundingMode (%d)", err); clReleaseMemObject(inBuf); clReleaseMemObject(outImage); return err; } err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inBuf); if (err) { log_error("Error: could not set argument 0 of kernel in " "DetectFloatToHalfRoundingMode (%d)", err); clReleaseMemObject(inBuf); clReleaseMemObject(outImage); return err; } err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outImage); if (err) { log_error("Error: could not set argument 1 of kernel in " "DetectFloatToHalfRoundingMode (%d)", err); clReleaseMemObject(inBuf); clReleaseMemObject(outImage); return err; } // Run the kernel size_t global_work_size = count; err = clEnqueueNDRangeKernel(q, kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL); if (err) { log_error("Error: could not enqueue kernel in " "DetectFloatToHalfRoundingMode (%d)", err); clReleaseMemObject(inBuf); clReleaseMemObject(outImage); return err; } // read the results cl_half outBuf[count * 4]; memset(outBuf, -1, sizeof(outBuf)); size_t origin[3] = { 0, 0, 0 }; size_t region[3] = { count, 1, 1 }; err = clEnqueueReadImage(q, outImage, CL_TRUE, origin, region, 0, 0, outBuf, 0, NULL, NULL); if (err) { log_error("Error: could not read output image in " "DetectFloatToHalfRoundingMode (%d)", err); clReleaseMemObject(inBuf); clReleaseMemObject(outImage); return err; } // Generate our list of reference results cl_half rte_ref[count * 4]; cl_half rtz_ref[count * 4]; for (size_t i = 0; i < 4 * count; i++) { rte_ref[i] = cl_half_from_float(inp[i], CL_HALF_RTE); rtz_ref[i] = cl_half_from_float(inp[i], CL_HALF_RTZ); } // Verify that we got something in either rtz or rte mode if (0 == memcmp(rte_ref, outBuf, sizeof(rte_ref))) { log_info("Autodetected float->half rounding mode to be rte\n"); gFloatToHalfRoundingMode = kRoundToNearestEven; } else if (0 == memcmp(rtz_ref, outBuf, sizeof(rtz_ref))) { log_info("Autodetected float->half rounding mode to be rtz\n"); gFloatToHalfRoundingMode = kRoundTowardZero; } else { log_error("ERROR: float to half conversions proceed with invalid " "rounding mode!\n"); log_info("\nfor:"); for (size_t i = 0; i < count; i++) log_info(" {%a, %a, %a, %a},", inp[4 * i], inp[4 * i + 1], inp[4 * i + 2], inp[4 * i + 3]); log_info("\ngot:"); for (size_t i = 0; i < count; i++) log_info(" {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},", outBuf[4 * i], outBuf[4 * i + 1], outBuf[4 * i + 2], outBuf[4 * i + 3]); log_info("\nrte:"); for (size_t i = 0; i < count; i++) log_info(" {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},", rte_ref[4 * i], rte_ref[4 * i + 1], rte_ref[4 * i + 2], rte_ref[4 * i + 3]); log_info("\nrtz:"); for (size_t i = 0; i < count; i++) log_info(" {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},", rtz_ref[4 * i], rtz_ref[4 * i + 1], rtz_ref[4 * i + 2], rtz_ref[4 * i + 3]); log_info("\n"); err = -1; gFloatToHalfRoundingMode = kRoundingModeCount; // illegal value } // clean up clReleaseMemObject(inBuf); clReleaseMemObject(outImage); return err; } // Make sure that the rounding mode was successfully detected, if we checked // earlier if (gFloatToHalfRoundingMode != kRoundToNearestEven && gFloatToHalfRoundingMode != kRoundTowardZero) return -2; return err; } char *create_random_image_data(ExplicitType dataType, image_descriptor *imageInfo, BufferOwningPtr &P, MTdata d, bool image2DFromBuffer) { size_t allocSize, numPixels; if (/*gTestMipmaps*/ imageInfo->num_mip_levels > 1) { allocSize = (size_t)(compute_mipmapped_image_size(*imageInfo) * 4 * get_explicit_type_size(dataType)) / get_pixel_size(imageInfo->format); numPixels = allocSize / (get_explicit_type_size(dataType) * 4); } else { numPixels = (image2DFromBuffer ? imageInfo->rowPitch : imageInfo->width) * imageInfo->height * (imageInfo->depth ? imageInfo->depth : 1) * (imageInfo->arraySize ? imageInfo->arraySize : 1); allocSize = numPixels * 4 * get_explicit_type_size(dataType); } #if 0 // DEBUG { fprintf(stderr,"--- create_random_image_data:\n"); fprintf(stderr,"allocSize = %zu\n",allocSize); fprintf(stderr,"numPixels = %zu\n",numPixels); fprintf(stderr,"width = %zu\n",imageInfo->width); fprintf(stderr,"height = %zu\n",imageInfo->height); fprintf(stderr,"depth = %zu\n",imageInfo->depth); fprintf(stderr,"rowPitch = %zu\n",imageInfo->rowPitch); fprintf(stderr,"slicePitch = %zu\n",imageInfo->slicePitch); fprintf(stderr,"arraySize = %zu\n",imageInfo->arraySize); fprintf(stderr,"explicit_type_size = %zu\n",get_explicit_type_size(dataType)); } #endif #if defined(__APPLE__) char *data = NULL; if (gDeviceType == CL_DEVICE_TYPE_CPU) { size_t mapSize = ((allocSize + 4095L) & -4096L) + 8192; // alloc two extra pages. void *map = mmap(0, mapSize, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, 0, 0); if (map == MAP_FAILED) { perror("create_random_image_data: mmap"); log_error("%s:%d: mmap failed, mapSize = %zu\n", __FILE__, __LINE__, mapSize); } intptr_t data_end = (intptr_t)map + mapSize - 4096; data = (char *)(data_end - (intptr_t)allocSize); mprotect(map, 4096, PROT_NONE); mprotect((void *)((char *)map + mapSize - 4096), 4096, PROT_NONE); P.reset(data, map, mapSize); } else { data = (char *)malloc(allocSize); P.reset(data); } #else char *data = (char *)align_malloc(allocSize, get_pixel_alignment(imageInfo->format)); P.reset(data, NULL, 0, allocSize, true); #endif if (data == NULL) { log_error( "ERROR: Unable to malloc %zu bytes for create_random_image_data\n", allocSize); return NULL; } switch (dataType) { case kFloat: { float *inputValues = (float *)data; switch (imageInfo->format->image_channel_data_type) { case CL_HALF_FLOAT: { // Generate data that is (mostly) inside the range of a half // float const float HALF_MIN = 5.96046448e-08f; const float HALF_MAX = 65504.0f; size_t i = 0; inputValues[i++] = 0.f; inputValues[i++] = 1.f; inputValues[i++] = -1.f; inputValues[i++] = 2.f; for (; i < numPixels * 4; i++) inputValues[i] = get_random_float(-HALF_MAX - 2.f, HALF_MAX + 2.f, d); } break; #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: { size_t i = 0; if (numPixels * 4 >= 8) { inputValues[i++] = INFINITY; inputValues[i++] = 0x1.0p14f; inputValues[i++] = 0x1.0p31f; inputValues[i++] = 0x1.0p32f; inputValues[i++] = -INFINITY; inputValues[i++] = -0x1.0p14f; inputValues[i++] = -0x1.0p31f; inputValues[i++] = -0x1.1p31f; } for (; i < numPixels * 4; i++) inputValues[i] = get_random_float(-1.1f, 3.1f, d); } break; #endif case CL_FLOAT: { size_t i = 0; inputValues[i++] = INFINITY; inputValues[i++] = -INFINITY; inputValues[i++] = 0.0f; inputValues[i++] = 0.0f; cl_uint *p = (cl_uint *)data; for (; i < numPixels * 4; i++) p[i] = genrand_int32(d); } break; default: size_t i = 0; if (numPixels * 4 >= 36) { inputValues[i++] = 0.0f; inputValues[i++] = 0.5f; inputValues[i++] = 31.5f; inputValues[i++] = 32.0f; inputValues[i++] = 127.5f; inputValues[i++] = 128.0f; inputValues[i++] = 255.5f; inputValues[i++] = 256.0f; inputValues[i++] = 1023.5f; inputValues[i++] = 1024.0f; inputValues[i++] = 32767.5f; inputValues[i++] = 32768.0f; inputValues[i++] = 65535.5f; inputValues[i++] = 65536.0f; inputValues[i++] = 2147483648.0f; inputValues[i++] = 4294967296.0f; inputValues[i++] = MAKE_HEX_FLOAT(0x1.0p63f, 1, 63); inputValues[i++] = MAKE_HEX_FLOAT(0x1.0p64f, 1, 64); inputValues[i++] = -0.0f; inputValues[i++] = -0.5f; inputValues[i++] = -31.5f; inputValues[i++] = -32.0f; inputValues[i++] = -127.5f; inputValues[i++] = -128.0f; inputValues[i++] = -255.5f; inputValues[i++] = -256.0f; inputValues[i++] = -1023.5f; inputValues[i++] = -1024.0f; inputValues[i++] = -32767.5f; inputValues[i++] = -32768.0f; inputValues[i++] = -65535.5f; inputValues[i++] = -65536.0f; inputValues[i++] = -2147483648.0f; inputValues[i++] = -4294967296.0f; inputValues[i++] = -MAKE_HEX_FLOAT(0x1.0p63f, 1, 63); inputValues[i++] = -MAKE_HEX_FLOAT(0x1.0p64f, 1, 64); } if (is_format_signed(imageInfo->format)) { for (; i < numPixels * 4; i++) inputValues[i] = get_random_float(-1.1f, 1.1f, d); } else { for (; i < numPixels * 4; i++) inputValues[i] = get_random_float(-0.1f, 1.1f, d); } break; } break; } case kInt: { int *imageData = (int *)data; // We want to generate ints (mostly) in range of the target format int formatMin = get_format_min_int(imageInfo->format); size_t formatMax = get_format_max_int(imageInfo->format); if (formatMin == 0) { // Unsigned values, but we are only an int, so cap the actual // max at the max of signed ints if (formatMax > 2147483647L) formatMax = 2147483647L; } // If the final format is small enough, give us a bit of room for // out-of-range values to test if (formatMax < 2147483647L) formatMax += 2; if (formatMin > -2147483648LL) formatMin -= 2; // Now gen for (size_t i = 0; i < numPixels * 4; i++) { imageData[i] = random_in_range(formatMin, (int)formatMax, d); } break; } case kUInt: case kUnsignedInt: { unsigned int *imageData = (unsigned int *)data; // We want to generate ints (mostly) in range of the target format int formatMin = get_format_min_int(imageInfo->format); size_t formatMax = get_format_max_int(imageInfo->format); if (formatMin < 0) formatMin = 0; // If the final format is small enough, give us a bit of room for // out-of-range values to test if (formatMax < 4294967295LL) formatMax += 2; // Now gen for (size_t i = 0; i < numPixels * 4; i++) { imageData[i] = random_in_range(formatMin, (int)formatMax, d); } break; } default: // Unsupported source format delete[] data; return NULL; } return data; } /* deprecated bool clamp_image_coord( image_sampler_data *imageSampler, float value, size_t max, int &outValue ) { int v = (int)value; switch(imageSampler->addressing_mode) { case CL_ADDRESS_REPEAT: outValue = v; while( v < 0 ) v += (int)max; while( v >= (int)max ) v -= (int)max; if( v != outValue ) { outValue = v; return true; } return false; case CL_ADDRESS_MIRRORED_REPEAT: log_info( "ERROR: unimplemented for CL_ADDRESS_MIRRORED_REPEAT. Do we ever use this? exit(-1); default: if( v < 0 ) { outValue = 0; return true; } if( v >= (int)max ) { outValue = (int)max - 1; return true; } outValue = v; return false; } } */ void get_sampler_kernel_code(image_sampler_data *imageSampler, char *outLine) { const char *normalized; const char *addressMode; const char *filterMode; if (imageSampler->addressing_mode == CL_ADDRESS_CLAMP) addressMode = "CLK_ADDRESS_CLAMP"; else if (imageSampler->addressing_mode == CL_ADDRESS_CLAMP_TO_EDGE) addressMode = "CLK_ADDRESS_CLAMP_TO_EDGE"; else if (imageSampler->addressing_mode == CL_ADDRESS_REPEAT) addressMode = "CLK_ADDRESS_REPEAT"; else if (imageSampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT) addressMode = "CLK_ADDRESS_MIRRORED_REPEAT"; else if (imageSampler->addressing_mode == CL_ADDRESS_NONE) addressMode = "CLK_ADDRESS_NONE"; else { log_error("**Error: Unknown addressing mode! Aborting...\n"); abort(); } if (imageSampler->normalized_coords) normalized = "CLK_NORMALIZED_COORDS_TRUE"; else normalized = "CLK_NORMALIZED_COORDS_FALSE"; if (imageSampler->filter_mode == CL_FILTER_LINEAR) filterMode = "CLK_FILTER_LINEAR"; else filterMode = "CLK_FILTER_NEAREST"; sprintf(outLine, " const sampler_t imageSampler = %s | %s | %s;\n", addressMode, filterMode, normalized); } void copy_image_data(image_descriptor *srcImageInfo, image_descriptor *dstImageInfo, void *imageValues, void *destImageValues, const size_t sourcePos[], const size_t destPos[], const size_t regionSize[]) { // assert( srcImageInfo->format == dstImageInfo->format ); size_t src_mip_level_offset = 0, dst_mip_level_offset = 0; size_t sourcePos_lod[3], destPos_lod[3], src_lod, dst_lod; size_t src_row_pitch_lod, src_slice_pitch_lod; size_t dst_row_pitch_lod, dst_slice_pitch_lod; size_t pixelSize = get_pixel_size(srcImageInfo->format); sourcePos_lod[0] = sourcePos[0]; sourcePos_lod[1] = sourcePos[1]; sourcePos_lod[2] = sourcePos[2]; destPos_lod[0] = destPos[0]; destPos_lod[1] = destPos[1]; destPos_lod[2] = destPos[2]; src_row_pitch_lod = srcImageInfo->rowPitch; dst_row_pitch_lod = dstImageInfo->rowPitch; src_slice_pitch_lod = srcImageInfo->slicePitch; dst_slice_pitch_lod = dstImageInfo->slicePitch; if (srcImageInfo->num_mip_levels > 1) { size_t src_width_lod = 1 /*srcImageInfo->width*/; size_t src_height_lod = 1 /*srcImageInfo->height*/; size_t src_depth_lod = 1 /*srcImageInfo->depth*/; switch (srcImageInfo->type) { case CL_MEM_OBJECT_IMAGE1D: src_lod = sourcePos[1]; sourcePos_lod[1] = sourcePos_lod[2] = 0; src_width_lod = (srcImageInfo->width >> src_lod) ? (srcImageInfo->width >> src_lod) : 1; break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: case CL_MEM_OBJECT_IMAGE2D: src_lod = sourcePos[2]; sourcePos_lod[1] = sourcePos[1]; sourcePos_lod[2] = 0; src_width_lod = (srcImageInfo->width >> src_lod) ? (srcImageInfo->width >> src_lod) : 1; if (srcImageInfo->type == CL_MEM_OBJECT_IMAGE2D) src_height_lod = (srcImageInfo->height >> src_lod) ? (srcImageInfo->height >> src_lod) : 1; break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: case CL_MEM_OBJECT_IMAGE3D: src_lod = sourcePos[3]; sourcePos_lod[1] = sourcePos[1]; sourcePos_lod[2] = sourcePos[2]; src_width_lod = (srcImageInfo->width >> src_lod) ? (srcImageInfo->width >> src_lod) : 1; src_height_lod = (srcImageInfo->height >> src_lod) ? (srcImageInfo->height >> src_lod) : 1; if (srcImageInfo->type == CL_MEM_OBJECT_IMAGE3D) src_depth_lod = (srcImageInfo->depth >> src_lod) ? (srcImageInfo->depth >> src_lod) : 1; break; } src_mip_level_offset = compute_mip_level_offset(srcImageInfo, src_lod); src_row_pitch_lod = src_width_lod * get_pixel_size(srcImageInfo->format); src_slice_pitch_lod = src_row_pitch_lod * src_height_lod; } if (dstImageInfo->num_mip_levels > 1) { size_t dst_width_lod = 1 /*dstImageInfo->width*/; size_t dst_height_lod = 1 /*dstImageInfo->height*/; size_t dst_depth_lod = 1 /*dstImageInfo->depth*/; switch (dstImageInfo->type) { case CL_MEM_OBJECT_IMAGE1D: dst_lod = destPos[1]; destPos_lod[1] = destPos_lod[2] = 0; dst_width_lod = (dstImageInfo->width >> dst_lod) ? (dstImageInfo->width >> dst_lod) : 1; break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: case CL_MEM_OBJECT_IMAGE2D: dst_lod = destPos[2]; destPos_lod[1] = destPos[1]; destPos_lod[2] = 0; dst_width_lod = (dstImageInfo->width >> dst_lod) ? (dstImageInfo->width >> dst_lod) : 1; if (dstImageInfo->type == CL_MEM_OBJECT_IMAGE2D) dst_height_lod = (dstImageInfo->height >> dst_lod) ? (dstImageInfo->height >> dst_lod) : 1; break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: case CL_MEM_OBJECT_IMAGE3D: dst_lod = destPos[3]; destPos_lod[1] = destPos[1]; destPos_lod[2] = destPos[2]; dst_width_lod = (dstImageInfo->width >> dst_lod) ? (dstImageInfo->width >> dst_lod) : 1; dst_height_lod = (dstImageInfo->height >> dst_lod) ? (dstImageInfo->height >> dst_lod) : 1; if (dstImageInfo->type == CL_MEM_OBJECT_IMAGE3D) dst_depth_lod = (dstImageInfo->depth >> dst_lod) ? (dstImageInfo->depth >> dst_lod) : 1; break; } dst_mip_level_offset = compute_mip_level_offset(dstImageInfo, dst_lod); dst_row_pitch_lod = dst_width_lod * get_pixel_size(dstImageInfo->format); dst_slice_pitch_lod = dst_row_pitch_lod * dst_height_lod; } // Get initial pointers char *sourcePtr = (char *)imageValues + sourcePos_lod[2] * src_slice_pitch_lod + sourcePos_lod[1] * src_row_pitch_lod + pixelSize * sourcePos_lod[0] + src_mip_level_offset; char *destPtr = (char *)destImageValues + destPos_lod[2] * dst_slice_pitch_lod + destPos_lod[1] * dst_row_pitch_lod + pixelSize * destPos_lod[0] + dst_mip_level_offset; for (size_t z = 0; z < (regionSize[2] > 0 ? regionSize[2] : 1); z++) { char *rowSourcePtr = sourcePtr; char *rowDestPtr = destPtr; for (size_t y = 0; y < regionSize[1]; y++) { memcpy(rowDestPtr, rowSourcePtr, pixelSize * regionSize[0]); rowSourcePtr += src_row_pitch_lod; rowDestPtr += dst_row_pitch_lod; } sourcePtr += src_slice_pitch_lod; destPtr += dst_slice_pitch_lod; } } float random_float(float low, float high, MTdata d) { float t = (float)genrand_real1(d); return (1.0f - t) * low + t * high; } CoordWalker::CoordWalker(void *coords, bool useFloats, size_t vecSize) { if (useFloats) { mFloatCoords = (cl_float *)coords; mIntCoords = NULL; } else { mFloatCoords = NULL; mIntCoords = (cl_int *)coords; } mVecSize = vecSize; } CoordWalker::~CoordWalker() {} cl_float CoordWalker::Get(size_t idx, size_t el) { if (mIntCoords != NULL) return (cl_float)mIntCoords[idx * mVecSize + el]; else return mFloatCoords[idx * mVecSize + el]; } void print_read_header(const cl_image_format *format, image_sampler_data *sampler, bool err, int t) { const char *addressMode = NULL; const char *normalizedNames[2] = { "UNNORMALIZED", "NORMALIZED" }; if (sampler->addressing_mode == CL_ADDRESS_CLAMP) addressMode = "CL_ADDRESS_CLAMP"; else if (sampler->addressing_mode == CL_ADDRESS_CLAMP_TO_EDGE) addressMode = "CL_ADDRESS_CLAMP_TO_EDGE"; else if (sampler->addressing_mode == CL_ADDRESS_REPEAT) addressMode = "CL_ADDRESS_REPEAT"; else if (sampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT) addressMode = "CL_ADDRESS_MIRRORED_REPEAT"; else addressMode = "CL_ADDRESS_NONE"; if (t) { if (err) log_error("[%-7s %-24s %d] - %s - %s - %s - %s\n", GetChannelOrderName(format->image_channel_order), GetChannelTypeName(format->image_channel_data_type), (int)get_format_channel_count(format), sampler->filter_mode == CL_FILTER_NEAREST ? "CL_FILTER_NEAREST" : "CL_FILTER_LINEAR", addressMode, normalizedNames[sampler->normalized_coords ? 1 : 0], t == 1 ? "TRANSPOSED" : "NON-TRANSPOSED"); else log_info("[%-7s %-24s %d] - %s - %s - %s - %s\n", GetChannelOrderName(format->image_channel_order), GetChannelTypeName(format->image_channel_data_type), (int)get_format_channel_count(format), sampler->filter_mode == CL_FILTER_NEAREST ? "CL_FILTER_NEAREST" : "CL_FILTER_LINEAR", addressMode, normalizedNames[sampler->normalized_coords ? 1 : 0], t == 1 ? "TRANSPOSED" : "NON-TRANSPOSED"); } else { if (err) log_error("[%-7s %-24s %d] - %s - %s - %s\n", GetChannelOrderName(format->image_channel_order), GetChannelTypeName(format->image_channel_data_type), (int)get_format_channel_count(format), sampler->filter_mode == CL_FILTER_NEAREST ? "CL_FILTER_NEAREST" : "CL_FILTER_LINEAR", addressMode, normalizedNames[sampler->normalized_coords ? 1 : 0]); else log_info("[%-7s %-24s %d] - %s - %s - %s\n", GetChannelOrderName(format->image_channel_order), GetChannelTypeName(format->image_channel_data_type), (int)get_format_channel_count(format), sampler->filter_mode == CL_FILTER_NEAREST ? "CL_FILTER_NEAREST" : "CL_FILTER_LINEAR", addressMode, normalizedNames[sampler->normalized_coords ? 1 : 0]); } } void print_write_header(const cl_image_format *format, bool err = false) { if (err) log_error("[%-7s %-24s %d]\n", GetChannelOrderName(format->image_channel_order), GetChannelTypeName(format->image_channel_data_type), (int)get_format_channel_count(format)); else log_info("[%-7s %-24s %d]\n", GetChannelOrderName(format->image_channel_order), GetChannelTypeName(format->image_channel_data_type), (int)get_format_channel_count(format)); } void print_header(const cl_image_format *format, bool err = false) { if (err) { log_error("[%-7s %-24s %d]\n", GetChannelOrderName(format->image_channel_order), GetChannelTypeName(format->image_channel_data_type), (int)get_format_channel_count(format)); } else { log_info("[%-7s %-24s %d]\n", GetChannelOrderName(format->image_channel_order), GetChannelTypeName(format->image_channel_data_type), (int)get_format_channel_count(format)); } } bool find_format(cl_image_format *formatList, unsigned int numFormats, cl_image_format *formatToFind) { for (unsigned int i = 0; i < numFormats; i++) { if (formatList[i].image_channel_order == formatToFind->image_channel_order && formatList[i].image_channel_data_type == formatToFind->image_channel_data_type) return true; } return false; } void build_required_image_formats( cl_mem_flags flags, cl_mem_object_type image_type, cl_device_id device, std::vector &formatsToSupport) { formatsToSupport.clear(); // Minimum list of supported image formats for reading or writing (embedded // profile) static std::vector embeddedProfile_readOrWrite{ // clang-format off { CL_RGBA, CL_UNORM_INT8 }, { CL_RGBA, CL_UNORM_INT16 }, { CL_RGBA, CL_SIGNED_INT8 }, { CL_RGBA, CL_SIGNED_INT16 }, { CL_RGBA, CL_SIGNED_INT32 }, { CL_RGBA, CL_UNSIGNED_INT8 }, { CL_RGBA, CL_UNSIGNED_INT16 }, { CL_RGBA, CL_UNSIGNED_INT32 }, { CL_RGBA, CL_HALF_FLOAT }, { CL_RGBA, CL_FLOAT }, // clang-format on }; // Minimum list of required image formats for reading or writing // num_channels, for all image types. static std::vector fullProfile_readOrWrite{ // clang-format off { CL_RGBA, CL_UNORM_INT8 }, { CL_RGBA, CL_UNORM_INT16 }, { CL_RGBA, CL_SIGNED_INT8 }, { CL_RGBA, CL_SIGNED_INT16 }, { CL_RGBA, CL_SIGNED_INT32 }, { CL_RGBA, CL_UNSIGNED_INT8 }, { CL_RGBA, CL_UNSIGNED_INT16 }, { CL_RGBA, CL_UNSIGNED_INT32 }, { CL_RGBA, CL_HALF_FLOAT }, { CL_RGBA, CL_FLOAT }, { CL_BGRA, CL_UNORM_INT8 }, // clang-format on }; // Minimum list of supported image formats for reading or writing // (OpenCL 2.0, 2.1, or 2.2), for all image types. static std::vector fullProfile_2x_readOrWrite{ // clang-format off { CL_R, CL_UNORM_INT8 }, { CL_R, CL_UNORM_INT16 }, { CL_R, CL_SNORM_INT8 }, { CL_R, CL_SNORM_INT16 }, { CL_R, CL_SIGNED_INT8 }, { CL_R, CL_SIGNED_INT16 }, { CL_R, CL_SIGNED_INT32 }, { CL_R, CL_UNSIGNED_INT8 }, { CL_R, CL_UNSIGNED_INT16 }, { CL_R, CL_UNSIGNED_INT32 }, { CL_R, CL_HALF_FLOAT }, { CL_R, CL_FLOAT }, { CL_RG, CL_UNORM_INT8 }, { CL_RG, CL_UNORM_INT16 }, { CL_RG, CL_SNORM_INT8 }, { CL_RG, CL_SNORM_INT16 }, { CL_RG, CL_SIGNED_INT8 }, { CL_RG, CL_SIGNED_INT16 }, { CL_RG, CL_SIGNED_INT32 }, { CL_RG, CL_UNSIGNED_INT8 }, { CL_RG, CL_UNSIGNED_INT16 }, { CL_RG, CL_UNSIGNED_INT32 }, { CL_RG, CL_HALF_FLOAT }, { CL_RG, CL_FLOAT }, { CL_RGBA, CL_UNORM_INT8 }, { CL_RGBA, CL_UNORM_INT16 }, { CL_RGBA, CL_SNORM_INT8 }, { CL_RGBA, CL_SNORM_INT16 }, { CL_RGBA, CL_SIGNED_INT8 }, { CL_RGBA, CL_SIGNED_INT16 }, { CL_RGBA, CL_SIGNED_INT32 }, { CL_RGBA, CL_UNSIGNED_INT8 }, { CL_RGBA, CL_UNSIGNED_INT16 }, { CL_RGBA, CL_UNSIGNED_INT32 }, { CL_RGBA, CL_HALF_FLOAT }, { CL_RGBA, CL_FLOAT }, { CL_BGRA, CL_UNORM_INT8 }, // clang-format on }; // Conditional addition to the 2x readOrWrite table: // Support for the CL_DEPTH image channel order is required only for 2D // images and 2D image arrays. static std::vector fullProfile_2x_readOrWrite_Depth{ // clang-format off { CL_DEPTH, CL_UNORM_INT16 }, { CL_DEPTH, CL_FLOAT }, // clang-format on }; // Conditional addition to the 2x readOrWrite table: // Support for reading from the CL_sRGBA image channel order is optional for // 1D image buffers. Support for writing to the CL_sRGBA image channel order // is optional for all image types. static std::vector fullProfile_2x_readOrWrite_srgb{ { CL_sRGBA, CL_UNORM_INT8 }, }; // Minimum list of required image formats for reading and writing. static std::vector fullProfile_readAndWrite{ // clang-format off { CL_R, CL_UNORM_INT8 }, { CL_R, CL_SIGNED_INT8 }, { CL_R, CL_SIGNED_INT16 }, { CL_R, CL_SIGNED_INT32 }, { CL_R, CL_UNSIGNED_INT8 }, { CL_R, CL_UNSIGNED_INT16 }, { CL_R, CL_UNSIGNED_INT32 }, { CL_R, CL_HALF_FLOAT }, { CL_R, CL_FLOAT }, { CL_RGBA, CL_UNORM_INT8 }, { CL_RGBA, CL_SIGNED_INT8 }, { CL_RGBA, CL_SIGNED_INT16 }, { CL_RGBA, CL_SIGNED_INT32 }, { CL_RGBA, CL_UNSIGNED_INT8 }, { CL_RGBA, CL_UNSIGNED_INT16 }, { CL_RGBA, CL_UNSIGNED_INT32 }, { CL_RGBA, CL_HALF_FLOAT }, { CL_RGBA, CL_FLOAT }, // clang-format on }; // Embedded profile if (gIsEmbedded) { copy(embeddedProfile_readOrWrite.begin(), embeddedProfile_readOrWrite.end(), back_inserter(formatsToSupport)); } // Full profile else { Version version = get_device_cl_version(device); if (version < Version(2, 0) || version >= Version(3, 0)) { // Full profile, OpenCL 1.2 or 3.0. if (flags & CL_MEM_KERNEL_READ_AND_WRITE) { // Note: assumes that read-write images are supported! copy(fullProfile_readAndWrite.begin(), fullProfile_readAndWrite.end(), back_inserter(formatsToSupport)); } else { copy(fullProfile_readOrWrite.begin(), fullProfile_readOrWrite.end(), back_inserter(formatsToSupport)); } } else { // Full profile, OpenCL 2.0, 2.1, 2.2. if (flags & CL_MEM_KERNEL_READ_AND_WRITE) { copy(fullProfile_readAndWrite.begin(), fullProfile_readAndWrite.end(), back_inserter(formatsToSupport)); } else { copy(fullProfile_2x_readOrWrite.begin(), fullProfile_2x_readOrWrite.end(), back_inserter(formatsToSupport)); // Support for the CL_DEPTH image channel order is required only // for 2D images and 2D image arrays. if (image_type == CL_MEM_OBJECT_IMAGE2D || image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) { copy(fullProfile_2x_readOrWrite_Depth.begin(), fullProfile_2x_readOrWrite_Depth.end(), back_inserter(formatsToSupport)); } // Support for reading from the CL_sRGBA image channel order is // optional for 1D image buffers. Support for writing to the // CL_sRGBA image channel order is optional for all image types. if (image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER && flags == CL_MEM_READ_ONLY) { copy(fullProfile_2x_readOrWrite_srgb.begin(), fullProfile_2x_readOrWrite_srgb.end(), back_inserter(formatsToSupport)); } } } } } bool is_image_format_required(cl_image_format format, cl_mem_flags flags, cl_mem_object_type image_type, cl_device_id device) { std::vector formatsToSupport; build_required_image_formats(flags, image_type, device, formatsToSupport); for (auto &formatItr : formatsToSupport) { if (formatItr.image_channel_order == format.image_channel_order && formatItr.image_channel_data_type == format.image_channel_data_type) { return true; } } return false; } cl_uint compute_max_mip_levels(size_t width, size_t height, size_t depth) { cl_uint retMaxMipLevels = 0; size_t max_dim = 0; max_dim = width; max_dim = height > max_dim ? height : max_dim; max_dim = depth > max_dim ? depth : max_dim; while (max_dim) { retMaxMipLevels++; max_dim >>= 1; } return retMaxMipLevels; } cl_ulong compute_mipmapped_image_size(image_descriptor imageInfo) { cl_ulong retSize = 0; size_t curr_width, curr_height, curr_depth, curr_array_size; curr_width = imageInfo.width; curr_height = imageInfo.height; curr_depth = imageInfo.depth; curr_array_size = imageInfo.arraySize; for (int i = 0; i < (int)imageInfo.num_mip_levels; i++) { switch (imageInfo.type) { case CL_MEM_OBJECT_IMAGE3D: retSize += (cl_ulong)curr_width * curr_height * curr_depth * get_pixel_size(imageInfo.format); break; case CL_MEM_OBJECT_IMAGE2D: retSize += (cl_ulong)curr_width * curr_height * get_pixel_size(imageInfo.format); break; case CL_MEM_OBJECT_IMAGE1D: retSize += (cl_ulong)curr_width * get_pixel_size(imageInfo.format); break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: retSize += (cl_ulong)curr_width * curr_array_size * get_pixel_size(imageInfo.format); break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: retSize += (cl_ulong)curr_width * curr_height * curr_array_size * get_pixel_size(imageInfo.format); break; } switch (imageInfo.type) { case CL_MEM_OBJECT_IMAGE3D: curr_depth = curr_depth >> 1 ? curr_depth >> 1 : 1; case CL_MEM_OBJECT_IMAGE2D: case CL_MEM_OBJECT_IMAGE2D_ARRAY: curr_height = curr_height >> 1 ? curr_height >> 1 : 1; case CL_MEM_OBJECT_IMAGE1D: case CL_MEM_OBJECT_IMAGE1D_ARRAY: curr_width = curr_width >> 1 ? curr_width >> 1 : 1; } } return retSize; } size_t compute_mip_level_offset(image_descriptor *imageInfo, size_t lod) { size_t retOffset = 0; size_t width, height, depth; width = imageInfo->width; height = imageInfo->height; depth = imageInfo->depth; for (size_t i = 0; i < lod; i++) { switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE2D_ARRAY: retOffset += (size_t)width * height * imageInfo->arraySize * get_pixel_size(imageInfo->format); break; case CL_MEM_OBJECT_IMAGE3D: retOffset += (size_t)width * height * depth * get_pixel_size(imageInfo->format); break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: retOffset += (size_t)width * imageInfo->arraySize * get_pixel_size(imageInfo->format); break; case CL_MEM_OBJECT_IMAGE2D: retOffset += (size_t)width * height * get_pixel_size(imageInfo->format); break; case CL_MEM_OBJECT_IMAGE1D: retOffset += (size_t)width * get_pixel_size(imageInfo->format); break; } // Compute next lod dimensions switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE3D: depth = (depth >> 1) ? (depth >> 1) : 1; case CL_MEM_OBJECT_IMAGE2D: case CL_MEM_OBJECT_IMAGE2D_ARRAY: height = (height >> 1) ? (height >> 1) : 1; case CL_MEM_OBJECT_IMAGE1D_ARRAY: case CL_MEM_OBJECT_IMAGE1D: width = (width >> 1) ? (width >> 1) : 1; } } return retOffset; } const char *convert_image_type_to_string(cl_mem_object_type image_type) { switch (image_type) { case CL_MEM_OBJECT_IMAGE1D: return "1D"; case CL_MEM_OBJECT_IMAGE2D: return "2D"; case CL_MEM_OBJECT_IMAGE3D: return "3D"; case CL_MEM_OBJECT_IMAGE1D_ARRAY: return "1D array"; case CL_MEM_OBJECT_IMAGE2D_ARRAY: return "2D array"; case CL_MEM_OBJECT_IMAGE1D_BUFFER: return "1D image buffer"; default: return "unrecognized object type"; } }