OpenCL-CTS/test_common/harness/imageHelpers.cpp

//
// Copyright (c) 2017,2021 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "imageHelpers.h"
#include <limits.h>
#include <assert.h>
#if defined(__APPLE__)
#include <sys/mman.h>
#endif
#if !defined(_WIN32) && !defined(__APPLE__)
#include <malloc.h>
#endif
#include <algorithm>
#include <cinttypes>
#include <iterator>
#if !defined(_WIN32)
#include <cmath>
#endif
#ifdef __SSE2__
#include <x86intrin.h>
#if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 12)                \
    || (defined(__clang__) && !defined(__apple_build_version__)                \
        && __clang_major__ < 8)
// Add missing intrinsics that aren't in ancient compilers, but are used below
#ifdef __clang__
#define NODEBUG nodebug
#else
#define NODEBUG artificial
#endif
static inline __attribute__((always_inline, NODEBUG)) __m128i
_mm_loadu_si16(const void *p)
{
    struct _loadu_si16
    {
        short v;
    } __attribute__((packed, may_alias));
    short u = ((const struct _loadu_si16 *)p)->v;
    return _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, u);
}

static inline __attribute__((always_inline, NODEBUG)) __m128i
_mm_loadu_si32(const void *p)
{
    struct _loadu_si32
    {
        int v;
    } __attribute__((packed, may_alias));
    int u = ((const struct _loadu_si32 *)p)->v;
    return _mm_set_epi32(0, 0, 0, u);
}
#undef NODEBUG
#endif
#endif

RoundingMode gFloatToHalfRoundingMode = kDefaultRoundingMode;

cl_device_type gDeviceType = CL_DEVICE_TYPE_DEFAULT;
bool gTestRounding = false;
double sRGBmap(float fc)
{
    double c = (double)fc;

#if !defined(_WIN32)
    if (std::isnan(c)) c = 0.0;
#else
    if (_isnan(c)) c = 0.0;
#endif

    if (c > 1.0)
        c = 1.0;
    else if (c < 0.0)
        c = 0.0;
    else if (c < 0.0031308)
        c = 12.92 * c;
    else
        c = (1055.0 / 1000.0) * pow(c, 5.0 / 12.0) - (55.0 / 1000.0);

    return c * 255.0;
}

double sRGBunmap(float fc)
{
    double c = (double)fc;
    double result;

    if (c <= 0.04045)
        result = c / 12.92;
    else
        result = pow((c + 0.055) / 1.055, 2.4);

    return result;
}

// Precalculated table of linear encodings of sRGB values
float gSRGBTbl[] = {
    0x0.000000p+00f, 0x1.3e4569p-12f, 0x1.3e4569p-11f, 0x1.dd681cp-11f,
    0x1.3e4569p-10f, 0x1.8dd6c2p-10f, 0x1.dd681cp-10f, 0x1.167cbbp-09f,
    0x1.3e4569p-09f, 0x1.660e15p-09f, 0x1.8dd6c2p-09f, 0x1.b6a31cp-09f,
    0x1.e1e31ep-09f, 0x1.07c38cp-08f, 0x1.1fcc2cp-08f, 0x1.390ffbp-08f,
    0x1.53936ep-08f, 0x1.6f5adfp-08f, 0x1.8c6a96p-08f, 0x1.aac6c3p-08f,
    0x1.ca7383p-08f, 0x1.eb74e3p-08f, 0x1.06e76cp-07f, 0x1.18c2a6p-07f,
    0x1.2b4e0ap-07f, 0x1.3e8b7cp-07f, 0x1.527cd7p-07f, 0x1.6723efp-07f,
    0x1.7c8293p-07f, 0x1.929a89p-07f, 0x1.a96d92p-07f, 0x1.c0fd67p-07f,
    0x1.d94bc1p-07f, 0x1.f25a47p-07f, 0x1.061553p-06f, 0x1.135f40p-06f,
    0x1.210bbap-06f, 0x1.2f1b8ep-06f, 0x1.3d8f85p-06f, 0x1.4c6868p-06f,
    0x1.5ba6fcp-06f, 0x1.6b4c05p-06f, 0x1.7b5843p-06f, 0x1.8bcc76p-06f,
    0x1.9ca95ap-06f, 0x1.adefabp-06f, 0x1.bfa021p-06f, 0x1.d1bb75p-06f,
    0x1.e4425ap-06f, 0x1.f73586p-06f, 0x1.054ad5p-05f, 0x1.0f31bbp-05f,
    0x1.194fccp-05f, 0x1.23a55fp-05f, 0x1.2e32c9p-05f, 0x1.38f860p-05f,
    0x1.43f678p-05f, 0x1.4f2d63p-05f, 0x1.5a9d76p-05f, 0x1.664701p-05f,
    0x1.722a57p-05f, 0x1.7e47c8p-05f, 0x1.8a9fa3p-05f, 0x1.973239p-05f,
    0x1.a3ffdcp-05f, 0x1.b108d3p-05f, 0x1.be4d6fp-05f, 0x1.cbcdfdp-05f,
    0x1.d98ac9p-05f, 0x1.e78420p-05f, 0x1.f5ba4cp-05f, 0x1.0216ccp-04f,
    0x1.096f28p-04f, 0x1.10e65ep-04f, 0x1.187c92p-04f, 0x1.2031eap-04f,
    0x1.280689p-04f, 0x1.2ffa93p-04f, 0x1.380e2bp-04f, 0x1.404176p-04f,
    0x1.489496p-04f, 0x1.5107aep-04f, 0x1.599ae0p-04f, 0x1.624e50p-04f,
    0x1.6b2220p-04f, 0x1.741672p-04f, 0x1.7d2b67p-04f, 0x1.866121p-04f,
    0x1.8fb7c1p-04f, 0x1.992f6ap-04f, 0x1.a2c83cp-04f, 0x1.ac8257p-04f,
    0x1.b65dddp-04f, 0x1.c05aeep-04f, 0x1.ca79aap-04f, 0x1.d4ba31p-04f,
    0x1.df1ca4p-04f, 0x1.e9a122p-04f, 0x1.f447cap-04f, 0x1.ff10bdp-04f,
    0x1.04fe0dp-03f, 0x1.0a84ffp-03f, 0x1.101d45p-03f, 0x1.15c6eep-03f,
    0x1.1b8209p-03f, 0x1.214ea6p-03f, 0x1.272cd4p-03f, 0x1.2d1ca2p-03f,
    0x1.331e1fp-03f, 0x1.393159p-03f, 0x1.3f5660p-03f, 0x1.458d43p-03f,
    0x1.4bd60fp-03f, 0x1.5230d4p-03f, 0x1.589da1p-03f, 0x1.5f1c83p-03f,
    0x1.65ad8ap-03f, 0x1.6c50c3p-03f, 0x1.73063ep-03f, 0x1.79ce07p-03f,
    0x1.80a82ep-03f, 0x1.8794c0p-03f, 0x1.8e93cbp-03f, 0x1.95a55ep-03f,
    0x1.9cc987p-03f, 0x1.a40052p-03f, 0x1.ab49cfp-03f, 0x1.b2a60ap-03f,
    0x1.ba1516p-03f, 0x1.c196f7p-03f, 0x1.c92bc1p-03f, 0x1.d0d380p-03f,
    0x1.d88e41p-03f, 0x1.e05c12p-03f, 0x1.e83d00p-03f, 0x1.f0311ap-03f,
    0x1.f8386ap-03f, 0x1.002980p-02f, 0x1.044074p-02f, 0x1.086118p-02f,
    0x1.0c8b72p-02f, 0x1.10bf88p-02f, 0x1.14fd61p-02f, 0x1.194504p-02f,
    0x1.1d9677p-02f, 0x1.21f1c0p-02f, 0x1.2656e6p-02f, 0x1.2ac5efp-02f,
    0x1.2f3ee1p-02f, 0x1.33c1c3p-02f, 0x1.384e9ap-02f, 0x1.3ce56ep-02f,
    0x1.418644p-02f, 0x1.463122p-02f, 0x1.4ae610p-02f, 0x1.4fa512p-02f,
    0x1.546e2fp-02f, 0x1.59416dp-02f, 0x1.5e1ed2p-02f, 0x1.630665p-02f,
    0x1.67f82bp-02f, 0x1.6cf42bp-02f, 0x1.71fa69p-02f, 0x1.770aedp-02f,
    0x1.7c25bdp-02f, 0x1.814addp-02f, 0x1.867a55p-02f, 0x1.8bb42ap-02f,
    0x1.90f862p-02f, 0x1.964703p-02f, 0x1.9ba012p-02f, 0x1.a10396p-02f,
    0x1.a67194p-02f, 0x1.abea12p-02f, 0x1.b16d16p-02f, 0x1.b6faa6p-02f,
    0x1.bc92c7p-02f, 0x1.c2357fp-02f, 0x1.c7e2d4p-02f, 0x1.cd9acbp-02f,
    0x1.d35d6bp-02f, 0x1.d92ab8p-02f, 0x1.df02b9p-02f, 0x1.e4e572p-02f,
    0x1.ead2ebp-02f, 0x1.f0cb27p-02f, 0x1.f6ce2dp-02f, 0x1.fcdc02p-02f,
    0x1.017a56p-01f, 0x1.048c18p-01f, 0x1.07a34bp-01f, 0x1.0abfefp-01f,
    0x1.0de209p-01f, 0x1.11099cp-01f, 0x1.1436a9p-01f, 0x1.176933p-01f,
    0x1.1aa13ep-01f, 0x1.1ddecbp-01f, 0x1.2121dfp-01f, 0x1.246a7ap-01f,
    0x1.27b8a0p-01f, 0x1.2b0c54p-01f, 0x1.2e6598p-01f, 0x1.31c46fp-01f,
    0x1.3528dcp-01f, 0x1.3892e1p-01f, 0x1.3c0280p-01f, 0x1.3f77bdp-01f,
    0x1.42f29ap-01f, 0x1.46731ap-01f, 0x1.49f93ep-01f, 0x1.4d850bp-01f,
    0x1.511682p-01f, 0x1.54ada5p-01f, 0x1.584a79p-01f, 0x1.5becfep-01f,
    0x1.5f9538p-01f, 0x1.634329p-01f, 0x1.66f6d4p-01f, 0x1.6ab03bp-01f,
    0x1.6e6f61p-01f, 0x1.723449p-01f, 0x1.75fef4p-01f, 0x1.79cf65p-01f,
    0x1.7da59fp-01f, 0x1.8181a5p-01f, 0x1.856378p-01f, 0x1.894b1cp-01f,
    0x1.8d3892p-01f, 0x1.912bdep-01f, 0x1.952501p-01f, 0x1.9923ffp-01f,
    0x1.9d28d9p-01f, 0x1.a13392p-01f, 0x1.a5442dp-01f, 0x1.a95aacp-01f,
    0x1.ad7711p-01f, 0x1.b1995fp-01f, 0x1.b5c198p-01f, 0x1.b9efbep-01f,
    0x1.be23d5p-01f, 0x1.c25ddep-01f, 0x1.c69ddcp-01f, 0x1.cae3d1p-01f,
    0x1.cf2fc0p-01f, 0x1.d381abp-01f, 0x1.d7d994p-01f, 0x1.dc377ep-01f,
    0x1.e09b6bp-01f, 0x1.e5055dp-01f, 0x1.e97557p-01f, 0x1.edeb5bp-01f,
    0x1.f2676cp-01f, 0x1.f6e98bp-01f, 0x1.fb71bcp-01f, 0x1.000000p+00f,
};


float sRGBunmap(cl_uchar ic) { return gSRGBTbl[ic]; }

#ifdef __SSE2__
__m128 sRGBunmap(const __m128i ic)
{
    static const float recip_255 = 1.0f / 255.0f;
#ifdef __AVX2__
    __m128 fc = _mm_i32gather_ps(gSRGBTbl, _mm_cvtepu8_epi32(ic), 4);
    // only RGB need to be converted for sRGBA
    return _mm_insert_ps(
        fc,
        _mm_mul_ss(_mm_cvtsi32_ss(_mm_undefined_ps(), _mm_extract_epi8(ic, 3)),
                   _mm_set_ss(recip_255)),
        _MM_MK_INSERTPS_NDX(0, 3, 0));
#else
    // With no gather support, we'll need to load the four components
    // separately...
    uint32_t pixel = _mm_cvtsi128_si32(ic);
    return _mm_set_ps((float)(pixel >> 24) * recip_255,
                      sRGBunmap((cl_uchar)((pixel >> 16) & 0xFF)),
                      sRGBunmap((cl_uchar)((pixel >> 8) & 0xFF)),
                      sRGBunmap((cl_uchar)(pixel & 0xFF)));
#endif
}


#ifdef __SSE4_1__
#define SELECT_I(cond, a, b) _mm_blendv_epi8(b, a, cond)
#define TEST_ANY_ZERO(v) !_mm_test_all_ones(v)
#define TEST_NONZERO(v) !_mm_test_all_zeros(v, v)
#define TEST_ZERO(v) _mm_test_all_zeros(v, v)
#elif defined(__SSE2__)
// n.b. "ANDNOT" is ~A & B, not A & ~B!!
#define SELECT_I(cond, a, b)                                                   \
    _mm_or_si128(_mm_and_si128(cond, a), _mm_andnot_si128(cond, b))
#ifdef __x86_64
#define TEST_NONZERO(v)                                                        \
    (_mm_cvtsi128_si64(v)                                                      \
     || _mm_cvtsi128_si64(_mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3))))
#else
#define TEST_NONZERO(v)                                                        \
    (_mm_cvtsi128_si32(v)                                                      \
     || _mm_cvtsi128_si32(                                                     \
         _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, 1))                         \
         || _mm_cvtsi128_si32(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, 2))))  \
     || _mm_cvtsi128_si32(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, 3))))
#endif
#define TEST_ZERO(v) (!TEST_NONZERO(v))
// The int64 extraction trick won't work here... :/
#define TEST_ANY_ZERO(v)                                                       \
    (!_mm_cvtsi128_si32(v)                                                     \
     || !_mm_cvtsi128_si32(                                                    \
         _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, 1))                         \
         || !_mm_cvtsi128_si32(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, 2)))) \
     || !_mm_cvtsi128_si32(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, 3))))
#endif

#ifdef __GNUC__
#define __forceinline __attribute__((always_inline))
#endif

static inline __m128i __forceinline _mm_setmone_si128(void)
{
    __m128i junk = _mm_undefined_si128();
    return _mm_cmpeq_epi32(junk, junk);
}

static inline __m128 cl_half_to_float(__m128i h)
{
#ifdef __F16C__
    return _mm_cvtph_ps(h);
#else
    // Type-punning to get direct access to underlying bits
    union {
        __m128 f;
        __m128i i;
    } f32;

    __m128i zero = _mm_setzero_si128();
    __m128i negOne = _mm_setmone_si128();
    __m128i one = _mm_srli_epi32(negOne, 31);
    __m128i h_exp_mask = _mm_srli_epi16(negOne, CL_HALF_MANT_DIG); // = 0x1f

    // Extract sign bit
    __m128i sign =
        _mm_slli_epi32(_mm_unpacklo_epi16(_mm_srli_epi16(h, 15), zero), 31);

    // Extract FP16 exponent and mantissa
    __m128i h_exp =
        _mm_and_si128(_mm_srli_epi16(h, CL_HALF_MANT_DIG - 1), h_exp_mask);
    __m128i h_mant = _mm_and_si128(h, _mm_srli_epi16(negOne, 6) /* 0x3ff */);

    // Remove FP16 exponent bias and convert to int32
    __m128i exp = _mm_sub_epi16(
        h_exp,
        _mm_srli_epi16(negOne, CL_HALF_MANT_DIG + 1) /* CL_HALF_MAX_EXP - 1 */);
#ifdef __SSE4_1__
    exp = _mm_cvtepi16_epi32(exp);
#else
    exp = _mm_unpacklo_epi16(exp, _mm_cmpgt_epi16(zero, exp));
#endif

    // Add FP32 exponent bias
    __m128i f_exp = _mm_add_epi32(
        exp,
        _mm_srli_epi32(negOne, CL_FLT_MANT_DIG + 1) /* CL_FLT_MAX_EXP - 1 */);

    // Convert mantissa to the 32-bit form
    __m128i f_mant = _mm_slli_epi32(_mm_unpacklo_epi16(h_mant, zero),
                                    CL_FLT_MANT_DIG - CL_HALF_MANT_DIG);

    // Note that due to the way SIMD works, we can't have branches--we have to
    // compute all the possible values.

    // Check for NaN / infinity
    __m128i inf_mask = _mm_cmpeq_epi16(h_exp, h_exp_mask);
    inf_mask = _mm_unpacklo_epi16(inf_mask, inf_mask);
    __m128i mant_zero_mask = _mm_cmpeq_epi32(f_mant, zero);
    // n.b. "ANDNOT" is ~A & B, not A & ~B!!
    __m128i nan_mask = _mm_andnot_si128(mant_zero_mask, inf_mask);

    // NaN -> propagate mantissa and silence it
    __m128i f_mant_nan =
        _mm_or_si128(f_mant, _mm_slli_epi32(one, 22) /* 0x400000 */);
    // Infinity -> zero mantissa
    f_mant = SELECT_I(nan_mask, f_mant_nan, f_mant);
    f_exp = SELECT_I(inf_mask,
                     _mm_srli_epi32(negOne, CL_FLT_MANT_DIG) /* 0xff */, f_exp);

    // Check for zero / denormal
    __m128i exp_zero_mask = _mm_cmpeq_epi16(h_exp, zero);
    exp_zero_mask = _mm_unpacklo_epi16(exp_zero_mask, exp_zero_mask);
    __m128i zero_mask = _mm_and_si128(mant_zero_mask, exp_zero_mask);
    // n.b. "ANDNOT" is ~A & B, not A & ~B!!
    __m128i denorm_mask = _mm_andnot_si128(mant_zero_mask, exp_zero_mask);

    if (TEST_NONZERO(denorm_mask))
    {
        // Denormal -> normalize it
        // - Shift mantissa to make most-significant 1 implicit
        // - Adjust exponent accordingly
        __m128i f_mant_mask =
            _mm_srli_epi32(negOne, 32 - (CL_FLT_MANT_DIG - 1));
#if defined(__AVX512VL__) && defined(__AVX512DQ__)
        // We'll probably never get here, since most CPUs that support AVX-512
        // also support F16C. n.b. No +1 yet for the implicit 1 before the radix
        // point--we really do want to shift at least one place
        __m128i shift =
            _mm_and_si128(_mm_sub_epi32(_mm_lzcnt_epi32(f_mant),
                                        _mm_set1_epi32(32 - CL_FLT_MANT_DIG)),
                          denorm_mask);
        f_mant = _mm_sllv_epi32(f_mant, shift);
        shift = _mm_sub_epi32(shift, _mm_and_si128(one, denorm_mask));
#else
        // No packed leading-zero count until AVX-512... gotta do this the hard
        // way
        __m128i shift = zero;
        __m128i shift_mask = _mm_cmpgt_epi32(
            f_mant, _mm_srli_epi32(negOne, 16) /* 0x0000FFFF */);
        __m128i f_mant_shift =
            SELECT_I(shift_mask, f_mant, _mm_slli_epi32(f_mant, 16));
        // n.b. "ANDNOT" is ~A & B, not A & ~B!!
        shift = _mm_add_epi32(
            shift,
            _mm_andnot_si128(shift_mask, _mm_slli_epi32(one, 4) /* 16 */));
        // Starting from here, we also need to check that mant >= 0, because
        // PCMPGT does a signed comparison; unsigned comparisons weren't added
        // until AVX-512, which also already has a faster way to count leading
        // zeroes anyway
        shift_mask = _mm_or_si128(
            _mm_cmplt_epi32(f_mant_shift, zero),
            _mm_cmpgt_epi32(f_mant_shift,
                            _mm_srli_epi32(negOne, 8) /* 0x00FFFFFF */));
        f_mant_shift =
            SELECT_I(shift_mask, f_mant_shift, _mm_slli_epi32(f_mant_shift, 8));
        shift = _mm_add_epi32(
            shift,
            _mm_andnot_si128(shift_mask, _mm_slli_epi32(one, 3) /* 8 */));
        shift_mask = _mm_or_si128(
            _mm_cmplt_epi32(f_mant_shift, zero),
            _mm_cmpgt_epi32(f_mant_shift,
                            _mm_srli_epi32(negOne, 4) /* 0x0FFFFFFF */));
        f_mant_shift =
            SELECT_I(shift_mask, f_mant_shift, _mm_slli_epi32(f_mant_shift, 4));
        shift = _mm_add_epi32(
            shift,
            _mm_andnot_si128(shift_mask, _mm_slli_epi32(one, 2) /* 4 */));
        shift_mask = _mm_or_si128(
            _mm_cmplt_epi32(f_mant_shift, zero),
            _mm_cmpgt_epi32(f_mant_shift,
                            _mm_srli_epi32(negOne, 2) /* 0x3FFFFFFF */));
        f_mant_shift =
            SELECT_I(shift_mask, f_mant_shift, _mm_slli_epi32(f_mant_shift, 2));
        shift = _mm_add_epi32(
            shift,
            _mm_andnot_si128(shift_mask, _mm_slli_epi32(one, 1) /* 2 */));
        shift_mask = _mm_or_si128(
            _mm_cmplt_epi32(f_mant_shift, zero),
            _mm_cmpgt_epi32(f_mant_shift,
                            _mm_srli_epi32(negOne, 1) /* 0x7FFFFFFF */));
        f_mant_shift =
            SELECT_I(shift_mask, f_mant_shift, _mm_slli_epi32(f_mant_shift, 1));
        shift = _mm_add_epi32(shift, _mm_andnot_si128(shift_mask, one));
        f_mant = SELECT_I(denorm_mask,
                          _mm_srli_epi32(f_mant_shift, 32 - CL_FLT_MANT_DIG),
                          f_mant);
        shift = _mm_and_si128(
            _mm_sub_epi32(shift, _mm_set1_epi32(32 - CL_FLT_MANT_DIG + 1)),
            denorm_mask);
#endif
        f_mant = _mm_and_si128(f_mant, f_mant_mask);
        f_exp = _mm_sub_epi32(f_exp, shift);
    }

    // Zero -> zero exponent
    // n.b. "ANDNOT" is ~A & B, not A & ~B!!
    f_exp = _mm_andnot_si128(zero_mask, f_exp);

    f32.i = _mm_or_si128(
        sign, _mm_or_si128(_mm_slli_epi32(f_exp, CL_FLT_MANT_DIG - 1), f_mant));
    return f32.f;
#endif
}
#endif


uint32_t get_format_type_size(const cl_image_format *format)
{
    return get_channel_data_type_size(format->image_channel_data_type);
}

uint32_t get_channel_data_type_size(cl_channel_type channelType)
{
    switch (channelType)
    {
        case CL_SNORM_INT8:
        case CL_UNORM_INT8:
        case CL_SIGNED_INT8:
        case CL_UNSIGNED_INT8: return 1;

        case CL_SNORM_INT16:
        case CL_UNORM_INT16:
        case CL_SIGNED_INT16:
        case CL_UNSIGNED_INT16:
        case CL_HALF_FLOAT:
#ifdef CL_SFIXED14_APPLE
        case CL_SFIXED14_APPLE:
#endif
            return sizeof(cl_short);

        case CL_SIGNED_INT32:
        case CL_UNSIGNED_INT32: return sizeof(cl_int);

        case CL_UNORM_SHORT_565:
        case CL_UNORM_SHORT_555:
#ifdef OBSOLETE_FORAMT
        case CL_UNORM_SHORT_565_REV:
        case CL_UNORM_SHORT_555_REV:
#endif
            return 2;

#ifdef OBSOLETE_FORAMT
        case CL_UNORM_INT_8888:
        case CL_UNORM_INT_8888_REV: return 4;
#endif

        case CL_UNORM_INT_101010:
#ifdef OBSOLETE_FORAMT
        case CL_UNORM_INT_101010_REV:
#endif
            return 4;

        case CL_FLOAT: return sizeof(cl_float);

        default: return 0;
    }
}

uint32_t get_format_channel_count(const cl_image_format *format)
{
    return get_channel_order_channel_count(format->image_channel_order);
}

uint32_t get_channel_order_channel_count(cl_channel_order order)
{
    switch (order)
    {
        case CL_R:
        case CL_A:
        case CL_Rx:
        case CL_INTENSITY:
        case CL_LUMINANCE:
        case CL_DEPTH:
        case CL_DEPTH_STENCIL: return 1;

        case CL_RG:
        case CL_RA:
        case CL_RGx: return 2;

        case CL_RGB:
        case CL_RGBx:
        case CL_sRGB:
        case CL_sRGBx: return 3;

        case CL_RGBA:
        case CL_ARGB:
        case CL_BGRA:
        case CL_sRGBA:
        case CL_sBGRA:
        case CL_ABGR:
#ifdef CL_1RGB_APPLE
        case CL_1RGB_APPLE:
#endif
#ifdef CL_BGR1_APPLE
        case CL_BGR1_APPLE:
#endif
#ifdef CL_ABGR_APPLE
        case CL_ABGR_APPLE:
#endif
            return 4;

        default:
            log_error("%s does not support 0x%x\n", __FUNCTION__, order);
            return 0;
    }
}

cl_channel_type get_channel_type_from_name(const char *name)
{
    struct
    {
        cl_channel_type type;
        const char *name;
    } typeNames[] = { { CL_SNORM_INT8, "CL_SNORM_INT8" },
                      { CL_SNORM_INT16, "CL_SNORM_INT16" },
                      { CL_UNORM_INT8, "CL_UNORM_INT8" },
                      { CL_UNORM_INT16, "CL_UNORM_INT16" },
                      { CL_UNORM_INT24, "CL_UNORM_INT24" },
                      { CL_UNORM_SHORT_565, "CL_UNORM_SHORT_565" },
                      { CL_UNORM_SHORT_555, "CL_UNORM_SHORT_555" },
                      { CL_UNORM_INT_101010, "CL_UNORM_INT_101010" },
                      { CL_SIGNED_INT8, "CL_SIGNED_INT8" },
                      { CL_SIGNED_INT16, "CL_SIGNED_INT16" },
                      { CL_SIGNED_INT32, "CL_SIGNED_INT32" },
                      { CL_UNSIGNED_INT8, "CL_UNSIGNED_INT8" },
                      { CL_UNSIGNED_INT16, "CL_UNSIGNED_INT16" },
                      { CL_UNSIGNED_INT32, "CL_UNSIGNED_INT32" },
                      { CL_HALF_FLOAT, "CL_HALF_FLOAT" },
                      { CL_FLOAT, "CL_FLOAT" },
#ifdef CL_SFIXED14_APPLE
                      { CL_SFIXED14_APPLE, "CL_SFIXED14_APPLE" }
#endif
    };
    for (size_t i = 0; i < sizeof(typeNames) / sizeof(typeNames[0]); i++)
    {
        if (strcmp(typeNames[i].name, name) == 0
            || strcmp(typeNames[i].name + 3, name) == 0)
            return typeNames[i].type;
    }
    return (cl_channel_type)-1;
}

cl_channel_order get_channel_order_from_name(const char *name)
{
    const struct
    {
        cl_channel_order order;
        const char *name;
    } orderNames[] = {
        { CL_R, "CL_R" },
        { CL_A, "CL_A" },
        { CL_Rx, "CL_Rx" },
        { CL_RG, "CL_RG" },
        { CL_RA, "CL_RA" },
        { CL_RGx, "CL_RGx" },
        { CL_RGB, "CL_RGB" },
        { CL_RGBx, "CL_RGBx" },
        { CL_RGBA, "CL_RGBA" },
        { CL_BGRA, "CL_BGRA" },
        { CL_ARGB, "CL_ARGB" },
        { CL_INTENSITY, "CL_INTENSITY" },
        { CL_LUMINANCE, "CL_LUMINANCE" },
        { CL_DEPTH, "CL_DEPTH" },
        { CL_DEPTH_STENCIL, "CL_DEPTH_STENCIL" },
        { CL_sRGB, "CL_sRGB" },
        { CL_sRGBx, "CL_sRGBx" },
        { CL_sRGBA, "CL_sRGBA" },
        { CL_sBGRA, "CL_sBGRA" },
        { CL_ABGR, "CL_ABGR" },
#ifdef CL_1RGB_APPLE
        { CL_1RGB_APPLE, "CL_1RGB_APPLE" },
#endif
#ifdef CL_BGR1_APPLE
        { CL_BGR1_APPLE, "CL_BGR1_APPLE" },
#endif
    };

    for (size_t i = 0; i < sizeof(orderNames) / sizeof(orderNames[0]); i++)
    {
        if (strcmp(orderNames[i].name, name) == 0
            || strcmp(orderNames[i].name + 3, name) == 0)
            return orderNames[i].order;
    }
    return (cl_channel_order)-1;
}


int is_format_signed(const cl_image_format *format)
{
    switch (format->image_channel_data_type)
    {
        case CL_SNORM_INT8:
        case CL_SIGNED_INT8:
        case CL_SNORM_INT16:
        case CL_SIGNED_INT16:
        case CL_SIGNED_INT32:
        case CL_HALF_FLOAT:
        case CL_FLOAT:
#ifdef CL_SFIXED14_APPLE
        case CL_SFIXED14_APPLE:
#endif
            return 1;

        default: return 0;
    }
}

uint32_t get_pixel_size(const cl_image_format *format)
{
    switch (format->image_channel_data_type)
    {
        case CL_SNORM_INT8:
        case CL_UNORM_INT8:
        case CL_SIGNED_INT8:
        case CL_UNSIGNED_INT8: return get_format_channel_count(format);

        case CL_SNORM_INT16:
        case CL_UNORM_INT16:
        case CL_SIGNED_INT16:
        case CL_UNSIGNED_INT16:
        case CL_HALF_FLOAT:
#ifdef CL_SFIXED14_APPLE
        case CL_SFIXED14_APPLE:
#endif
            return get_format_channel_count(format) * sizeof(cl_ushort);

        case CL_SIGNED_INT32:
        case CL_UNSIGNED_INT32:
            return get_format_channel_count(format) * sizeof(cl_int);

        case CL_UNORM_SHORT_565:
        case CL_UNORM_SHORT_555:
#ifdef OBSOLETE_FORAMT
        case CL_UNORM_SHORT_565_REV:
        case CL_UNORM_SHORT_555_REV:
#endif
            return 2;

#ifdef OBSOLETE_FORAMT
        case CL_UNORM_INT_8888:
        case CL_UNORM_INT_8888_REV: return 4;
#endif

        case CL_UNORM_INT_101010:
#ifdef OBSOLETE_FORAMT
        case CL_UNORM_INT_101010_REV:
#endif
            return 4;

        case CL_FLOAT:
            return get_format_channel_count(format) * sizeof(cl_float);

        default: return 0;
    }
}

uint32_t next_power_of_two(uint32_t v)
{
    v--;
    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;
    v++;
    return v;
}

uint32_t get_pixel_alignment(const cl_image_format *format)
{
    return next_power_of_two(get_pixel_size(format));
}

int get_8_bit_image_format(cl_context context, cl_mem_object_type objType,
                           cl_mem_flags flags, size_t channelCount,
                           cl_image_format *outFormat)
{
    cl_image_format formatList[128];
    unsigned int outFormatCount, i;
    int error;


    /* Make sure each image format is supported */
    if ((error = clGetSupportedImageFormats(context, flags, objType, 128,
                                            formatList, &outFormatCount)))
        return error;


    /* Look for one that is an 8-bit format */
    for (i = 0; i < outFormatCount; i++)
    {
        if (formatList[i].image_channel_data_type == CL_SNORM_INT8
            || formatList[i].image_channel_data_type == CL_UNORM_INT8
            || formatList[i].image_channel_data_type == CL_SIGNED_INT8
            || formatList[i].image_channel_data_type == CL_UNSIGNED_INT8)
        {
            if (!channelCount
                || (channelCount
                    && (get_format_channel_count(&formatList[i])
                        == channelCount)))
            {
                *outFormat = formatList[i];
                return 0;
            }
        }
    }

    return -1;
}

int get_32_bit_image_format(cl_context context, cl_mem_object_type objType,
                            cl_mem_flags flags, size_t channelCount,
                            cl_image_format *outFormat)
{
    cl_image_format formatList[128];
    unsigned int outFormatCount, i;
    int error;


    /* Make sure each image format is supported */
    if ((error = clGetSupportedImageFormats(context, flags, objType, 128,
                                            formatList, &outFormatCount)))
        return error;

    /* Look for one that is an 8-bit format */
    for (i = 0; i < outFormatCount; i++)
    {
        if (formatList[i].image_channel_data_type == CL_UNORM_INT_101010
            || formatList[i].image_channel_data_type == CL_FLOAT
            || formatList[i].image_channel_data_type == CL_SIGNED_INT32
            || formatList[i].image_channel_data_type == CL_UNSIGNED_INT32)
        {
            if (!channelCount
                || (channelCount
                    && (get_format_channel_count(&formatList[i])
                        == channelCount)))
            {
                *outFormat = formatList[i];
                return 0;
            }
        }
    }

    return -1;
}

void print_first_pixel_difference_error(size_t where, const char *sourcePixel,
                                        const char *destPixel,
                                        image_descriptor *imageInfo, size_t y,
                                        size_t thirdDim)
{
    size_t pixel_size = get_pixel_size(imageInfo->format);

    log_error("ERROR: Scanline %d did not verify for image size %d,%d,%d "
              "pitch %d (extra %d bytes)\n",
              (int)y, (int)imageInfo->width, (int)imageInfo->height,
              (int)thirdDim, (int)imageInfo->rowPitch,
              (int)imageInfo->rowPitch
                  - (int)imageInfo->width * (int)pixel_size);
    log_error("Failed at column: %zu   ", where);

    switch (pixel_size)
    {
        case 1:
            log_error("*0x%2.2x vs. 0x%2.2x\n", ((cl_uchar *)sourcePixel)[0],
                      ((cl_uchar *)destPixel)[0]);
            break;
        case 2:
            log_error("*0x%4.4x vs. 0x%4.4x\n", ((cl_ushort *)sourcePixel)[0],
                      ((cl_ushort *)destPixel)[0]);
            break;
        case 3:
            log_error("*{0x%2.2x, 0x%2.2x, 0x%2.2x} vs. "
                      "{0x%2.2x, 0x%2.2x, 0x%2.2x}\n",
                      ((cl_uchar *)sourcePixel)[0],
                      ((cl_uchar *)sourcePixel)[1],
                      ((cl_uchar *)sourcePixel)[2], ((cl_uchar *)destPixel)[0],
                      ((cl_uchar *)destPixel)[1], ((cl_uchar *)destPixel)[2]);
            break;
        case 4:
            log_error("*0x%8.8x vs. 0x%8.8x\n", ((cl_uint *)sourcePixel)[0],
                      ((cl_uint *)destPixel)[0]);
            break;
        case 6:
            log_error(
                "*{0x%4.4x, 0x%4.4x, 0x%4.4x} vs. "
                "{0x%4.4x, 0x%4.4x, 0x%4.4x}\n",
                ((cl_ushort *)sourcePixel)[0], ((cl_ushort *)sourcePixel)[1],
                ((cl_ushort *)sourcePixel)[2], ((cl_ushort *)destPixel)[0],
                ((cl_ushort *)destPixel)[1], ((cl_ushort *)destPixel)[2]);
            break;
        case 8:
            log_error("*0x%16.16" PRIx64 " vs. 0x%16.16" PRIx64 "\n",
                      ((cl_ulong *)sourcePixel)[0], ((cl_ulong *)destPixel)[0]);
            break;
        case 12:
            log_error("*{0x%8.8x, 0x%8.8x, 0x%8.8x} vs. "
                      "{0x%8.8x, 0x%8.8x, 0x%8.8x}\n",
                      ((cl_uint *)sourcePixel)[0], ((cl_uint *)sourcePixel)[1],
                      ((cl_uint *)sourcePixel)[2], ((cl_uint *)destPixel)[0],
                      ((cl_uint *)destPixel)[1], ((cl_uint *)destPixel)[2]);
            break;
        case 16:
            log_error("*{0x%8.8x, 0x%8.8x, 0x%8.8x, 0x%8.8x} vs. "
                      "{0x%8.8x, 0x%8.8x, 0x%8.8x, 0x%8.8x}\n",
                      ((cl_uint *)sourcePixel)[0], ((cl_uint *)sourcePixel)[1],
                      ((cl_uint *)sourcePixel)[2], ((cl_uint *)sourcePixel)[3],
                      ((cl_uint *)destPixel)[0], ((cl_uint *)destPixel)[1],
                      ((cl_uint *)destPixel)[2], ((cl_uint *)destPixel)[3]);
            break;
        default:
            log_error("Don't know how to print pixel size of %zu\n",
                      pixel_size);
            break;
    }
}

size_t compare_scanlines(const image_descriptor *imageInfo, const char *aPtr,
                         const char *bPtr)
{
    size_t pixel_size = get_pixel_size(imageInfo->format);
    size_t column;

    for (column = 0; column < imageInfo->width; column++)
    {
        switch (imageInfo->format->image_channel_data_type)
        {
            // If the data type is 101010, then ignore bits 31 and 32 when
            // comparing the row
            case CL_UNORM_INT_101010: {
                cl_uint aPixel = *(cl_uint *)aPtr;
                cl_uint bPixel = *(cl_uint *)bPtr;
                if ((aPixel & 0x3fffffff) != (bPixel & 0x3fffffff))
                    return column;
            }
            break;

            // If the data type is 555, ignore bit 15 when comparing the row
            case CL_UNORM_SHORT_555: {
                cl_ushort aPixel = *(cl_ushort *)aPtr;
                cl_ushort bPixel = *(cl_ushort *)bPtr;
                if ((aPixel & 0x7fff) != (bPixel & 0x7fff)) return column;
            }
            break;

            default:
                if (memcmp(aPtr, bPtr, pixel_size) != 0) return column;
                break;
        }

        aPtr += pixel_size;
        bPtr += pixel_size;
    }

    // If we didn't find a difference, return the width of the image
    return column;
}

int random_log_in_range(int minV, int maxV, MTdata d)
{
    double v = log2(((double)genrand_int32(d) / (double)0xffffffff) + 1);
    int iv = (int)((float)(maxV - minV) * v);
    return iv + minV;
}


#ifdef __SSE2__
static inline __m128i vifloorf(__m128 f)
{
#ifdef __SSE4_1__
    return _mm_cvtps_epi32(
        _mm_round_ps(f, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
#else
    // No packed rounding until SSE4... do this the old-fashioned way
    unsigned int mxcsr = _mm_getcsr();
    _mm_setcsr(mxcsr & ~_MM_ROUND_MASK | _MM_ROUND_DOWN);
    __m128i i = _mm_cvtps_epi32(f);
    _mm_setcsr(mxcsr);
    return i;
#endif
}

static inline __m128 vfloorf(__m128 f)
{
#ifdef __SSE4_1__
    return _mm_round_ps(f, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
#else
    // No packed rounding until SSE4... do this the old-fashioned way
    unsigned int mxcsr = _mm_getcsr();
    _mm_setcsr(mxcsr & ~_MM_ROUND_MASK | _MM_ROUND_DOWN);
    f = _mm_cvtepi32_ps(_mm_cvtps_epi32(f));
    _mm_setcsr(mxcsr);
    return f;
#endif
}

static inline __m128 frac(__m128 a) { return _mm_sub_ps(a, vfloorf(a)); }
#else
static inline float frac(float a) { return a - floorf(a); }
#endif

// Define the addressing functions
#ifdef __SSE2__
typedef __m128i (*AddressFn)(__m128i value, __m128i maxValue);

__m128i NoAddressFn(__m128i value, __m128i maxValue) { return value; }
__m128i RepeatAddressFn(__m128i value, __m128i maxValue)
{
    __m128i minMask = _mm_cmplt_epi32(value, _mm_setzero_si128());
    __m128i maxMask =
        _mm_cmpgt_epi32(value, _mm_add_epi32(maxValue, _mm_setmone_si128()));
    return SELECT_I(minMask, _mm_add_epi32(value, maxValue),
                    SELECT_I(maxMask, _mm_sub_epi32(value, maxValue), value));
}
__m128i MirroredRepeatAddressFn(__m128i value, __m128i maxValue)
{
#ifdef __SSE4_1__
    return _mm_max_epi32(
        _mm_min_epi32(value, _mm_add_epi32(maxValue, _mm_setmone_si128())),
        _mm_setzero_si128());
#else
    __m128i zero = _mm_setzero_si128();
    maxValue = _mm_add_epi32(maxValue, _mm_setmone_si128());
    __m128i minMask = _mm_cmplt_epi32(value, zero);
    __m128i maxMask = _mm_cmpgt_epi32(value, maxValue);
    return SELECT_I(minMask, zero, SELECT_I(maxMask, maxValue, value));
#endif
}
__m128i ClampAddressFn(__m128i value, __m128i maxValue)
{
    __m128i negOne = _mm_cmpgt_epi32(maxValue, _mm_setzero_si128());
#ifdef __SSE4_1__
    return _mm_max_epi32(_mm_min_epi32(value, maxValue), negOne);
#else
    __m128i minMask = _mm_cmplt_epi32(value, negOne);
    __m128i maxMask = _mm_cmpgt_epi32(value, maxValue);
    return SELECT_I(minMask, negOne, SELECT_I(maxMask, maxValue, value));
#endif
}
__m128i ClampToEdgeNearestFn(__m128i value, __m128i maxValue)
{
#ifdef __SSE4_1__
    return _mm_max_epi32(
        _mm_min_epi32(value, _mm_add_epi32(maxValue, _mm_setmone_si128())),
        _mm_setzero_si128());
#else
    __m128i zero = _mm_setzero_si128();
    maxValue = _mm_add_epi32(maxValue, _mm_setmone_si128());
    __m128i minMask = _mm_cmplt_epi32(value, zero);
    __m128i maxMask = _mm_cmpgt_epi32(value, maxValue);
    return SELECT_I(minMask, zero, SELECT_I(maxMask, maxValue, value));
#endif
}
AddressFn ClampToEdgeLinearFn = ClampToEdgeNearestFn;

// Note: normalized coords get repeated in normalized space, not unnormalized
// space! hence the special case here
__m128 RepeatNormalizedAddressFn(__m128 fValue, __m128i maxValue)
{
    return _mm_mul_ps(frac(fValue), _mm_cvtepi32_ps(maxValue));
}

static inline __m128 vrintf(__m128 f)
{
#ifdef __SSE4_1__
    return _mm_round_ps(f, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#else
    // No packed rounding until SSE4... do this the old-fashioned way
    return _mm_cvtepi32_ps(_mm_cvtps_epi32(f));
#endif
}

static inline __m128 vfabsf(__m128 f)
{
    return _mm_andnot_ps(_mm_castsi128_ps(_mm_slli_epi32(_mm_setmone_si128(),
                                                         31) /* 0x80000000 */),
                         f);
}

__m128 MirroredRepeatNormalizedAddressFn(__m128 fValue, __m128i maxValue)
{
    // Round to nearest multiple of two.
    // Note halfway values flip flop here due to rte, but they both end up
    // pointing the same place at the end of the day.
    __m128 s_prime = vrintf(_mm_mul_ps(fValue, _mm_set1_ps(0.5f)));
    s_prime = _mm_add_ps(s_prime, s_prime);

    // Reduce to [-1, 1], Apply mirroring -> [0, 1]
    s_prime = vfabsf(_mm_sub_ps(fValue, s_prime));

    // un-normalize
    return _mm_mul_ps(s_prime, _mm_cvtepi32_ps(maxValue));
}
#else
typedef int (*AddressFn)(int value, size_t maxValue);

int NoAddressFn(int value, size_t maxValue) { return value; }
int RepeatAddressFn(int value, size_t maxValue)
{
    if (value < 0)
        value += (int)maxValue;
    else if (value >= (int)maxValue)
        value -= (int)maxValue;
    return value;
}
int MirroredRepeatAddressFn(int value, size_t maxValue)
{
    if (value < 0)
        value = 0;
    else if ((size_t)value >= maxValue)
        value = (int)(maxValue - 1);
    return value;
}
int ClampAddressFn(int value, size_t maxValue)
{
    return (value < -1) ? -1
                        : ((value > (cl_long)maxValue) ? (int)maxValue : value);
}
int ClampToEdgeNearestFn(int value, size_t maxValue)
{
    return (value < 0)
        ? 0
        : (((size_t)value > maxValue - 1) ? (int)maxValue - 1 : value);
}
AddressFn ClampToEdgeLinearFn = ClampToEdgeNearestFn;

// Note: normalized coords get repeated in normalized space, not unnormalized
// space! hence the special case here
volatile float gFloatHome;
float RepeatNormalizedAddressFn(float fValue, size_t maxValue)
{
#ifndef _MSC_VER // Use original if not the VS compiler.
    // General computation for repeat
    return frac(fValue) * (float)maxValue; // Reduce to [0, 1.f]
#else // Otherwise, use this instead:
    // Home the subtraction to a float to break up the sequence of x87
    // instructions emitted by the VS compiler.
    gFloatHome = fValue - floorf(fValue);
    return gFloatHome * (float)maxValue;
#endif
}

float MirroredRepeatNormalizedAddressFn(float fValue, size_t maxValue)
{
    // Round to nearest multiple of two.
    // Note halfway values flip flop here due to rte, but they both end up
    // pointing the same place at the end of the day.
    float s_prime = 2.0f * rintf(fValue * 0.5f);

    // Reduce to [-1, 1], Apply mirroring -> [0, 1]
    s_prime = fabsf(fValue - s_prime);

    // un-normalize
    return s_prime * (float)maxValue;
}
#endif

struct AddressingTable
{
    AddressingTable()
    {
        static_assert(CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE < 6, "");
        static_assert(CL_FILTER_NEAREST - CL_FILTER_LINEAR < 2, "");

        mTable[CL_ADDRESS_NONE - CL_ADDRESS_NONE]
              [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = NoAddressFn;
        mTable[CL_ADDRESS_NONE - CL_ADDRESS_NONE]
              [CL_FILTER_LINEAR - CL_FILTER_NEAREST] = NoAddressFn;
        mTable[CL_ADDRESS_REPEAT - CL_ADDRESS_NONE]
              [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = RepeatAddressFn;
        mTable[CL_ADDRESS_REPEAT - CL_ADDRESS_NONE]
              [CL_FILTER_LINEAR - CL_FILTER_NEAREST] = RepeatAddressFn;
        mTable[CL_ADDRESS_CLAMP_TO_EDGE - CL_ADDRESS_NONE]
              [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = ClampToEdgeNearestFn;
        mTable[CL_ADDRESS_CLAMP_TO_EDGE - CL_ADDRESS_NONE]
              [CL_FILTER_LINEAR - CL_FILTER_NEAREST] = ClampToEdgeLinearFn;
        mTable[CL_ADDRESS_CLAMP - CL_ADDRESS_NONE]
              [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = ClampAddressFn;
        mTable[CL_ADDRESS_CLAMP - CL_ADDRESS_NONE]
              [CL_FILTER_LINEAR - CL_FILTER_NEAREST] = ClampAddressFn;
        mTable[CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE]
              [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = MirroredRepeatAddressFn;
        mTable[CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE]
              [CL_FILTER_LINEAR - CL_FILTER_NEAREST] = MirroredRepeatAddressFn;
    }

    AddressFn operator[](image_sampler_data *sampler)
    {
        return mTable[(int)sampler->addressing_mode - CL_ADDRESS_NONE]
                     [(int)sampler->filter_mode - CL_FILTER_NEAREST];
    }

    AddressFn mTable[6][2];
};

static AddressingTable sAddressingTable;

bool is_sRGBA_order(cl_channel_order image_channel_order)
{
    switch (image_channel_order)
    {
        case CL_sRGB:
        case CL_sRGBx:
        case CL_sRGBA:
        case CL_sBGRA: return true;
        default: return false;
    }
}

// Format helpers

int has_alpha(const cl_image_format *format)
{
    switch (format->image_channel_order)
    {
        case CL_R: return 0;
        case CL_A: return 1;
        case CL_Rx: return 0;
        case CL_RG: return 0;
        case CL_RA: return 1;
        case CL_RGx: return 0;
        case CL_RGB:
        case CL_sRGB: return 0;
        case CL_RGBx:
        case CL_sRGBx: return 0;
        case CL_RGBA: return 1;
        case CL_BGRA: return 1;
        case CL_ARGB: return 1;
        case CL_ABGR: return 1;
        case CL_INTENSITY: return 1;
        case CL_LUMINANCE: return 0;
#ifdef CL_BGR1_APPLE
        case CL_BGR1_APPLE: return 1;
#endif
#ifdef CL_1RGB_APPLE
        case CL_1RGB_APPLE: return 1;
#endif
        case CL_sRGBA:
        case CL_sBGRA: return 1;
        case CL_DEPTH: return 0;
        default:
            log_error("Invalid image channel order: %d\n",
                      format->image_channel_order);
            return 0;
    }
}

#define PRINT_MAX_SIZE_LOGIC 0

#define SWAP(_a, _b)                                                           \
    do                                                                         \
    {                                                                          \
        _a ^= _b;                                                              \
        _b ^= _a;                                                              \
        _a ^= _b;                                                              \
    } while (0)

void get_max_sizes(
    size_t *numberOfSizes, const int maxNumberOfSizes, size_t sizes[][3],
    size_t maxWidth, size_t maxHeight, size_t maxDepth, size_t maxArraySize,
    const cl_ulong maxIndividualAllocSize, // CL_DEVICE_MAX_MEM_ALLOC_SIZE
    const cl_ulong maxTotalAllocSize, // CL_DEVICE_GLOBAL_MEM_SIZE
    cl_mem_object_type image_type, const cl_image_format *format,
    int usingMaxPixelSizeBuffer)
{

    bool is3D = (image_type == CL_MEM_OBJECT_IMAGE3D);
    bool isArray = (image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY
                    || image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY);

    // Validate we have a reasonable max depth for 3D
    if (is3D && maxDepth < 2)
    {
        log_error("ERROR: Requesting max image sizes for 3D images when max "
                  "depth is < 2.\n");
        *numberOfSizes = 0;
        return;
    }
    // Validate we have a reasonable max array size for 1D & 2D image arrays
    if (isArray && maxArraySize < 2)
    {
        log_error("ERROR: Requesting max image sizes for an image array when "
                  "max array size is < 1.\n");
        *numberOfSizes = 0;
        return;
    }

    // Reduce the maximum because we are trying to test the max image
    // dimensions, not the memory allocation
    cl_ulong adjustedMaxTotalAllocSize = maxTotalAllocSize / 4;
    cl_ulong adjustedMaxIndividualAllocSize = maxIndividualAllocSize / 4;
    log_info("Note: max individual allocation adjusted down from %gMB to %gMB "
             "and max total allocation adjusted down from %gMB to %gMB.\n",
             maxIndividualAllocSize / (1024.0 * 1024.0),
             adjustedMaxIndividualAllocSize / (1024.0 * 1024.0),
             maxTotalAllocSize / (1024.0 * 1024.0),
             adjustedMaxTotalAllocSize / (1024.0 * 1024.0));

    // Cap our max allocation to 1.0GB.
    // FIXME -- why?  In the interest of not taking a long time?  We should
    // still test this stuff...
    if (adjustedMaxTotalAllocSize > (cl_ulong)1024 * 1024 * 1024)
    {
        adjustedMaxTotalAllocSize = (cl_ulong)1024 * 1024 * 1024;
        log_info("Limiting max total allocation size to %gMB (down from %gMB) "
                 "for test.\n",
                 adjustedMaxTotalAllocSize / (1024.0 * 1024.0),
                 maxTotalAllocSize / (1024.0 * 1024.0));
    }

    cl_ulong maxAllocSize = adjustedMaxIndividualAllocSize;
    if (adjustedMaxTotalAllocSize < adjustedMaxIndividualAllocSize * 2)
        maxAllocSize = adjustedMaxTotalAllocSize / 2;

    size_t raw_pixel_size = get_pixel_size(format);
    // If the test will be creating input (src) buffer of type int4 or float4,
    // number of pixels will be governed by sizeof(int4 or float4) and not
    // sizeof(dest fomat) Also if pixel size is 12 bytes i.e. RGB or RGBx, we
    // adjust it to 16 bytes as GPUs has no concept of 3 channel images. GPUs
    // expand these to four channel RGBA.
    if (usingMaxPixelSizeBuffer || raw_pixel_size == 12) raw_pixel_size = 16;
    size_t max_pixels = (size_t)maxAllocSize / raw_pixel_size;

    log_info("Maximums: [%zu x %zu x %zu], raw pixel size %zu bytes, "
             "per-allocation limit %gMB.\n",
             maxWidth, maxHeight, isArray ? maxArraySize : maxDepth,
             raw_pixel_size, (maxAllocSize / (1024.0 * 1024.0)));

    // Keep track of the maximum sizes for each dimension
    size_t maximum_sizes[] = { maxWidth, maxHeight, maxDepth };

    switch (image_type)
    {
        case CL_MEM_OBJECT_IMAGE1D_ARRAY:
            maximum_sizes[1] = maxArraySize;
            maximum_sizes[2] = 1;
            break;
        case CL_MEM_OBJECT_IMAGE2D_ARRAY:
            maximum_sizes[2] = maxArraySize;
            break;
    }


        // Given one fixed sized dimension, this code finds one or two other
        // dimensions, both with very small size, such that the size does not
        // exceed the maximum passed to this function

#if defined(__x86_64) || defined(__arm64__) || defined(__ppc64__)
    size_t other_sizes[] = { 2, 3, 5, 6, 7, 9, 10, 11, 13, 15 };
#else
    size_t other_sizes[] = { 2, 3, 5, 6, 7, 9, 11, 13 };
#endif

    static size_t other_size = 0;
    enum
    {
        num_other_sizes = sizeof(other_sizes) / sizeof(size_t)
    };

    (*numberOfSizes) = 0;

    if (image_type == CL_MEM_OBJECT_IMAGE1D)
    {

        size_t M = maximum_sizes[0];

        // Store the size
        sizes[(*numberOfSizes)][0] = M;
        sizes[(*numberOfSizes)][1] = 1;
        sizes[(*numberOfSizes)][2] = 1;
        ++(*numberOfSizes);
    }

    else if (image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY
             || image_type == CL_MEM_OBJECT_IMAGE2D)
    {

        for (int fixed_dim = 0; fixed_dim < 2; ++fixed_dim)
        {

            // Determine the size of the fixed dimension
            size_t M = maximum_sizes[fixed_dim];
            size_t A = max_pixels;

            int x0_dim = !fixed_dim;
            size_t x0 = static_cast<size_t>(
                fmin(fmin(other_sizes[(other_size++) % num_other_sizes], A / M),
                     maximum_sizes[x0_dim]));

            // Store the size
            sizes[(*numberOfSizes)][fixed_dim] = M;
            sizes[(*numberOfSizes)][x0_dim] = x0;
            sizes[(*numberOfSizes)][2] = 1;
            ++(*numberOfSizes);
        }
    }

    else if (image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY
             || image_type == CL_MEM_OBJECT_IMAGE3D)
    {

        // Iterate over dimensions, finding sizes for the non-fixed dimension
        for (int fixed_dim = 0; fixed_dim < 3; ++fixed_dim)
        {

            // Determine the size of the fixed dimension
            size_t M = maximum_sizes[fixed_dim];
            size_t A = max_pixels;

            // Find two other dimensions, x0 and x1
            int x0_dim = (fixed_dim == 0) ? 1 : 0;
            int x1_dim = (fixed_dim == 2) ? 1 : 2;

            // Choose two other sizes for these dimensions
            size_t x0 = static_cast<size_t>(
                fmin(fmin(A / M, maximum_sizes[x0_dim]),
                     other_sizes[(other_size++) % num_other_sizes]));
            // GPUs have certain restrictions on minimum width (row alignment)
            // of images which has given us issues testing small widths in this
            // test (say we set width to 3 for testing, and compute size based
            // on this width and decide it fits within vram ... but GPU driver
            // decides that, due to row alignment requirements, it has to use
            // width of 16 which doesnt fit in vram). For this purpose we are
            // not testing width < 16 for this test.
            if (x0_dim == 0 && x0 < 16) x0 = 16;
            size_t x1 = static_cast<size_t>(
                fmin(fmin(A / M / x0, maximum_sizes[x1_dim]),
                     other_sizes[(other_size++) % num_other_sizes]));

            // Valid image sizes cannot be below 1. Due to the workaround for
            // the xo_dim where x0 is overidden to 16 there might not be enough
            // space left for x1 dimension. This could be a fractional 0.x size
            // that when cast to integer would result in a value 0. In these
            // cases we clamp the size to a minimum of 1.
            if (x1 < 1) x1 = 1;

            // M and x0 cannot be '0' as they derive from clDeviceInfo calls
            assert(x0 > 0 && M > 0);

            // Store the size
            sizes[(*numberOfSizes)][fixed_dim] = M;
            sizes[(*numberOfSizes)][x0_dim] = x0;
            sizes[(*numberOfSizes)][x1_dim] = x1;
            ++(*numberOfSizes);
        }
    }

    // Log the results
    for (int j = 0; j < (int)(*numberOfSizes); j++)
    {
        switch (image_type)
        {
            case CL_MEM_OBJECT_IMAGE1D:
                log_info(" size[%d] = [%zu] (%g MB image)\n", j, sizes[j][0],
                         raw_pixel_size * sizes[j][0] * sizes[j][1]
                             * sizes[j][2] / (1024.0 * 1024.0));
                break;
            case CL_MEM_OBJECT_IMAGE1D_ARRAY:
            case CL_MEM_OBJECT_IMAGE2D:
                log_info(" size[%d] = [%zu %zu] (%g MB image)\n", j,
                         sizes[j][0], sizes[j][1],
                         raw_pixel_size * sizes[j][0] * sizes[j][1]
                             * sizes[j][2] / (1024.0 * 1024.0));
                break;
            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
            case CL_MEM_OBJECT_IMAGE3D:
                log_info(" size[%d] = [%zu %zu %zu] (%g MB image)\n", j,
                         sizes[j][0], sizes[j][1], sizes[j][2],
                         raw_pixel_size * sizes[j][0] * sizes[j][1]
                             * sizes[j][2] / (1024.0 * 1024.0));
                break;
        }
    }
}

float get_max_absolute_error(const cl_image_format *format,
                             image_sampler_data *sampler)
{
    if (sampler->filter_mode == CL_FILTER_NEAREST) return 0.0f;

    switch (format->image_channel_data_type)
    {
        case CL_SNORM_INT8: return 1.0f / 127.0f;
        case CL_UNORM_INT8: return 1.0f / 255.0f;
        case CL_UNORM_INT16: return 1.0f / 65535.0f;
        case CL_SNORM_INT16: return 1.0f / 32767.0f;
        case CL_FLOAT: return CL_FLT_MIN;
#ifdef CL_SFIXED14_APPLE
        case CL_SFIXED14_APPLE: return 0x1.0p-14f;
#endif
        case CL_UNORM_SHORT_555:
        case CL_UNORM_SHORT_565: return 1.0f / 31.0f;
        default: return 0.0f;
    }
}

float get_max_relative_error(const cl_image_format *format,
                             image_sampler_data *sampler, int is3D,
                             int isLinearFilter)
{
    float maxError = 0.0f;
    float sampleCount = 1.0f;
    if (isLinearFilter) sampleCount = is3D ? 8.0f : 4.0f;

    // Note that the ULP is defined here as the unit in the last place of the
    // maximum magnitude sample used for filtering.

    // Section 8.3
    switch (format->image_channel_data_type)
    {
        // The spec allows 2 ulps of error for normalized formats
        case CL_SNORM_INT8:
        case CL_UNORM_INT8:
        case CL_SNORM_INT16:
        case CL_UNORM_INT16:
        case CL_UNORM_SHORT_565:
        case CL_UNORM_SHORT_555:
        case CL_UNORM_INT_101010:
            // Maximum sampling error for round to zero normalization based on
            // multiplication by reciprocal (using reciprocal generated in
            // round to +inf mode, so that 1.0 matches spec)
            maxError = 2 * FLT_EPSILON * sampleCount;
            break;

            // If the implementation supports these formats then it will have to
            // allow rounding error here too, because not all 32-bit ints are
            // exactly representable in float
        case CL_SIGNED_INT32:
        case CL_UNSIGNED_INT32: maxError = 1 * FLT_EPSILON; break;
    }


    // Section 8.2
    if (sampler->addressing_mode == CL_ADDRESS_REPEAT
        || sampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT
        || sampler->filter_mode != CL_FILTER_NEAREST
        || sampler->normalized_coords)
#if defined(__APPLE__)
    {
        if (sampler->filter_mode != CL_FILTER_NEAREST)
        {
            // The maximum
            if (gDeviceType == CL_DEVICE_TYPE_GPU)
                // Some GPUs ain't so accurate
                maxError += MAKE_HEX_FLOAT(0x1.0p-4f, 0x1L, -4);
            else
                // The standard method of 2d linear filtering delivers 4.0 ulps
                // of error in round to nearest (8 in rtz).
                maxError += 4.0f * FLT_EPSILON;
        }
        else
            // normalized coordinates will introduce some error into the
            // fractional part of the address, affecting results
            maxError += 4.0f * FLT_EPSILON;
    }
#else
    {
#if !defined(_WIN32)
#warning Implementations will likely wish to pick a max allowable sampling error policy here that is better than the spec
#endif
        // The spec allows linear filters to return any result most of the time.
        // That's fine for implementations but a problem for testing. After all
        // users aren't going to like garbage images.  We have "picked a number"
        // here that we are going to attempt to conform to. Implementations are
        // free to pick another number, like infinity, if they like.
        // We picked a number for you, to provide /some/ sanity
        maxError = MAKE_HEX_FLOAT(0x1.0p-7f, 0x1L, -7);
        // ...but this is what the spec allows:
        // maxError = INFINITY;
        // Please feel free to pick any positive number. (NaN wont work.)
    }
#endif

    // The error calculation itself can introduce error
    maxError += FLT_EPSILON * 2;

    return maxError;
}

size_t get_format_max_int(const cl_image_format *format)
{
    switch (format->image_channel_data_type)
    {
        case CL_SNORM_INT8:
        case CL_SIGNED_INT8: return 127;
        case CL_UNORM_INT8:
        case CL_UNSIGNED_INT8: return 255;

        case CL_SNORM_INT16:
        case CL_SIGNED_INT16: return 32767;

        case CL_UNORM_INT16:
        case CL_UNSIGNED_INT16: return 65535;

        case CL_SIGNED_INT32: return 2147483647L;

        case CL_UNSIGNED_INT32: return 4294967295LL;

        case CL_UNORM_SHORT_565:
        case CL_UNORM_SHORT_555: return 31;

        case CL_UNORM_INT_101010: return 1023;

        case CL_HALF_FLOAT: return 1 << 10;

#ifdef CL_SFIXED14_APPLE
        case CL_SFIXED14_APPLE: return 16384;
#endif
        default: return 0;
    }
}

int get_format_min_int(const cl_image_format *format)
{
    switch (format->image_channel_data_type)
    {
        case CL_SNORM_INT8:
        case CL_SIGNED_INT8: return -128;
        case CL_UNORM_INT8:
        case CL_UNSIGNED_INT8: return 0;

        case CL_SNORM_INT16:
        case CL_SIGNED_INT16: return -32768;

        case CL_UNORM_INT16:
        case CL_UNSIGNED_INT16: return 0;

        case CL_SIGNED_INT32: return -2147483648LL;

        case CL_UNSIGNED_INT32: return 0;

        case CL_UNORM_SHORT_565:
        case CL_UNORM_SHORT_555:
        case CL_UNORM_INT_101010: return 0;

        case CL_HALF_FLOAT: return -(1 << 10);

#ifdef CL_SFIXED14_APPLE
        case CL_SFIXED14_APPLE: return -16384;
#endif

        default: return 0;
    }
}

cl_half convert_float_to_half(float f)
{
    switch (gFloatToHalfRoundingMode)
    {
        case kRoundToNearestEven: return cl_half_from_float(f, CL_HALF_RTE);
        case kRoundTowardZero: return cl_half_from_float(f, CL_HALF_RTZ);
        default:
            log_error("ERROR: Test internal error -- unhandled or unknown "
                      "float->half rounding mode.\n");
            exit(-1);
            return 0xffff;
    }
}

cl_ulong get_image_size(image_descriptor const *imageInfo)
{
    cl_ulong imageSize;

    // Assumes rowPitch and slicePitch are always correctly defined
    if (/*gTestMipmaps*/ imageInfo->num_mip_levels > 1)
    {
        imageSize = (size_t)compute_mipmapped_image_size(*imageInfo);
    }
    else
    {
        switch (imageInfo->type)
        {
            case CL_MEM_OBJECT_IMAGE1D: imageSize = imageInfo->rowPitch; break;
            case CL_MEM_OBJECT_IMAGE2D:
                imageSize = imageInfo->height * imageInfo->rowPitch;
                break;
            case CL_MEM_OBJECT_IMAGE3D:
                imageSize = imageInfo->depth * imageInfo->slicePitch;
                break;
            case CL_MEM_OBJECT_IMAGE1D_ARRAY:
                imageSize = imageInfo->arraySize * imageInfo->slicePitch;
                break;
            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
                imageSize = imageInfo->arraySize * imageInfo->slicePitch;
                break;
            default:
                log_error("ERROR: Cannot identify image type %x\n",
                          imageInfo->type);
                abort();
        }
    }
    return imageSize;
}

// Calculate image size in megabytes (strictly, mebibytes). Result is rounded
// up.
cl_ulong get_image_size_mb(image_descriptor const *imageInfo)
{
    cl_ulong imageSize = get_image_size(imageInfo);
    cl_ulong mb = imageSize / (1024 * 1024);
    if (imageSize % (1024 * 1024) > 0)
    {
        mb += 1;
    }
    return mb;
}


uint64_t gRoundingStartValue = 0;


void escape_inf_nan_values(char *data, size_t allocSize)
{
    // filter values with 8 not-quite-highest bits
    unsigned int *intPtr = (unsigned int *)data;
    for (size_t i = 0; i<allocSize>> 2; i++)
    {
        if ((intPtr[i] & 0x7F800000) == 0x7F800000) intPtr[i] ^= 0x40000000;
    }

    // Ditto with half floats (16-bit numbers with the 5 not-quite-highest bits
    // = 0x7C00 are special)
    unsigned short *shortPtr = (unsigned short *)data;
    for (size_t i = 0; i<allocSize>> 1; i++)
    {
        if ((shortPtr[i] & 0x7C00) == 0x7C00) shortPtr[i] ^= 0x4000;
    }
}

char *generate_random_image_data(image_descriptor *imageInfo,
                                 BufferOwningPtr<char> &P, MTdata d)
{
    size_t allocSize = static_cast<size_t>(get_image_size(imageInfo));
    size_t pixelRowBytes = imageInfo->width * get_pixel_size(imageInfo->format);
    size_t i;

    if (imageInfo->num_mip_levels > 1)
        allocSize =
            static_cast<size_t>(compute_mipmapped_image_size(*imageInfo));

#if defined(__APPLE__)
    char *data = NULL;
    if (gDeviceType == CL_DEVICE_TYPE_CPU)
    {
        size_t mapSize = ((allocSize + 4095L) & -4096L) + 8192;

        void *map = mmap(0, mapSize, PROT_READ | PROT_WRITE,
                         MAP_ANON | MAP_PRIVATE, 0, 0);
        intptr_t data_end = (intptr_t)map + mapSize - 4096;
        data = (char *)(data_end - (intptr_t)allocSize);

        mprotect(map, 4096, PROT_NONE);
        mprotect((void *)((char *)map + mapSize - 4096), 4096, PROT_NONE);
        P.reset(data, map, mapSize, allocSize);
    }
    else
    {
        data = (char *)malloc(allocSize);
        P.reset(data, NULL, 0, allocSize);
    }
#else
    P.reset(NULL); // Free already allocated memory first, then try to allocate
                   // new block.
    char *data =
        (char *)align_malloc(allocSize, get_pixel_alignment(imageInfo->format));
    P.reset(data, NULL, 0, allocSize, true);
#endif

    if (data == NULL)
    {
        log_error("ERROR: Unable to malloc %zu bytes for "
                  "generate_random_image_data\n",
                  allocSize);
        return 0;
    }

    if (gTestRounding)
    {
        // Special case: fill with a ramp from 0 to the size of the type
        size_t typeSize = get_format_type_size(imageInfo->format);
        switch (typeSize)
        {
            case 1: {
                char *ptr = data;
                for (i = 0; i < allocSize; i++)
                    ptr[i] = (cl_char)(i + gRoundingStartValue);
            }
            break;
            case 2: {
                cl_short *ptr = (cl_short *)data;
                for (i = 0; i < allocSize / 2; i++)
                    ptr[i] = (cl_short)(i + gRoundingStartValue);
            }
            break;
            case 4: {
                cl_int *ptr = (cl_int *)data;
                for (i = 0; i < allocSize / 4; i++)
                    ptr[i] = (cl_int)(i + gRoundingStartValue);
            }
            break;
        }

        // Note: inf or nan float values would cause problems, although we don't
        // know this will actually be a float, so we just know what to look for
        escape_inf_nan_values(data, allocSize);
        return data;
    }

    // Otherwise, we should be able to just fill with random bits no matter what
    cl_uint *p = (cl_uint *)data;
    for (i = 0; i + 4 <= allocSize; i += 4) p[i / 4] = genrand_int32(d);

    for (; i < allocSize; i++) data[i] = genrand_int32(d);

    // Note: inf or nan float values would cause problems, although we don't
    // know this will actually be a float, so we just know what to look for
    escape_inf_nan_values(data, allocSize);

    if (/*!gTestMipmaps*/ imageInfo->num_mip_levels < 2)
    {
        // Fill unused edges with -1, NaN for float
        if (imageInfo->rowPitch > pixelRowBytes)
        {
            size_t height = 0;

            switch (imageInfo->type)
            {
                case CL_MEM_OBJECT_IMAGE2D:
                case CL_MEM_OBJECT_IMAGE3D:
                case CL_MEM_OBJECT_IMAGE2D_ARRAY:
                    height = imageInfo->height;
                    break;
                case CL_MEM_OBJECT_IMAGE1D_ARRAY:
                    height = imageInfo->arraySize;
                    break;
            }

            // Fill in the row padding regions
            for (i = 0; i < height; i++)
            {
                size_t offset = i * imageInfo->rowPitch + pixelRowBytes;
                size_t length = imageInfo->rowPitch - pixelRowBytes;
                memset(data + offset, 0xff, length);
            }
        }

        // Fill in the slice padding regions, if necessary:

        size_t slice_dimension = imageInfo->height;
        if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
        {
            slice_dimension = imageInfo->arraySize;
        }

        if (imageInfo->slicePitch > slice_dimension * imageInfo->rowPitch)
        {
            size_t depth = 0;
            switch (imageInfo->type)
            {
                case CL_MEM_OBJECT_IMAGE2D:
                case CL_MEM_OBJECT_IMAGE3D: depth = imageInfo->depth; break;
                case CL_MEM_OBJECT_IMAGE1D_ARRAY:
                case CL_MEM_OBJECT_IMAGE2D_ARRAY:
                    depth = imageInfo->arraySize;
                    break;
            }

            for (i = 0; i < depth; i++)
            {
                size_t offset = i * imageInfo->slicePitch
                    + slice_dimension * imageInfo->rowPitch;
                size_t length = imageInfo->slicePitch
                    - slice_dimension * imageInfo->rowPitch;
                memset(data + offset, 0xff, length);
            }
        }
    }

    return data;
}

#ifdef __SSE2__
#define CLAMP_FLOAT_V(v)                                                       \
    (_mm_max_ps(_mm_min_ps(v, _mm_set1_ps(1.f)), _mm_set1_ps(-1.f)))
#define CLAMP_FLOAT(v)                                                         \
    (_mm_max_ss(_mm_min_ss(v, _mm_set_ss(1.f)), _mm_set_ss(-1.f)))

#ifdef __SSE4_1__
#define SET_ALPHA_1(v)                                                         \
    (_mm_insert_ps(v, _mm_set_ss(1.f), _MM_MK_INSERTPS_NDX(0, 3, 0)))
#define SELECT_F(cond, a, b) _mm_blendv_ps(b, a, cond)
#define EXTRACT_I(v, i) _mm_extract_epi32(v, i)
#ifdef __x86_64
#define EXTRACT_I64(v, i) _mm_extract_epi64(v, i)
#endif
#else
#define SET_ALPHA_1(v) (_mm_movelh_ps(v, _mm_set_ps(0.f, 0.f, 1.f, 0.f)))
// n.b. "ANDNOT" is ~A & B, not A & ~B!!
#define SELECT_F(cond, a, b)                                                   \
    _mm_or_ps(_mm_and_ps(cond, a), _mm_andnot_ps(cond, b))
#define EXTRACT_I(v, i)                                                        \
    _mm_cvtsi128_si32(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, i)))
#ifdef __x86_64
#define EXTRACT_I64(v, i)                                                      \
    _mm_cvtsi128_si64(_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 2 * i + 1, 2 * i)))
#endif
#endif

static __m128 read_image_pixel_float(void *imageData,
                                     image_descriptor *imageInfo, __m128i coord,
                                     int lod)
{
    size_t width_lod = imageInfo->width, height_lod = imageInfo->height,
           depth_lod = imageInfo->depth;
    size_t slice_pitch_lod = 0, row_pitch_lod = 0;

    if (imageInfo->num_mip_levels > 1)
    {
        switch (imageInfo->type)
        {
            case CL_MEM_OBJECT_IMAGE3D:
                depth_lod =
                    (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1;
            case CL_MEM_OBJECT_IMAGE2D:
            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
                height_lod =
                    (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
            default:
                width_lod =
                    (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
        }
        row_pitch_lod = width_lod * get_pixel_size(imageInfo->format);
        if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
            slice_pitch_lod = row_pitch_lod;
        else if (imageInfo->type == CL_MEM_OBJECT_IMAGE3D
                 || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
            slice_pitch_lod = row_pitch_lod * height_lod;
    }
    else
    {
        row_pitch_lod = imageInfo->rowPitch;
        slice_pitch_lod = imageInfo->slicePitch;
    }

    __m128i extent = _mm_set_epi32(
        0,
        imageInfo->arraySize != 0 ? (int)imageInfo->arraySize : (int)depth_lod,
        (int)height_lod, (int)width_lod);
    __m128i zero = _mm_setzero_si128();
    __m128i minMask = _mm_cmplt_epi32(coord, zero);
    __m128i maxMask =
        _mm_cmpgt_epi32(coord, _mm_add_epi32(extent, _mm_setmone_si128()));
    __m128i extentMask = _mm_cmpeq_epi32(extent, zero);
    __m128i boundsMask =
        _mm_or_si128(_mm_andnot_si128(extentMask, maxMask), minMask);
    if (TEST_NONZERO(boundsMask))
        return has_alpha(imageInfo->format) ? _mm_setzero_ps()
                                            : _mm_set_ps(1.f, 0.f, 0.f, 0.f);

    const cl_image_format *format = imageInfo->format;

    __m128 tempData;
    // Predeclare a bunch of reciprocal constants so GCC doesn't use expensive
    // divisions to compute them in our code
    static const float recip_31 = 1.0f / 31.0f;
    static const float recip_63 = 1.0f / 63.0f;
    static const float recip_127 = 1.0f / 127.0f;
    static const float recip_255 = 1.0f / 255.0f;
    static const float recip_1023 = 1.0f / 1023.0f;
    static const float recip_32767 = 1.0f / 32767.0f;
    static const float recip_65535 = 1.0f / 65535.0f;

    // Advance to the right spot
    char *ptr = (char *)imageData;
    size_t pixelSize = get_pixel_size(format);
    __m128i pitch_lod = _mm_set_epi32(0, (int)slice_pitch_lod,
                                      (int)row_pitch_lod, (int)pixelSize);

    __m128i offsetA = _mm_mul_epu32(coord, pitch_lod);
    __m128i offsetB =
        _mm_mul_epu32(_mm_shuffle_epi32(coord, _MM_SHUFFLE(2, 3, 0, 1)),
                      _mm_shuffle_epi32(pitch_lod, _MM_SHUFFLE(2, 3, 0, 1)));
#ifdef __x86_64
    ptr += EXTRACT_I64(offsetB, 0) + EXTRACT_I64(offsetA, 1)
        + EXTRACT_I64(offsetA, 0);
#else
    // Using PHADDD doesn't gain us much...
    ptr +=
        EXTRACT_I(offsetB, 0) + EXTRACT_I(offsetA, 2) + EXTRACT_I(offsetA, 0);
#endif

    // OpenCL only supports reading floats from certain formats
    size_t channelCount = get_format_channel_count(format);
    switch (format->image_channel_data_type)
    {
        case CL_SNORM_INT8: {
            cl_char *dPtr = (cl_char *)ptr;
            __m128i pixel;
            switch (channelCount)
            {
                case 1:
                    tempData = SET_ALPHA_1(CLAMP_FLOAT(
                        _mm_mul_ss(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0]),
                                   _mm_set_ss(recip_127))));
                    break;
                case 2:
                    pixel = _mm_insert_epi16(_mm_loadu_si16(ptr), 0x7F00, 1);
                    break;
                case 3:
                    pixel = _mm_insert_epi16(_mm_loadu_si16(ptr),
                                             0x7F00 | dPtr[2], 1);
                    break;
                case 4: pixel = _mm_loadu_si32(ptr); break;
            }
            if (channelCount != 1)
            {
#ifdef __SSE4_1__
                tempData = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(pixel));
#else
                __m128i signMask;
                signMask = _mm_cmpgt_epi8(_mm_setzero_si128(), pixel);
                pixel = _mm_unpacklo_epi8(pixel, signMask);
                signMask = _mm_unpacklo_epi8(signMask, signMask);
                tempData = _mm_cvtepi32_ps(_mm_unpacklo_epi16(pixel, signMask));
#endif
                tempData =
                    CLAMP_FLOAT_V(_mm_mul_ps(tempData, _mm_set1_ps(recip_127)));
            }
            break;
        }

        case CL_UNORM_INT8: {
            unsigned char *dPtr = (unsigned char *)ptr;
            __m128i pixel;
            switch (channelCount)
            {
                case 1:
                    if ((is_sRGBA_order(
                            imageInfo->format->image_channel_order)))
                        tempData = SET_ALPHA_1(_mm_set_ss(sRGBunmap(dPtr[0])));
                    else
                        tempData = SET_ALPHA_1(_mm_mul_ss(
                            _mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0]),
                            _mm_set_ss(recip_255)));
                    break;
                case 2:
                    pixel = _mm_insert_epi16(_mm_loadu_si16(ptr), 0xFF00, 1);
                    break;
                case 3:
                    pixel = _mm_insert_epi16(_mm_loadu_si16(ptr),
                                             0xFF00 | dPtr[2], 1);
                    break;
                case 4: pixel = _mm_loadu_si32(ptr);
#ifdef CL_1RGB_APPLE
                    if (format->image_channel_order == CL_1RGB_APPLE)
#ifdef __SSE4_1__
                        pixel = _mm_insert_epi8(pixel, 0xFF, 0);
#else
                        pixel =
                            _mm_or_si128(pixel,
                                         _mm_bsrli_si128(_mm_setmone_si128(),
                                                         15) /* 0x000000FF */);
#endif
#endif
#ifdef CL_BGR1_APPLE
                    if (format->image_channel_order == CL_BGR1_APPLE)
#ifdef __SSE4_1__
                        pixel = _mm_insert_epi8(pixel, 0xFF, 3);
#else
                        pixel = _mm_or_si128(
                            pixel,
                            _mm_bslli_si128(
                                _mm_bsrli_si128(_mm_setmone_si128(), 15),
                                3) /* 0xFF000000 */);
#endif
#endif
                    break;
            }
            if (channelCount != 1)
            {
                if (is_sRGBA_order(imageInfo->format->image_channel_order))
                    tempData = sRGBunmap(pixel);
                else
                {
#ifdef __SSE4_1__
                    tempData = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(pixel));
#else
                    __m128i zero = _mm_setzero_si128();
                    tempData = _mm_cvtepi32_ps(_mm_unpacklo_epi16(
                        _mm_unpacklo_epi8(pixel, zero), zero));
#endif
                    tempData = _mm_mul_ps(tempData, _mm_set1_ps(recip_255));
                }
            }
            break;
        }

        case CL_SIGNED_INT8: {
            cl_char *dPtr = (cl_char *)ptr;
            __m128i pixel;
            switch (channelCount)
            {
                case 1:
                    tempData =
                        SET_ALPHA_1(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0]));
                    break;
                case 2:
                    pixel = _mm_insert_epi16(_mm_loadu_si16(ptr), 0x0100, 1);
                    break;
                case 3:
                    pixel = _mm_insert_epi16(_mm_loadu_si16(ptr),
                                             0x0100 | dPtr[2], 1);
                    break;
                case 4: pixel = _mm_loadu_si32(ptr); break;
            }
            if (channelCount != 1)
            {
#ifdef __SSE4_1__
                tempData = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(pixel));
#else
                __m128i signMask;
                signMask = _mm_cmpgt_epi8(_mm_setzero_si128(), pixel);
                pixel = _mm_unpacklo_epi8(pixel, signMask);
                signMask = _mm_unpacklo_epi8(signMask, signMask);
                tempData = _mm_cvtepi32_ps(_mm_unpacklo_epi16(pixel, signMask));
#endif
            }
            break;
        }

        case CL_UNSIGNED_INT8: {
            cl_uchar *dPtr = (cl_uchar *)ptr;
            __m128i pixel;
            switch (channelCount)
            {
                case 1:
                    tempData =
                        SET_ALPHA_1(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0]));
                    break;
                case 2:
                    pixel = _mm_insert_epi16(_mm_loadu_si16(ptr), 0x0100, 1);
                    break;
                case 3:
                    pixel = _mm_insert_epi16(_mm_loadu_si16(ptr),
                                             0x0100 | dPtr[2], 1);
                    break;
                case 4: pixel = _mm_loadu_si32(ptr); break;
            }
            if (channelCount != 1)
            {
#ifdef __SSE4_1__
                tempData = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(pixel));
#else
                __m128i zero = _mm_setzero_si128();
                tempData = _mm_cvtepi32_ps(
                    _mm_unpacklo_epi16(_mm_unpacklo_epi8(pixel, zero), zero));
#endif
            }
            break;
        }

        case CL_SNORM_INT16: {
            cl_short *dPtr = (cl_short *)ptr;
            __m128i pixel;
            switch (channelCount)
            {
                case 1:
                    tempData = SET_ALPHA_1(CLAMP_FLOAT(
                        _mm_mul_ss(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0]),
                                   _mm_set_ss(recip_32767))));
                    break;
                case 2:
                    pixel = _mm_insert_epi16(_mm_loadu_si32(ptr), 0x7FFF, 3);
                    break;
                case 3:
                    pixel = _mm_insert_epi16(
                        _mm_insert_epi16(_mm_loadu_si32(ptr), dPtr[2], 2),
                        0x7FFF, 3);
                    break;
                case 4: pixel = _mm_loadu_si64(ptr); break;
            }
            if (channelCount != 1)
            {
#ifdef __SSE4_1__
                tempData = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(pixel));
#else
                tempData = _mm_cvtepi32_ps(_mm_unpacklo_epi16(
                    pixel, _mm_cmpgt_epi16(_mm_setzero_si128(), pixel)));
#endif
                tempData = CLAMP_FLOAT_V(
                    _mm_mul_ps(tempData, _mm_set1_ps(recip_32767)));
            }
            break;
        }

        case CL_UNORM_INT16: {
            cl_ushort *dPtr = (cl_ushort *)ptr;
            __m128i pixel;
            switch (channelCount)
            {
                case 1:
                    tempData = SET_ALPHA_1(
                        _mm_mul_ss(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0]),
                                   _mm_set_ss(recip_65535)));
                    break;
                case 2:
                    pixel = _mm_insert_epi16(_mm_loadu_si32(ptr), 0xFFFF, 3);
                    break;
                case 3:
                    pixel = _mm_insert_epi16(
                        _mm_insert_epi16(_mm_loadu_si32(ptr), dPtr[2], 2),
                        0xFFFF, 3);
                    break;
                case 4: pixel = _mm_loadu_si64(ptr); break;
            }
            if (channelCount != 1)
            {
#ifdef __SSE4_1__
                tempData = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(pixel));
#else
                tempData = _mm_cvtepi32_ps(
                    _mm_unpacklo_epi16(pixel, _mm_setzero_si128()));
#endif
                tempData = _mm_mul_ps(tempData, _mm_set1_ps(recip_65535));
            }
            break;
        }

        case CL_SIGNED_INT16: {
            cl_short *dPtr = (cl_short *)ptr;
            __m128i pixel;
            switch (channelCount)
            {
                case 1:
                    tempData =
                        SET_ALPHA_1(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0]));
                    break;
                case 2:
                    pixel = _mm_insert_epi16(_mm_loadu_si32(ptr), 1, 3);
                    break;
                case 3:
                    pixel = _mm_insert_epi16(
                        _mm_insert_epi16(_mm_loadu_si32(ptr), dPtr[2], 2), 1,
                        3);
                    break;
                case 4: pixel = _mm_loadu_si64(ptr); break;
            }
            if (channelCount != 1)
#ifdef __SSE4_1__
                tempData = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(pixel));
#else
                tempData = _mm_cvtepi32_ps(_mm_unpacklo_epi16(
                    pixel, _mm_cmpgt_epi16(_mm_setzero_si128(), pixel)));
#endif
            break;
        }

        case CL_UNSIGNED_INT16: {
            cl_ushort *dPtr = (cl_ushort *)ptr;
            __m128i pixel;
            switch (channelCount)
            {
                case 1:
                    tempData =
                        SET_ALPHA_1(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0]));
                    break;
                case 2:
                    pixel = _mm_insert_epi16(_mm_loadu_si32(ptr), 1, 3);
                    break;
                case 3:
                    pixel = _mm_insert_epi16(
                        _mm_insert_epi16(_mm_loadu_si32(ptr), dPtr[2], 2), 1,
                        3);
                    break;
                case 4: pixel = _mm_loadu_si64(ptr); break;
            }
            if (channelCount != 1)
#ifdef __SSE4_1__
                tempData = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(pixel));
#else
                tempData = _mm_cvtepi32_ps(
                    _mm_unpacklo_epi16(pixel, _mm_setzero_si128()));
#endif
            break;
        }

        case CL_HALF_FLOAT: {
            cl_half *dPtr = (cl_half *)ptr;
            __m128i h;
            switch (channelCount)
            {
                case 1:
#ifdef __F16C__
                    tempData = SET_ALPHA_1(_mm_set_ss(_cvtsh_ss(dPtr[0])));
#else
                    tempData =
                        SET_ALPHA_1(_mm_set_ss(cl_half_to_float(dPtr[0])));
#endif
                    break;
                case 2:
                    h = _mm_insert_epi16(_mm_loadu_si32(ptr), 0x3C00, 3);
                    break;
                case 3:
                    h = _mm_insert_epi16(
                        _mm_insert_epi16(_mm_loadu_si32(ptr), dPtr[2], 2),
                        0x3C00, 3);
                    break;
                case 4: h = _mm_loadu_si64(ptr); break;
            }

            if (channelCount == 1) break;

            tempData = cl_half_to_float(h);
            break;
        }

        case CL_SIGNED_INT32: {
            cl_int *dPtr = (cl_int *)ptr;
            __m128i pixel;
            switch (channelCount)
            {
                case 1:
                    tempData =
                        SET_ALPHA_1(_mm_cvtsi32_ss(_mm_setzero_ps(), dPtr[0]));
                    break;
                case 2:
#ifdef __SSE4_1__
                    pixel = _mm_insert_epi32(_mm_loadu_si64(ptr), 1, 3);
#else
                    pixel = _mm_insert_epi16(_mm_loadu_si64(ptr), 1, 6);
#endif
                    break;
                case 3:
#ifdef __SSE4_1__
                    pixel = _mm_insert_epi32(
                        _mm_insert_epi32(_mm_loadu_si64(ptr), dPtr[2], 2), 1,
                        3);
#else
                    pixel = _mm_or_si128(_mm_loadu_si64(ptr),
                                         _mm_set_epi32(1, dPtr[2], 0, 0));
#endif
                    break;
                case 4: pixel = _mm_loadu_si128((__m128i_u *)ptr); break;
            }
            if (channelCount != 1) tempData = _mm_cvtepi32_ps(pixel);
            break;
        }

        case CL_UNSIGNED_INT32: {
            cl_uint *dPtr = (cl_uint *)ptr;
            __m128i pixel;
            switch (channelCount)
            {
                case 1:
#ifdef __x86_64
                    tempData =
                        SET_ALPHA_1(_mm_cvtsi64_ss(_mm_setzero_ps(), dPtr[0]));
#else
                    tempData = SET_ALPHA_1(_mm_set_ss((float)dPtr[0]));
#endif
                    break;
                case 2:
#ifdef __SSE4_1__
                    pixel = _mm_insert_epi32(_mm_loadu_si64(ptr), 1, 3);
#else
                    pixel = _mm_insert_epi16(_mm_loadu_si64(ptr), 1, 6);
#endif
                    break;
                case 3:
#ifdef __SSE4_1__
                    pixel = _mm_insert_epi32(
                        _mm_insert_epi32(_mm_loadu_si64(ptr), dPtr[2], 2), 1,
                        3);
#else
                    pixel = _mm_or_si128(_mm_loadu_si64(ptr),
                                         _mm_set_epi32(1, dPtr[2], 0, 0));
#endif
                    break;
                case 4: pixel = _mm_loadu_si128((__m128i_u *)ptr); break;
            }
            if (channelCount != 1)
            {
                // Unfortunately, no instruction for converting unsigned 32-bit
                // integers to float exists until AVX-512; nor is there an
                // instruction for converting packed 64-bit integers to float
                // until same
#ifdef __AVX512VL__
                tempData = _mm_cvtepu32_ps(pixel);
#elif defined(__SSE4_1__)
                // The following is based on the unoptimized output of GCC for
                // scalars
                __m128i negOne = _mm_setmone_si128();
                if (!_mm_testz_si128(
                        pixel, _mm_slli_epi32(negOne, 31) /* 0x80000000 */))
                {
                    __m128i one = _mm_srli_epi32(negOne, 31); // = 1;
                    __m128i reducedPixel = _mm_or_si128(
                        _mm_srli_epi32(pixel, 1), _mm_and_si128(pixel, one));
                    tempData = _mm_cvtepi32_ps(reducedPixel);
                    tempData = _mm_add_ps(tempData, tempData);
                    tempData = SELECT_F(_mm_castsi128_ps(pixel), tempData,
                                        _mm_cvtepi32_ps(pixel));
                }
                else
                    tempData = _mm_cvtepi32_ps(pixel);
#else
                // Testing contents of vectors is unwieldy without SSE4.1
                __m128i negOne = _mm_setmone_si128();
                __m128i one = _mm_srli_epi32(negOne, 31); // = 1;
                __m128i reducedPixel = _mm_or_si128(_mm_srli_epi32(pixel, 1),
                                                    _mm_and_si128(pixel, one));
                tempData = _mm_cvtepi32_ps(reducedPixel);
                tempData = _mm_add_ps(tempData, tempData);
                __m128 tempData2 = _mm_cvtepi32_ps(pixel);
                __m128 mask = _mm_cmpgt_ps(_mm_setzero_ps(), tempData2);
                tempData = SELECT_F(mask, tempData, tempData2);
#endif
            }
            break;
        }

        case CL_UNORM_SHORT_565: {
            cl_ushort *dPtr = (cl_ushort *)ptr;
#ifdef __AVX2__
            __m128i pixel = _mm_broadcastd_epi32(_mm_loadu_si16(dPtr));
            pixel = _mm_insert_epi32(
                _mm_srlv_epi32(pixel, _mm_set_epi32(0, 0, 5, 11)), 1, 3);
#else
            // Shifts may as well be scalar, since before AVX2 there are no
            // vector shift amounts
            __m128i pixel =
                _mm_set_epi32(1, dPtr[0], dPtr[0] >> 5, dPtr[0] >> 11);
#endif
            pixel = _mm_and_si128(pixel, _mm_set_epi32(1, 0x1f, 0x3f, 0x1f));
            tempData =
                _mm_mul_ps(_mm_cvtepi32_ps(pixel),
                           _mm_set_ps(1.0f, recip_31, recip_63, recip_31));
            break;
        }

        case CL_UNORM_SHORT_555: {
            cl_ushort *dPtr = (cl_ushort *)ptr;
#ifdef __AVX2__
            __m128i pixel = _mm_broadcastd_epi32(_mm_loadu_si16(dPtr));
            pixel = _mm_insert_epi32(
                _mm_srlv_epi32(pixel, _mm_set_epi32(0, 0, 5, 10)), 0x1F, 3);
#else
            __m128i pixel =
                _mm_set_epi32(0x1F, dPtr[0], dPtr[0] >> 5, dPtr[0] >> 10);
#endif
            pixel = _mm_and_si128(pixel, _mm_set1_epi16(0x1F));
            tempData =
                _mm_mul_ps(_mm_cvtepi32_ps(pixel), _mm_set1_ps(recip_31));
            break;
        }

        case CL_UNORM_INT_101010: {
            cl_uint *dPtr = (cl_uint *)ptr;
#ifdef __AVX2__
            __m128i pixel = _mm_broadcastd_epi32(_mm_loadu_si32(dPtr));
            pixel = _mm_insert_epi32(
                _mm_srlv_epi32(pixel, _mm_set_epi32(0, 0, 10, 20)), 0x3ff, 3);
#else
            __m128i pixel =
                _mm_set_epi32(0x3ff, dPtr[0], dPtr[0] >> 10, dPtr[0] >> 20);
#endif
            pixel = _mm_and_si128(pixel, _mm_set1_epi16(0x3ff));
            tempData =
                _mm_mul_ps(_mm_cvtepi32_ps(pixel), _mm_set1_ps(recip_1023));
            break;
        }

        case CL_FLOAT: {
            float *dPtr = (float *)ptr;
            switch (channelCount)
            {
                case 1: tempData = SET_ALPHA_1(_mm_load_ss(dPtr)); break;
                case 2:
                    tempData = _mm_loadl_pi(_mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f),
                                            (__m64 *)ptr);
                    break;
                case 3:
                    tempData = _mm_loadl_pi(
                        _mm_set_ps(1.0f, dPtr[2], 0.0f, 0.0f), (__m64 *)ptr);
                    break;
                case 4: tempData = _mm_loadu_ps(dPtr); break;
            }
            break;
        }
#ifdef CL_SFIXED14_APPLE
        case CL_SFIXED14_APPLE: {
            cl_ushort *dPtr = (cl_ushort *)ptr;
            __m128i pixel;
            switch (channelCount)
            {
                case 1:
                    tempData = SET_ALPHA_1(
                        _mm_mul_ss(_mm_cvtsi32_ss((int)dPtr[0] - 16384),
                                   _mm_set_ss(0x1.0p-14f)));
                    break;
                case 2:
                    pixel = _mm_insert_epi16(_mm_loadu_si32(ptr),
                                             (1 << 14) + 16384, 3);
                    break;
                case 3:
                    pixel = _mm_insert_epi16(
                        _mm_insert_epi16(_mm_loadu_si32(ptr), dPtr[2], 2),
                        (1 << 14) + 16384, 3);
                    break;
                case 4: pixel = _mm_loadu_si64(ptr); break;
            }
            if (channelCount != 1)
            {
#ifdef __SSE4_1__
                pixel = _mm_cvtepu16_epi32(pixel);
#else
                pixel = _mm_unpacklo_epi16(pixel, _mm_setzero_si128());
#endif
                tempData = _mm_mul_ps(_mm_cvtepi32_ps(_mm_add_epi32(pixel, _mm_slli_epi32(_mm_setmone_si128(), 14) /* -16384 */, _mm_set1_ps(0x1.0p-14f));
            }
            break;
        }
#endif
    }

    switch (format->image_channel_order)
    {
        case CL_R:
        case CL_Rx:
        case CL_RG:
        case CL_RGx:
        case CL_RGB:
        case CL_RGBx:
        case CL_sRGB:
        case CL_sRGBx:
        case CL_RGBA:
        case CL_sRGBA:
        case CL_DEPTH:
            /* Already correct */
            return tempData;
        case CL_A:
            return _mm_shuffle_ps(tempData, tempData, _MM_SHUFFLE(0, 1, 1, 1));
        case CL_RA:
            return _mm_shuffle_ps(tempData, tempData, _MM_SHUFFLE(1, 2, 2, 0));
        case CL_ARGB:
#ifdef CL_1RGB_APPLE
        case CL_1RGB_APPLE:
#endif
            return _mm_shuffle_ps(tempData, tempData, _MM_SHUFFLE(0, 3, 2, 1));
        case CL_ABGR:
            return _mm_shuffle_ps(tempData, tempData, _MM_SHUFFLE(0, 1, 2, 3));
        case CL_BGRA:
        case CL_sBGRA:
#ifdef CL_BGR1_APPLE
        case CL_BGR1_APPLE:
#endif
            return _mm_shuffle_ps(tempData, tempData, _MM_SHUFFLE(3, 0, 1, 2));
        case CL_INTENSITY:
            return _mm_shuffle_ps(tempData, tempData, _MM_SHUFFLE(0, 0, 0, 0));
        case CL_LUMINANCE:
            return _mm_shuffle_ps(tempData, tempData, _MM_SHUFFLE(3, 0, 0, 0));
        default:
            log_error("Invalid format:");
            print_header(format, true);
            break;
    }
    return tempData;
}

void read_image_pixel_float(void *imageData, image_descriptor *imageInfo, int x,
                            int y, int z, float *outData, int lod)
{
    _mm_storeu_ps(outData,
                  read_image_pixel_float(imageData, imageInfo,
                                         _mm_set_epi32(0, z, y, x), lod));
}

#else

#define CLAMP_FLOAT(v) (fmaxf(fminf(v, 1.f), -1.f))

void read_image_pixel_float(void *imageData, image_descriptor *imageInfo, int x,
                            int y, int z, float *outData, int lod)
{
    size_t width_lod = imageInfo->width, height_lod = imageInfo->height,
           depth_lod = imageInfo->depth;
    size_t slice_pitch_lod = 0, row_pitch_lod = 0;

    if (imageInfo->num_mip_levels > 1)
    {
        switch (imageInfo->type)
        {
            case CL_MEM_OBJECT_IMAGE3D:
                depth_lod =
                    (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1;
            case CL_MEM_OBJECT_IMAGE2D:
            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
                height_lod =
                    (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
            default:
                width_lod =
                    (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
        }
        row_pitch_lod = width_lod * get_pixel_size(imageInfo->format);
        if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
            slice_pitch_lod = row_pitch_lod;
        else if (imageInfo->type == CL_MEM_OBJECT_IMAGE3D
                 || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
            slice_pitch_lod = row_pitch_lod * height_lod;
    }
    else
    {
        row_pitch_lod = imageInfo->rowPitch;
        slice_pitch_lod = imageInfo->slicePitch;
    }
    if (x < 0 || y < 0 || z < 0 || x >= (int)width_lod
        || (height_lod != 0 && y >= (int)height_lod)
        || (depth_lod != 0 && z >= (int)depth_lod)
        || (imageInfo->arraySize != 0 && z >= (int)imageInfo->arraySize))
    {
        outData[0] = outData[1] = outData[2] = outData[3] = 0;
        if (!has_alpha(imageInfo->format)) outData[3] = 1;
        return;
    }

    const cl_image_format *format = imageInfo->format;

    unsigned int i;
    float tempData[4];

    // Advance to the right spot
    char *ptr = (char *)imageData;
    size_t pixelSize = get_pixel_size(format);

    ptr += z * slice_pitch_lod + y * row_pitch_lod + x * pixelSize;

    // OpenCL only supports reading floats from certain formats
    size_t channelCount = get_format_channel_count(format);
    switch (format->image_channel_data_type)
    {
        case CL_SNORM_INT8: {
            cl_char *dPtr = (cl_char *)ptr;
            for (i = 0; i < channelCount; i++)
                tempData[i] = CLAMP_FLOAT((float)dPtr[i] / 127.0f);
            break;
        }

        case CL_UNORM_INT8: {
            unsigned char *dPtr = (unsigned char *)ptr;
            for (i = 0; i < channelCount; i++)
            {
                if ((is_sRGBA_order(imageInfo->format->image_channel_order))
                    && i < 3) // only RGB need to be converted for sRGBA
                    tempData[i] = sRGBunmap(dPtr[i]);
                else
                    tempData[i] = (float)dPtr[i] / 255.0f;
            }
            break;
        }

        case CL_SIGNED_INT8: {
            cl_char *dPtr = (cl_char *)ptr;
            for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i];
            break;
        }

        case CL_UNSIGNED_INT8: {
            cl_uchar *dPtr = (cl_uchar *)ptr;
            for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i];
            break;
        }

        case CL_SNORM_INT16: {
            cl_short *dPtr = (cl_short *)ptr;
            for (i = 0; i < channelCount; i++)
                tempData[i] = CLAMP_FLOAT((float)dPtr[i] / 32767.0f);
            break;
        }

        case CL_UNORM_INT16: {
            cl_ushort *dPtr = (cl_ushort *)ptr;
            for (i = 0; i < channelCount; i++)
                tempData[i] = (float)dPtr[i] / 65535.0f;
            break;
        }

        case CL_SIGNED_INT16: {
            cl_short *dPtr = (cl_short *)ptr;
            for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i];
            break;
        }

        case CL_UNSIGNED_INT16: {
            cl_ushort *dPtr = (cl_ushort *)ptr;
            for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i];
            break;
        }

        case CL_HALF_FLOAT: {
            cl_half *dPtr = (cl_half *)ptr;
            for (i = 0; i < channelCount; i++)
                tempData[i] = cl_half_to_float(dPtr[i]);
            break;
        }

        case CL_SIGNED_INT32: {
            cl_int *dPtr = (cl_int *)ptr;
            for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i];
            break;
        }

        case CL_UNSIGNED_INT32: {
            cl_uint *dPtr = (cl_uint *)ptr;
            for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i];
            break;
        }

        case CL_UNORM_SHORT_565: {
            cl_ushort *dPtr = (cl_ushort *)ptr;
            tempData[0] = (float)(dPtr[0] >> 11) / (float)31;
            tempData[1] = (float)((dPtr[0] >> 5) & 63) / (float)63;
            tempData[2] = (float)(dPtr[0] & 31) / (float)31;
            break;
        }

        case CL_UNORM_SHORT_555: {
            cl_ushort *dPtr = (cl_ushort *)ptr;
            tempData[0] = (float)((dPtr[0] >> 10) & 31) / (float)31;
            tempData[1] = (float)((dPtr[0] >> 5) & 31) / (float)31;
            tempData[2] = (float)(dPtr[0] & 31) / (float)31;
            break;
        }

        case CL_UNORM_INT_101010: {
            cl_uint *dPtr = (cl_uint *)ptr;
            tempData[0] = (float)((dPtr[0] >> 20) & 0x3ff) / (float)1023;
            tempData[1] = (float)((dPtr[0] >> 10) & 0x3ff) / (float)1023;
            tempData[2] = (float)(dPtr[0] & 0x3ff) / (float)1023;
            break;
        }

        case CL_FLOAT: {
            float *dPtr = (float *)ptr;
            for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i];
            break;
        }
#ifdef CL_SFIXED14_APPLE
        case CL_SFIXED14_APPLE: {
            cl_ushort *dPtr = (cl_ushort *)ptr;
            for (i = 0; i < channelCount; i++)
                tempData[i] = ((int)dPtr[i] - 16384) * 0x1.0p-14f;
            break;
        }
#endif
    }


    outData[0] = outData[1] = outData[2] = 0;
    outData[3] = 1;

    switch (format->image_channel_order)
    {
        case CL_A: outData[3] = tempData[0]; break;
        case CL_R:
        case CL_Rx: outData[0] = tempData[0]; break;
        case CL_RA:
            outData[0] = tempData[0];
            outData[3] = tempData[1];
            break;
        case CL_RG:
        case CL_RGx:
            outData[0] = tempData[0];
            outData[1] = tempData[1];
            break;
        case CL_RGB:
        case CL_RGBx:
        case CL_sRGB:
        case CL_sRGBx:
            outData[0] = tempData[0];
            outData[1] = tempData[1];
            outData[2] = tempData[2];
            break;
        case CL_RGBA:
            outData[0] = tempData[0];
            outData[1] = tempData[1];
            outData[2] = tempData[2];
            outData[3] = tempData[3];
            break;
        case CL_ARGB:
            outData[0] = tempData[1];
            outData[1] = tempData[2];
            outData[2] = tempData[3];
            outData[3] = tempData[0];
            break;
        case CL_ABGR:
            outData[0] = tempData[3];
            outData[1] = tempData[2];
            outData[2] = tempData[1];
            outData[3] = tempData[0];
            break;
        case CL_BGRA:
        case CL_sBGRA:
            outData[0] = tempData[2];
            outData[1] = tempData[1];
            outData[2] = tempData[0];
            outData[3] = tempData[3];
            break;
        case CL_INTENSITY:
            outData[0] = tempData[0];
            outData[1] = tempData[0];
            outData[2] = tempData[0];
            outData[3] = tempData[0];
            break;
        case CL_LUMINANCE:
            outData[0] = tempData[0];
            outData[1] = tempData[0];
            outData[2] = tempData[0];
            break;
#ifdef CL_1RGB_APPLE
        case CL_1RGB_APPLE:
            outData[0] = tempData[1];
            outData[1] = tempData[2];
            outData[2] = tempData[3];
            outData[3] = 1.0f;
            break;
#endif
#ifdef CL_BGR1_APPLE
        case CL_BGR1_APPLE:
            outData[0] = tempData[2];
            outData[1] = tempData[1];
            outData[2] = tempData[0];
            outData[3] = 1.0f;
            break;
#endif
        case CL_sRGBA:
            outData[0] = tempData[0];
            outData[1] = tempData[1];
            outData[2] = tempData[2];
            outData[3] = tempData[3];
            break;
        case CL_DEPTH: outData[0] = tempData[0]; break;
        default:
            log_error("Invalid format:");
            print_header(format, true);
            break;
    }
}
#endif

void read_image_pixel_float(void *imageData, image_descriptor *imageInfo, int x,
                            int y, int z, float *outData)
{
    read_image_pixel_float(imageData, imageInfo, x, y, z, outData, 0);
}

bool get_integer_coords(float x, float y, float z, size_t width, size_t height,
                        size_t depth, image_sampler_data *imageSampler,
                        image_descriptor *imageInfo, int &outX, int &outY,
                        int &outZ)
{
    return get_integer_coords_offset(x, y, z, 0.0f, 0.0f, 0.0f, width, height,
                                     depth, imageSampler, imageInfo, outX, outY,
                                     outZ);
}

bool get_integer_coords_offset(float x, float y, float z, float xAddressOffset,
                               float yAddressOffset, float zAddressOffset,
                               size_t width, size_t height, size_t depth,
                               image_sampler_data *imageSampler,
                               image_descriptor *imageInfo, int &outX,
                               int &outY, int &outZ)
{
    AddressFn adFn = sAddressingTable[imageSampler];

#ifdef __SSE2__
    __m128 coord = _mm_set_ps(0.f, z, y, x);
    __m128 addressOffset =
        _mm_set_ps(0.f, zAddressOffset, yAddressOffset, xAddressOffset);
    __m128i extent = _mm_set_epi32(0, depth, height, width);
    __m128 extentf = _mm_cvtepi32_ps(extent);
    __m128i ref = vifloorf(coord);

    __m128 arrayMask;
    switch (imageInfo->type)
    {
        case CL_MEM_OBJECT_IMAGE1D:
        case CL_MEM_OBJECT_IMAGE1D_BUFFER:
        case CL_MEM_OBJECT_IMAGE1D_ARRAY:
            arrayMask = _mm_castsi128_ps(
                _mm_bsrli_si128(_mm_setmone_si128(), 12)); // = 0, 0, 0, -1
            break;
        case CL_MEM_OBJECT_IMAGE2D:
        case CL_MEM_OBJECT_IMAGE2D_ARRAY:
            arrayMask = _mm_castsi128_ps(
                _mm_bsrli_si128(_mm_setmone_si128(), 8)); // = 0, 0, -1, -1
            break;
        default:
            arrayMask = _mm_castsi128_ps(
                _mm_bsrli_si128(_mm_setmone_si128(), 4)); // = 0, -1, -1, -1
    }
    __m128 locMask =
        _mm_andnot_ps(_mm_cmpeq_ps(extentf, _mm_setzero_ps()), arrayMask);
    __m128 offsetMask =
        _mm_andnot_ps(_mm_cmpeq_ps(addressOffset, _mm_setzero_ps()), arrayMask);

    // Handle sampler-directed coordinate normalization + clamping.  Note that
    // the array coordinate for image array types is expected to be
    // unnormalized, and is clamped to 0..arraySize-1.
    if (imageSampler->normalized_coords)
    {
        __m128 minMask, maxMask, temp;
        switch (imageSampler->addressing_mode)
        {
            case CL_ADDRESS_REPEAT:
                coord = SELECT_F(
                    locMask, RepeatNormalizedAddressFn(coord, extent), coord);

                // Add in the offset
                coord = _mm_add_ps(coord, addressOffset);
                // Handle wrapping
                minMask = _mm_andnot_ps(_mm_cmplt_ps(coord, _mm_setzero_ps()),
                                        offsetMask);
                maxMask =
                    _mm_andnot_ps(_mm_cmpgt_ps(coord, extentf), offsetMask);
                coord = SELECT_F(
                    minMask, _mm_add_ps(coord, extentf),
                    SELECT_F(maxMask, _mm_sub_ps(coord, extentf), coord));
                break;

            case CL_ADDRESS_MIRRORED_REPEAT:
                coord = SELECT_F(
                    locMask, MirroredRepeatNormalizedAddressFn(coord, extent),
                    coord);
                temp = _mm_add_ps(coord, addressOffset);
                maxMask = _mm_cmpgt_ps(temp, extentf);
                coord = SELECT_F(
                    offsetMask,
                    vfabsf(SELECT_F(
                        maxMask, _mm_sub_ps(extentf, _mm_sub_ps(temp, extentf)),
                        temp)),
                    coord);
                break;

            default:
                // Also, remultiply to the original coords. This simulates any
                // truncation in the pass to OpenCL
#ifdef __FMA4__
                coord =
                    SELECT_F(arrayMask,
                             _mm_macc_ps(coord, extentf, addressOffset), coord);
#elif defined(__FMA__)
                coord = SELECT_F(arrayMask,
                                 _mm_fmadd_ps(coord, extentf, addressOffset),
                                 coord);
#else
                coord = SELECT_F(
                    arrayMask,
                    _mm_add_ps(_mm_mul_ps(coord, extentf), addressOffset),
                    coord);
#endif
                break;
        }
    }

    // At this point, we're dealing with non-normalized coordinates.

    __m128i out =
        SELECT_I(_mm_castps_si128(locMask), adFn(vifloorf(coord), extent),
                 _mm_cvtps_epi32(coord));
    outX = _mm_cvtsi128_si32(out);

    // 1D and 2D arrays require special care for the index coordinate:

    switch (imageInfo->type)
    {
        case CL_MEM_OBJECT_IMAGE1D_ARRAY:
            outY = static_cast<int>(
                calculate_array_index(y, (float)imageInfo->arraySize - 1.0f));
            outZ = 0; /* don't care! */
            break;
        case CL_MEM_OBJECT_IMAGE2D_ARRAY:
            outY = EXTRACT_I(out, 1);
            outZ = static_cast<int>(
                calculate_array_index(z, (float)imageInfo->arraySize - 1.0f));
            break;
        default:
            // legacy path:
            outY = EXTRACT_I(out, 1);
            outZ = EXTRACT_I(out, 2);
    }

    out = _mm_set_epi32(0, outZ, outY, outX);
    __m128i refEqual = _mm_cmpeq_epi32(ref, out);
    return TEST_ANY_ZERO(refEqual);
#else
    float refX = floorf(x), refY = floorf(y), refZ = floorf(z);

    // Handle sampler-directed coordinate normalization + clamping.  Note that
    // the array coordinate for image array types is expected to be
    // unnormalized, and is clamped to 0..arraySize-1.
    if (imageSampler->normalized_coords)
    {
        switch (imageSampler->addressing_mode)
        {
            case CL_ADDRESS_REPEAT:
                x = RepeatNormalizedAddressFn(x, width);
                if (height != 0)
                {
                    if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
                        y = RepeatNormalizedAddressFn(y, height);
                }
                if (depth != 0)
                {
                    if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
                        z = RepeatNormalizedAddressFn(z, depth);
                }

                if (xAddressOffset != 0.0)
                {
                    // Add in the offset
                    x += xAddressOffset;
                    // Handle wrapping
                    if (x > width) x -= (float)width;
                    if (x < 0) x += (float)width;
                }
                if ((yAddressOffset != 0.0)
                    && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY))
                {
                    // Add in the offset
                    y += yAddressOffset;
                    // Handle wrapping
                    if (y > height) y -= (float)height;
                    if (y < 0) y += (float)height;
                }
                if ((zAddressOffset != 0.0)
                    && (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY))
                {
                    // Add in the offset
                    z += zAddressOffset;
                    // Handle wrapping
                    if (z > depth) z -= (float)depth;
                    if (z < 0) z += (float)depth;
                }
                break;

            case CL_ADDRESS_MIRRORED_REPEAT:
                x = MirroredRepeatNormalizedAddressFn(x, width);
                if (height != 0)
                {
                    if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
                        y = MirroredRepeatNormalizedAddressFn(y, height);
                }
                if (depth != 0)
                {
                    if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
                        z = MirroredRepeatNormalizedAddressFn(z, depth);
                }

                if (xAddressOffset != 0.0)
                {
                    float temp = x + xAddressOffset;
                    if (temp > (float)width)
                        temp = (float)width - (temp - (float)width);
                    x = fabsf(temp);
                }
                if ((yAddressOffset != 0.0)
                    && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY))
                {
                    float temp = y + yAddressOffset;
                    if (temp > (float)height)
                        temp = (float)height - (temp - (float)height);
                    y = fabsf(temp);
                }
                if ((zAddressOffset != 0.0)
                    && (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY))
                {
                    float temp = z + zAddressOffset;
                    if (temp > (float)depth)
                        temp = (float)depth - (temp - (float)depth);
                    z = fabsf(temp);
                }
                break;

            default:
                // Also, remultiply to the original coords. This simulates any
                // truncation in the pass to OpenCL
                x *= (float)width;
                x += xAddressOffset;

                if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
                {
                    y *= (float)height;
                    y += yAddressOffset;
                }

                if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
                {
                    z *= (float)depth;
                    z += zAddressOffset;
                }
                break;
        }
    }

    // At this point, we're dealing with non-normalized coordinates.

    outX = adFn(static_cast<int>(floorf(x)), width);

    // 1D and 2D arrays require special care for the index coordinate:

    switch (imageInfo->type)
    {
        case CL_MEM_OBJECT_IMAGE1D_ARRAY:
            outY = static_cast<int>(
                calculate_array_index(y, (float)imageInfo->arraySize - 1.0f));
            outZ = 0; /* don't care! */
            break;
        case CL_MEM_OBJECT_IMAGE2D_ARRAY:
            outY = adFn(static_cast<int>(floorf(y)), height);
            outZ = static_cast<int>(
                calculate_array_index(z, (float)imageInfo->arraySize - 1.0f));
            break;
        default:
            // legacy path:
            if (height != 0) outY = adFn(static_cast<int>(floorf(y)), height);
            if (depth != 0) outZ = adFn(static_cast<int>(floorf(z)), depth);
    }

    return !((int)refX == outX && (int)refY == outY && (int)refZ == outZ);
#endif
}

#ifdef __SSE2__
static inline __m128 pixelMax(__m128 a, __m128 b)
{
    // n.b. Operands must be reversed, because if either one is NaN, the second
    // operand is returned
    return _mm_max_ps(vfabsf(b), vfabsf(a));
}

static inline int IsFloatSubnormal(__m128 x)
{
    // No fpclass until AVX-512 (what took them so long?!!)
    union {
        __m128 f;
        __m128i u;
    } u;
    u.f = vfabsf(x);
    __m128i negOne = _mm_setmone_si128();
    return TEST_NONZERO(
        _mm_cmplt_epi32(_mm_add_epi32(u.u, negOne),
                        _mm_srli_epi32(negOne, 9) /* 0x007fffff */));
}

// If containsDenorms is NULL, flush denorms to zero
// if containsDenorms is not NULL, record whether there are any denorms
static inline __m128 check_for_denorms(__m128 a, int *containsDenorms)
{
    if (NULL != containsDenorms && IsFloatSubnormal(a)) *containsDenorms = 1;
    return a;
}
#else
static inline void pixelMax(const float a[4], const float b[4], float *results);
static inline void pixelMax(const float a[4], const float b[4], float *results)
{
    for (int i = 0; i < 4; i++) results[i] = errMax(fabsf(a[i]), fabsf(b[i]));
}

// If containsDenorms is NULL, flush denorms to zero
// if containsDenorms is not NULL, record whether there are any denorms
static inline void check_for_denorms(float a[4], int *containsDenorms);
static inline void check_for_denorms(float a[4], int *containsDenorms)
{
    if (NULL == containsDenorms)
    {
        for (int i = 0; i < 4; i++)
        {
            if (IsFloatSubnormal(a[i])) a[i] = copysignf(0.0f, a[i]);
        }
    }
    else
    {
        for (int i = 0; i < 4; i++)
        {
            if (IsFloatSubnormal(a[i]))
            {
                *containsDenorms = 1;
                break;
            }
        }
    }
}
#endif

inline float calculate_array_index(float coord, float extent)
{
    // from Section 8.4 of the 1.2 Spec 'Selecting an Image from an Image Array'
    //
    // given coordinate 'w' that represents an index:
    // layer_index = clamp( rint(w), 0, image_array_size - 1)

    float ret = rintf(coord);
    ret = ret > extent ? extent : ret;
    ret = ret < 0.0f ? 0.0f : ret;

    return ret;
}

#ifdef __SSE2__
#define EXTRACT_F(v, i)                                                        \
    _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 2, 1, i)))
#endif

/*
 * Utility function to unnormalized a coordinate given a particular sampler.
 *
 * name     - the name of the coordinate, used for verbose debugging only
 * coord    - the coordinate requiring unnormalization
 * offset   - an addressing offset to be added to the coordinate
 * extent   - the max value for this coordinate (e.g. width for x)
 */
#ifdef __SSE2__

#ifdef __AVX__
#define TEST_NONZERO_F(v) !_mm_testz_ps(v, v)
#else
#define TEST_NONZERO_F(v) TEST_NONZERO(_mm_castps_si128(v))
#endif

static __m128 unnormalize_coordinate(const char *name, __m128 coord,
                                     __m128 offset, __m128 extent,
                                     cl_addressing_mode addressing_mode,
                                     int verbose)
{
    __m128 zero = _mm_setzero_ps();
    __m128 ret = zero;
    __m128 offsetMask = _mm_cmpneq_ps(offset, zero);
    __m128 minMask, maxMask, temp;

    switch (addressing_mode)
    {
        case CL_ADDRESS_REPEAT:
            ret = RepeatNormalizedAddressFn(coord, _mm_cvtps_epi32(extent));

            if (verbose)
            {
                log_info("\tRepeat filter denormalizes %s (%f, %f, %f) to %f, "
                         "%f, %f\n",
                         name, EXTRACT_F(coord, 0), EXTRACT_F(coord, 1),
                         EXTRACT_F(coord, 2), EXTRACT_F(ret, 0),
                         EXTRACT_F(ret, 1), EXTRACT_F(ret, 2));
            }

            // Add in the offset, and handle wrapping.
            ret = _mm_add_ps(ret, offset);
            maxMask = _mm_and_ps(offsetMask, _mm_cmpgt_ps(ret, extent));
            minMask = _mm_and_ps(offsetMask, _mm_cmplt_ps(ret, zero));
            ret = SELECT_F(minMask, _mm_add_ps(ret, extent),
                           SELECT_F(maxMask, _mm_sub_ps(ret, extent), ret));

            if (verbose && TEST_NONZERO_F(offsetMask))
            {
                log_info(
                    "\tAddress offset of %f, %f, %f added to get %f, %f, %f\n",
                    EXTRACT_F(offset, 0), EXTRACT_F(offset, 1),
                    EXTRACT_F(offset, 2), EXTRACT_F(ret, 0), EXTRACT_F(ret, 1),
                    EXTRACT_F(ret, 2));
            }
            break;

        case CL_ADDRESS_MIRRORED_REPEAT:
            ret = MirroredRepeatNormalizedAddressFn(coord,
                                                    _mm_cvtps_epi32(extent));

            if (verbose)
            {
                log_info("\tMirrored repeat filter denormalizes %s (%f, %f, "
                         "%f) to %f, %f, %f\n",
                         name, EXTRACT_F(coord, 0), EXTRACT_F(coord, 1),
                         EXTRACT_F(coord, 2), EXTRACT_F(ret, 0),
                         EXTRACT_F(ret, 1), EXTRACT_F(ret, 2));
            }

            temp = _mm_add_ps(ret, offset);
            maxMask = _mm_cmpgt_ps(temp, extent);
            ret = SELECT_F(
                offsetMask,
                vfabsf(SELECT_F(maxMask,
                                _mm_sub_ps(extent, _mm_sub_ps(temp, extent)),
                                temp)),
                ret);

            if (verbose && TEST_NONZERO_F(offsetMask))
            {
                log_info(
                    "\tAddress offset of %f, %f, %f added to get %f, %f, %f\n",
                    EXTRACT_F(offset, 0), EXTRACT_F(offset, 1),
                    EXTRACT_F(offset, 2), EXTRACT_F(ret, 0), EXTRACT_F(ret, 1),
                    EXTRACT_F(ret, 2));
            }
            break;

        default:

            if (verbose)
            {
                ret = _mm_mul_ps(coord, extent);
                log_info("\tFilter denormalizes %s to %f, %f, %f (<%f, %f, %f> "
                         "* <%f, %f, %f>)\n",
                         name, EXTRACT_F(ret, 0), EXTRACT_F(ret, 1),
                         EXTRACT_F(ret, 2), EXTRACT_F(coord, 0),
                         EXTRACT_F(coord, 1), EXTRACT_F(coord, 2),
                         EXTRACT_F(extent, 0), EXTRACT_F(extent, 1),
                         EXTRACT_F(extent, 2));
                if (TEST_NONZERO_F(offsetMask))
                {
                    ret = _mm_add_ps(ret, offset);
                    log_info("\tAddress offset of %f, %f, %f added to get %f, "
                             "%f, %f\n",
                             EXTRACT_F(offset, 0), EXTRACT_F(offset, 1),
                             EXTRACT_F(offset, 2), EXTRACT_F(ret, 0),
                             EXTRACT_F(ret, 1), EXTRACT_F(ret, 2));
                }
            }
            else
            {
#ifdef __FMA4__
                ret = _mm_macc_ps(coord, extent, offset);
#elif defined(__FMA__)
                ret = _mm_fmadd_ps(coord, extent, offset);
#else
                ret = _mm_add_ps(_mm_mul_ps(coord, extent), offset);
#endif
            }
    }

    return ret;
}
#else
static float unnormalize_coordinate(const char *name, float coord, float offset,
                                    float extent,
                                    cl_addressing_mode addressing_mode,
                                    int verbose)
{
    float ret = 0.0f;

    switch (addressing_mode)
    {
        case CL_ADDRESS_REPEAT:
            ret = RepeatNormalizedAddressFn(coord, static_cast<size_t>(extent));

            if (verbose)
            {
                log_info("\tRepeat filter denormalizes %s (%f) to %f\n", name,
                         coord, ret);
            }

            if (offset != 0.0)
            {
                // Add in the offset, and handle wrapping.
                ret += offset;
                if (ret > extent) ret -= extent;
                if (ret < 0.0) ret += extent;
            }

            if (verbose && offset != 0.0f)
            {
                log_info("\tAddress offset of %f added to get %f\n", offset,
                         ret);
            }
            break;

        case CL_ADDRESS_MIRRORED_REPEAT:
            ret = MirroredRepeatNormalizedAddressFn(
                coord, static_cast<size_t>(extent));

            if (verbose)
            {
                log_info(
                    "\tMirrored repeat filter denormalizes %s (%f) to %f\n",
                    name, coord, ret);
            }

            if (offset != 0.0)
            {
                float temp = ret + offset;
                if (temp > extent) temp = extent - (temp - extent);
                ret = fabsf(temp);
            }

            if (verbose && offset != 0.0f)
            {
                log_info("\tAddress offset of %f added to get %f\n", offset,
                         ret);
            }
            break;

        default:

            ret = coord * extent;

            if (verbose)
            {
                log_info("\tFilter denormalizes %s to %f (%f * %f)\n", name,
                         ret, coord, extent);
            }

            ret += offset;

            if (verbose && offset != 0.0f)
            {
                log_info("\tAddress offset of %f added to get %f\n", offset,
                         ret);
            }
    }

    return ret;
}
#endif

FloatPixel
sample_image_pixel_float(void *imageData, image_descriptor *imageInfo, float x,
                         float y, float z, image_sampler_data *imageSampler,
                         float *outData, int verbose, int *containsDenorms)
{
    return sample_image_pixel_float_offset(imageData, imageInfo, x, y, z, 0.0f,
                                           0.0f, 0.0f, imageSampler, outData,
                                           verbose, containsDenorms);
}

// returns max pixel value of the pixels touched
FloatPixel sample_image_pixel_float(void *imageData,
                                    image_descriptor *imageInfo, float x,
                                    float y, float z,
                                    image_sampler_data *imageSampler,
                                    float *outData, int verbose,
                                    int *containsDenorms, int lod)
{
    return sample_image_pixel_float_offset(imageData, imageInfo, x, y, z, 0.0f,
                                           0.0f, 0.0f, imageSampler, outData,
                                           verbose, containsDenorms, lod);
}
#ifdef __SSE2__
FloatPixel sample_image_pixel_float_offset(
    void *imageData, image_descriptor *imageInfo, float x, float y, float z,
    float xAddressOffset, float yAddressOffset, float zAddressOffset,
    image_sampler_data *imageSampler, float *outData, int verbose,
    int *containsDenorms, int lod)
{
    AddressFn adFn = sAddressingTable[imageSampler];
    FloatPixel returnVal;
    size_t width_lod = imageInfo->width, height_lod = imageInfo->height,
           depth_lod = imageInfo->depth;
    size_t slice_pitch_lod = 0, row_pitch_lod = 0;

    if (imageInfo->num_mip_levels > 1)
    {
        switch (imageInfo->type)
        {
            case CL_MEM_OBJECT_IMAGE3D:
                depth_lod =
                    (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1;
            case CL_MEM_OBJECT_IMAGE2D:
            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
                height_lod =
                    (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
            default:
                width_lod =
                    (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
        }
        row_pitch_lod = width_lod * get_pixel_size(imageInfo->format);
        if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
            slice_pitch_lod = row_pitch_lod;
        else if (imageInfo->type == CL_MEM_OBJECT_IMAGE3D
                 || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
            slice_pitch_lod = row_pitch_lod * height_lod;
    }
    else
    {
        slice_pitch_lod = imageInfo->slicePitch;
        row_pitch_lod = imageInfo->rowPitch;
    }

    if (containsDenorms) *containsDenorms = 0;

    __m128 coord, addressOffset, extentf;
    __m128i extent, addressMask;
    switch (imageInfo->type)
    {

            // The image array types require special care:

        case CL_MEM_OBJECT_IMAGE1D:
        case CL_MEM_OBJECT_IMAGE1D_BUFFER:
        case CL_MEM_OBJECT_IMAGE1D_ARRAY:
            coord = _mm_set_ss(x);
            addressOffset = _mm_set_ss(xAddressOffset);
            extent = _mm_set_epi32(0, 1, 1, width_lod);
            addressMask =
                _mm_bsrli_si128(_mm_setmone_si128(), 12); // = 0, 0, 0, -1
            break;

        case CL_MEM_OBJECT_IMAGE2D:
        case CL_MEM_OBJECT_IMAGE2D_ARRAY:
            coord = _mm_set_ps(0.f, 0.f, y, x);
            addressOffset =
                _mm_set_ps(0.f, 0.f, yAddressOffset, xAddressOffset);
            extent = _mm_set_epi32(0, 1, height_lod, width_lod);
            addressMask =
                _mm_bsrli_si128(_mm_setmone_si128(), 8); // = 0, 0, -1, -1
            break;

            // Everybody else:

        default:
            coord = _mm_set_ps(0.f, z, y, x);
            addressOffset =
                _mm_set_ps(0.f, zAddressOffset, yAddressOffset, xAddressOffset);
            extent = _mm_set_epi32(0, depth_lod, height_lod, width_lod);
            addressMask =
                _mm_bsrli_si128(_mm_setmone_si128(), 4); // = 0, -1, -1, -1
    }
    extentf = _mm_cvtepi32_ps(extent);

    if (imageSampler->normalized_coords)
    {
        // We need to unnormalize our coordinates differently depending on
        // the image type, but 'x' is always processed the same way.

        const char *name = NULL;
        switch (imageInfo->type)
        {

            case CL_MEM_OBJECT_IMAGE1D:
            case CL_MEM_OBJECT_IMAGE1D_BUFFER:
            case CL_MEM_OBJECT_IMAGE1D_ARRAY: name = "x"; break;

            case CL_MEM_OBJECT_IMAGE2D:
            case CL_MEM_OBJECT_IMAGE2D_ARRAY: name = "x, y"; break;

            default: name = "x, y, z";
        }

        coord = unnormalize_coordinate(name, coord, addressOffset, extentf,
                                       imageSampler->addressing_mode, verbose);
    }
    else if (verbose)
    {

        switch (imageInfo->type)
        {
            case CL_MEM_OBJECT_IMAGE1D_ARRAY:
                log_info("Starting coordinate: %f, array index %f\n", x, y);
                break;
            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
                log_info("Starting coordinate: %f, %f, array index %f\n", x, y,
                         z);
                break;
            case CL_MEM_OBJECT_IMAGE1D:
            case CL_MEM_OBJECT_IMAGE1D_BUFFER:
                log_info("Starting coordinate: %f\b", x);
                break;
            case CL_MEM_OBJECT_IMAGE2D:
                log_info("Starting coordinate: %f, %f\n", x, y);
                break;
            case CL_MEM_OBJECT_IMAGE3D:
            default: log_info("Starting coordinate: %f, %f, %f\n", x, y, z);
        }
    }

    // At this point, we have unnormalized coordinates.

    if (imageSampler->filter_mode == CL_FILTER_NEAREST)
    {
        __m128i icoord;
        int arrayIndex;

        // We apply the addressing function to the now-unnormalized
        // coordinates.  Note that the array cases again require special
        // care, per section 8.4 in the OpenCL 1.2 Specification.

        icoord = _mm_and_si128(addressMask, adFn(vifloorf(coord), extent));

        switch (imageInfo->type)
        {
            case CL_MEM_OBJECT_IMAGE1D_ARRAY:
                arrayIndex = static_cast<int>(calculate_array_index(
                    y, (float)(imageInfo->arraySize - 1)));
                if (verbose)
                    log_info("\tArray index %f evaluates to %d\n", y,
                             arrayIndex);
#ifdef __SSE4_1__
                icoord = _mm_insert_epi32(icoord, arrayIndex, 1);
#else
                icoord = _mm_insert_epi16(
                    _mm_insert_epi16(icoord, (short)arrayIndex, 2),
                    (short)(arrayIndex >> 16), 3);
#endif
                break;
            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
                arrayIndex = static_cast<int>(calculate_array_index(
                    z, (float)(imageInfo->arraySize - 1)));
                if (verbose)
                    log_info("\tArray index %f evaluates to %d\n", z,
                             arrayIndex);
#ifdef __SSE4_1__
                icoord = _mm_insert_epi32(icoord, arrayIndex, 2);
#else
                icoord = _mm_insert_epi16(
                    _mm_insert_epi16(icoord, (short)arrayIndex, 4),
                    (short)(arrayIndex >> 16), 5);
#endif
                break;
            default: break;
        }

        if (verbose)
        {
            if (depth_lod)
                log_info(
                    "\tReference integer coords calculated: { %d, %d, %d }\n",
                    EXTRACT_I(icoord, 0), EXTRACT_I(icoord, 1),
                    EXTRACT_I(icoord, 2));
            else
                log_info("\tReference integer coords calculated: { %d, %d }\n",
                         EXTRACT_I(icoord, 0), EXTRACT_I(icoord, 1));
        }

        // SSE has an FTZ mode that will be useful here
        unsigned int mxcsr = 0;
        if (NULL == containsDenorms)
        {
            mxcsr = _mm_getcsr();
            _mm_setcsr(mxcsr & ~_MM_FLUSH_ZERO_MASK | _MM_FLUSH_ZERO_ON);
        }
        __m128 outPixel = check_for_denorms(
            read_image_pixel_float(imageData, imageInfo, icoord, lod),
            containsDenorms);
        if (NULL == containsDenorms) _mm_setcsr(mxcsr);
        _mm_storeu_ps(outData, outPixel);
        _mm_storeu_ps(returnVal.p, vfabsf(outPixel));
        return returnVal;
    }
    else
    {
        // Linear filtering cases.

        // Image arrays can use 2D filtering, but require us to walk into the
        // image a certain number of slices before reading.

        if (depth_lod == 0 || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY
            || imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
        {
            float array_index = 0;

            size_t layer_offset = 0;

            if (imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
            {
                array_index =
                    calculate_array_index(z, (float)(imageInfo->arraySize - 1));
                layer_offset = slice_pitch_lod * (size_t)array_index;
            }
            else if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
            {
                array_index =
                    calculate_array_index(y, (float)(imageInfo->arraySize - 1));
                layer_offset = slice_pitch_lod * (size_t)array_index;
            }

            __m128i icoord =
                vifloorf(_mm_sub_ps(coord, _mm_set_ps(0.f, 0.f, 0.5f, 0.5f)));
            __m128i coord00 = _mm_and_si128(addressMask, adFn(icoord, extent));
            __m128i coord11 = _mm_and_si128(
                addressMask,
                adFn(_mm_sub_epi32(icoord, _mm_setmone_si128()), extent));

            if (verbose)
            {
                log_info("\tActual integer coords used (i = floor(x-.5)): i0:{ "
                         "%d, %d } and i1:{ %d, %d }\n",
                         EXTRACT_I(coord00, 0), EXTRACT_I(coord00, 1),
                         EXTRACT_I(coord11, 0), EXTRACT_I(coord11, 1));
                log_info("\tArray coordinate is %f\n", array_index);
            }

            // Walk to beginning of the 'correct' slice, if needed.
            char *imgPtr = ((char *)imageData) + layer_offset;

            // flush subnormal results to zero if necessary
            // SSE has an FTZ mode that will be useful here
            unsigned int mxcsr;
            if (NULL == containsDenorms)
            {
                mxcsr = _mm_getcsr();
                _mm_setcsr(mxcsr & ~_MM_FLUSH_ZERO_MASK | _MM_FLUSH_ZERO_ON);
            }

            // Make coordinate vectors for pixels 01 and 10
#ifdef __AVX2__
            __m128i coord01 = _mm_blend_epi32(coord00, coord11, 0x01);
            __m128i coord10 = _mm_blend_epi32(coord00, coord11, 0x02);
#elif defined(__SSE4_1__)
            __m128i coord01 = _mm_blend_epi16(coord00, coord11, 0x03);
            __m128i coord10 = _mm_blend_epi16(coord00, coord11, 0x0C);
#else
            __m128i coordMask =
                _mm_bsrli_si128(_mm_setmone_si128(), 8); // = 0, 0, -1, -1
            __m128i interleavedCoord =
                _mm_unpacklo_epi32(coord00, coord11); // y2 y1 x2 x1
            __m128i coord01 = _mm_and_si128(
                coordMask,
                _mm_shuffle_epi32(interleavedCoord, _MM_SHUFFLE(3, 0, 2, 1)));
            __m128i coord10 = _mm_and_si128(
                coordMask,
                _mm_shuffle_epi32(interleavedCoord, _MM_SHUFFLE(2, 1, 3, 0)));
#endif

            __m128 upLeft, upRight, lowLeft, lowRight;
            __m128 maxUp, maxLow;
            upLeft = check_for_denorms(
                read_image_pixel_float(imgPtr, imageInfo, coord00, lod),
                containsDenorms);
            upRight = check_for_denorms(
                read_image_pixel_float(imgPtr, imageInfo, coord01, lod),
                containsDenorms);
            maxUp = pixelMax(upLeft, upRight);
            lowLeft = check_for_denorms(
                read_image_pixel_float(imgPtr, imageInfo, coord10, lod),
                containsDenorms);
            lowRight = check_for_denorms(
                read_image_pixel_float(imgPtr, imageInfo, coord11, lod),
                containsDenorms);
            maxLow = pixelMax(lowLeft, lowRight);
            _mm_storeu_ps(returnVal.p, pixelMax(maxUp, maxLow));

            if (verbose)
            {
                if (NULL == containsDenorms)
                    log_info("\tSampled pixels (rgba order, denorms flushed to "
                             "zero):\n");
                else
                    log_info("\tSampled pixels (rgba order):\n");
                log_info("\t\tp00: %f, %f, %f, %f\n", EXTRACT_F(upLeft, 0),
                         EXTRACT_F(upLeft, 1), EXTRACT_F(upLeft, 2),
                         EXTRACT_F(upLeft, 3));
                log_info("\t\tp01: %f, %f, %f, %f\n", EXTRACT_F(upRight, 0),
                         EXTRACT_F(upRight, 1), EXTRACT_F(upRight, 2),
                         EXTRACT_F(upRight, 3));
                log_info("\t\tp10: %f, %f, %f, %f\n", EXTRACT_F(lowLeft, 0),
                         EXTRACT_F(lowLeft, 1), EXTRACT_F(lowLeft, 2),
                         EXTRACT_F(lowLeft, 3));
                log_info("\t\tp11: %f, %f, %f, %f\n", EXTRACT_F(lowRight, 0),
                         EXTRACT_F(lowRight, 1), EXTRACT_F(lowRight, 2),
                         EXTRACT_F(lowRight, 3));
            }

            __m128 fracCoord = frac(_mm_sub_ps(coord, _mm_set1_ps(0.5f)));
            if (verbose)
                log_info("\tfrac( x - 0.5f ) = %f,  frac( y - 0.5f ) = %f\n",
                         EXTRACT_F(fracCoord, 0), EXTRACT_F(fracCoord, 1));

#ifdef __AVX__
            __m256d alphaBeta = _mm256_cvtps_pd(fracCoord); // x x b a
            alphaBeta = _mm256_insertf128_pd(
                alphaBeta,
                _mm_sub_pd(_mm_set1_pd(1.0), _mm256_castpd256_pd128(alphaBeta)),
                1); // 1-b 1-a b a
            //   1-b       1-a       b         a
            //   1-a        b        a        1-b (2 1 0 3)
            //(1-a)(1-b) (1-a)b     a*b      a(1-b)
            //    00        10      11         01
#ifdef __AVX2__
            __m256d weights = _mm256_mul_pd(
                alphaBeta,
                _mm256_permute4x64_pd(alphaBeta, _MM_SHUFFLE(2, 1, 0, 3)));
            __m256d weight01 =
                _mm256_broadcastsd_pd(_mm256_castpd256_pd128(weights));
            __m256d weight11 =
                _mm256_permute4x64_pd(weights, _MM_SHUFFLE(1, 1, 1, 1));
            __m256d weight10 =
                _mm256_permute4x64_pd(weights, _MM_SHUFFLE(2, 2, 2, 2));
            __m256d weight00 =
                _mm256_permute4x64_pd(weights, _MM_SHUFFLE(3, 3, 3, 3));
#else
            // This is now more complicated...
            // Swap the two halves of the vector (1 0 3 2):
            //  0x01 = 0b00000001
            //             ~~  ~~
            //       +-----+    |
            //   +---*----------+
            //   |   +
            //   v   v
            //  +-+ +-+
            //  3 2 1 0
            // ...then shuffle the elements as follows:
            //  0x05 = 0b00000101
            //    +----------+|||
            //    | +---------+||
            //    | |     +----+|
            //    | |     | +---+
            //    v v     v v
            //  3 2 1 0 1 0 3 2
            __m256d weights = _mm256_mul_pd(
                alphaBeta,
                _mm256_shuffle_pd(
                    alphaBeta,
                    _mm256_permute2f128_pd(alphaBeta, alphaBeta, 0x01), 0x05));
            // Duplicate the even and odd elements...
            __m256d weight1001 = _mm256_movedup_pd(weights);
            __m256d weight0011 = _mm256_permute_pd(weights, 0x0F);
            // ...then duplicate the low and high halves of the results
            __m256d weight01 =
                _mm256_permute2f128_pd(_mm256_undefined_pd(), weight1001, 0x22);
            __m256d weight11 =
                _mm256_permute2f128_pd(_mm256_undefined_pd(), weight0011, 0x22);
            __m256d weight10 =
                _mm256_permute2f128_pd(_mm256_undefined_pd(), weight1001, 0x33);
            __m256d weight00 =
                _mm256_permute2f128_pd(_mm256_undefined_pd(), weight0011, 0x33);
#endif
#ifdef __FMA4__
            // Doing it this way instead of using a chain of FMAs avoids stalls
            _mm_storeu_ps(
                outData,
                _mm256_cvtpd_ps(_mm256_add_pd(
                    _mm256_macc_pd(
                        _mm256_cvtps_pd(upLeft), weight00,
                        _mm256_mul_pd(_mm256_cvtps_pd(upRight), weight01)),
                    _mm256_macc_pd(
                        _mm256_cvtps_pd(lowLeft), weight10,
                        _mm256_mul_pd(_mm256_cvtps_pd(lowRight), weight11)))));
#elif defined(__FMA__)
            _mm_storeu_ps(
                outData,
                _mm256_cvtpd_ps(_mm256_add_pd(
                    _mm256_fmadd_pd(
                        _mm256_cvtps_pd(upLeft), weight00,
                        _mm256_mul_pd(_mm256_cvtps_pd(upRight), weight01)),
                    _mm256_fmadd_pd(
                        _mm256_cvtps_pd(lowLeft), weight10,
                        _mm256_mul_pd(_mm256_cvtps_pd(lowRight), weight11)))));
#else
            // No VDPPD for 256-bit vectors... :/
            _mm_storeu_ps(
                outData,
                _mm256_cvtpd_ps(_mm256_add_pd(
                    _mm256_add_pd(
                        _mm256_mul_pd(_mm256_cvtps_pd(upLeft), weight00),
                        _mm256_mul_pd(_mm256_cvtps_pd(upRight), weight01)),
                    _mm256_add_pd(
                        _mm256_mul_pd(_mm256_cvtps_pd(lowLeft), weight10),
                        _mm256_mul_pd(_mm256_cvtps_pd(lowRight), weight11)))));
#endif
#else
            __m128d alphaBeta = _mm_cvtps_pd(fracCoord); // b a
            __m128d invAlphaBeta =
                _mm_sub_pd(_mm_set1_pd(1.0), alphaBeta); // 1-b 1-a
            __m128d weights[2];

            // <1-a b> * <1-b 1-a> = <(1-a)(1-b) (1-a)b> <00 10>
            weights[0] = _mm_mul_pd(
                _mm_shuffle_pd(invAlphaBeta, alphaBeta, _MM_SHUFFLE2(1, 0)),
                invAlphaBeta);
            // <b a> * <a 1-b> = <a*b a(1-b)> <11 01>
            weights[1] = _mm_mul_pd(
                alphaBeta,
                _mm_shuffle_pd(invAlphaBeta, alphaBeta, _MM_SHUFFLE2(0, 1)));

            __m128d upLeftL = _mm_cvtps_pd(upLeft);
            __m128d upLeftH =
                _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), upLeft));
            __m128d upRightL = _mm_cvtps_pd(upRight);
            __m128d upRightH =
                _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), upRight));
            __m128d lowLeftL = _mm_cvtps_pd(lowLeft);
            __m128d lowLeftH =
                _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), lowLeft));
            __m128d lowRightL = _mm_cvtps_pd(lowRight);
            __m128d lowRightH =
                _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), lowRight));
#ifdef __SSE4_1__
            // In the immediate bytes, the high nibble determines which
            // multiplies take place--in this case, both of them--and the low
            // nibble determines which lines receive the sum--bit 0 for the low
            // lane, bit 1 for the high.
            __m128d rg = _mm_or_pd(
                _mm_dp_pd(weights[0], _mm_unpacklo_pd(lowLeftL, upLeftL), 0x31),
                _mm_dp_pd(weights[0], _mm_unpackhi_pd(lowLeftL, upLeftL),
                          0x32));
            __m128d ba = _mm_or_pd(
                _mm_dp_pd(weights[0], _mm_unpacklo_pd(lowLeftH, upLeftH), 0x31),
                _mm_dp_pd(weights[0], _mm_unpackhi_pd(lowLeftH, upLeftH),
                          0x32));
            rg = _mm_add_pd(
                rg,
                _mm_or_pd(_mm_dp_pd(weights[1],
                                    _mm_unpacklo_pd(upRightL, lowRightL), 0x31),
                          _mm_dp_pd(weights[1],
                                    _mm_unpackhi_pd(upRightL, lowRightL),
                                    0x32)));
            ba = _mm_add_pd(
                ba,
                _mm_or_pd(_mm_dp_pd(weights[1],
                                    _mm_unpacklo_pd(upRightH, lowRightH), 0x31),
                          _mm_dp_pd(weights[1],
                                    _mm_unpackhi_pd(upRightH, lowRightH),
                                    0x32)));
#else
#ifdef __SSE3__
            __m128d weight10 = _mm_movedup_pd(weights[0]);
            __m128d weight01 = _mm_movedup_pd(weights[1]);
#else
            __m128d weight10 = _mm_unpacklo_pd(weights[0], weights[0]);
            __m128d weight01 = _mm_unpacklo_pd(weights[1], weights[1]);
#endif
            __m128d rg = _mm_add_pd(_mm_mul_pd(weight01, upRightL),
                                    _mm_mul_pd(weight10, lowLeftL));
            __m128d ba = _mm_add_pd(_mm_mul_pd(weight01, upRightH),
                                    _mm_mul_pd(weight10, lowLeftH));
            __m128d weight00 = _mm_unpackhi_pd(weights[0], weights[0]);
            __m128d weight11 = _mm_unpackhi_pd(weights[1], weights[1]);
            rg = _mm_add_pd(rg,
                            _mm_add_pd(_mm_mul_pd(weight00, upLeftL),
                                       _mm_mul_pd(weight11, lowRightL)));
            ba = _mm_add_pd(ba,
                            _mm_add_pd(_mm_mul_pd(weight00, upLeftH),
                                       _mm_mul_pd(weight11, lowRightH)));
#endif
            _mm_storeu_ps(outData,
                          _mm_movelh_ps(_mm_cvtpd_ps(rg), _mm_cvtpd_ps(ba)));
#endif
            if (NULL == containsDenorms) _mm_setcsr(mxcsr);
        }
        else
        {
            // 3D linear filtering
            __m128i icoord =
                vifloorf(_mm_sub_ps(coord, _mm_set_ps(0.f, 0.5f, 0.5f, 0.5f)));
            __m128i coord000 = _mm_and_si128(addressMask, adFn(icoord, extent));
            __m128i coord111 = _mm_and_si128(
                addressMask,
                adFn(_mm_sub_epi32(icoord, _mm_setmone_si128()), extent));

            if (verbose)
                log_info("\tActual integer coords used (i = floor(x-.5)): "
                         "i0:{%d, %d, %d} and i1:{%d, %d, %d}\n",
                         EXTRACT_I(coord000, 0), EXTRACT_I(coord000, 1),
                         EXTRACT_I(coord000, 2), EXTRACT_I(coord111, 0),
                         EXTRACT_I(coord111, 1), EXTRACT_I(coord111, 2));

            // flush subnormal results to zero if necessary
            // SSE has an FTZ mode that will be useful here
            unsigned int mxcsr;
            if (NULL == containsDenorms)
            {
                mxcsr = _mm_getcsr();
                _mm_setcsr(mxcsr & ~_MM_FLUSH_ZERO_MASK | _MM_FLUSH_ZERO_ON);
            }

#ifdef __AVX2__
            __m128i coord001 = _mm_blend_epi32(coord000, coord111, 0x01);
            __m128i coord010 = _mm_blend_epi32(coord000, coord111, 0x02);
            __m128i coord011 = _mm_blend_epi32(coord000, coord111, 0x03);
            __m128i coord100 = _mm_blend_epi32(coord000, coord111, 0x04);
            __m128i coord101 = _mm_blend_epi32(coord000, coord111, 0x05);
            __m128i coord110 = _mm_blend_epi32(coord000, coord111, 0x06);
#elif defined(__SSE4_1__)
            __m128i coord001 = _mm_blend_epi16(coord000, coord111, 0x03);
            __m128i coord010 = _mm_blend_epi16(coord000, coord111, 0x0C);
            __m128i coord011 = _mm_blend_epi16(coord000, coord111, 0x0F);
            __m128i coord100 = _mm_blend_epi16(coord000, coord111, 0x30);
            __m128i coord101 = _mm_blend_epi16(coord000, coord111, 0x33);
            __m128i coord110 = _mm_blend_epi16(coord000, coord111, 0x3C);
#else
            // XXX This is horrible without PBLEND...
            __m128i negOne = _mm_setmone_si128();
            __m128i coordMask = _mm_bsrli_si128(negOne, 8); // = 0, 0, -1, -1
            __m128i coord011 = SELECT_I(coordMask, coord000, coord111);
            coordMask = _mm_bsrli_si128(coordMask, 4); // = 0, 0, 0, -1
            __m128i coord001 = SELECT_I(coordMask, coord000, coord111);
            coordMask = _mm_slli_epi64(coordMask, 32); // = 0, 0, -1, 0
            __m128i coord010 = SELECT_I(coordMask, coord000, coord111);
            coordMask = _mm_srli_epi64(negOne, 32); // = 0, -1, 0, -1
            __m128i coord101 = SELECT_I(coordMask, coord000, coord111);
            coordMask = _mm_bslli_si128(coordMask, 8); // = 0, -1, 0, 0
            __m128i coord100 = SELECT_I(coordMask, coord000, coord111);
            coordMask = _mm_bslli_si128(_mm_bsrli_si128(negOne, 4),
                                        4); // = 0, -1, -1, 0
            __m128i coord110 = SELECT_I(coordMask, coord000, coord111);
#endif

            __m128 upLeftA, upRightA, lowLeftA, lowRightA;
            __m128 upLeftB, upRightB, lowLeftB, lowRightB;
            __m128 pixelMaxA, pixelMaxB, pixelMaxC;
            upLeftA = check_for_denorms(
                read_image_pixel_float(imageData, imageInfo, coord000, lod),
                containsDenorms);
            upRightA = check_for_denorms(
                read_image_pixel_float(imageData, imageInfo, coord001, lod),
                containsDenorms);
            pixelMaxA = pixelMax(upLeftA, upRightA);
            lowLeftA = check_for_denorms(
                read_image_pixel_float(imageData, imageInfo, coord010, lod),
                containsDenorms);
            lowRightA = check_for_denorms(
                read_image_pixel_float(imageData, imageInfo, coord011, lod),
                containsDenorms);
            pixelMaxB = pixelMax(lowLeftA, lowRightA);
            pixelMaxC = pixelMax(pixelMaxA, pixelMaxB);
            upLeftB = check_for_denorms(
                read_image_pixel_float(imageData, imageInfo, coord100, lod),
                containsDenorms);
            upRightB = check_for_denorms(
                read_image_pixel_float(imageData, imageInfo, coord101, lod),
                containsDenorms);
            pixelMaxA = pixelMax(upLeftB, upRightB);
            lowLeftB = check_for_denorms(
                read_image_pixel_float(imageData, imageInfo, coord110, lod),
                containsDenorms);
            lowRightB = check_for_denorms(
                read_image_pixel_float(imageData, imageInfo, coord111, lod),
                containsDenorms);
            pixelMaxB = pixelMax(lowLeftB, lowRightB);
            pixelMaxA = pixelMax(pixelMaxA, pixelMaxB);
            _mm_storeu_ps(returnVal.p, pixelMax(pixelMaxA, pixelMaxC));

            if (verbose)
            {
                if (NULL == containsDenorms)
                    log_info("\tSampled pixels (rgba order, denorms flushed to "
                             "zero):\n");
                else
                    log_info("\tSampled pixels (rgba order):\n");
                log_info("\t\tp000: %f, %f, %f, %f\n", EXTRACT_F(upLeftA, 0),
                         EXTRACT_F(upLeftA, 1), EXTRACT_F(upLeftA, 2),
                         EXTRACT_F(upLeftA, 3));
                log_info("\t\tp001: %f, %f, %f, %f\n", EXTRACT_F(upRightA, 0),
                         EXTRACT_F(upRightA, 1), EXTRACT_F(upRightA, 2),
                         EXTRACT_F(upRightA, 3));
                log_info("\t\tp010: %f, %f, %f, %f\n", EXTRACT_F(lowLeftA, 0),
                         EXTRACT_F(lowLeftA, 1), EXTRACT_F(lowLeftA, 2),
                         EXTRACT_F(lowLeftA, 3));
                log_info("\t\tp011: %f, %f, %f, %f\n\n",
                         EXTRACT_F(lowRightA, 0), EXTRACT_F(lowRightA, 1),
                         EXTRACT_F(lowRightA, 2), EXTRACT_F(lowRightA, 3));
                log_info("\t\tp100: %f, %f, %f, %f\n", EXTRACT_F(upLeftB, 0),
                         EXTRACT_F(upLeftB, 1), EXTRACT_F(upLeftB, 2),
                         EXTRACT_F(upLeftB, 3));
                log_info("\t\tp101: %f, %f, %f, %f\n", EXTRACT_F(upRightB, 0),
                         EXTRACT_F(upRightB, 1), EXTRACT_F(upRightB, 2),
                         EXTRACT_F(upRightB, 3));
                log_info("\t\tp110: %f, %f, %f, %f\n", EXTRACT_F(lowLeftB, 0),
                         EXTRACT_F(lowLeftB, 1), EXTRACT_F(lowLeftB, 2),
                         EXTRACT_F(lowLeftB, 3));
                log_info("\t\tp111: %f, %f, %f, %f\n", EXTRACT_F(lowRightB, 0),
                         EXTRACT_F(lowRightB, 1), EXTRACT_F(lowRightB, 2),
                         EXTRACT_F(lowRightB, 3));
            }

            __m128 fracCoord =
                frac(_mm_sub_ps(coord, _mm_set_ps(0.f, 0.5f, 0.5f, 0.5f)));
            if (verbose)
                log_info("\tfrac( x - 0.5f ) = %f,  frac( y - 0.5f ) = %f, "
                         "frac( z - 0.5f ) = %f\n",
                         EXTRACT_F(fracCoord, 0), EXTRACT_F(fracCoord, 1),
                         EXTRACT_F(fracCoord, 2));

#ifdef __AVX__
            __m256d alphaBetaGamma = _mm256_cvtps_pd(fracCoord); // x g b a
            __m256d invABG = _mm256_sub_pd(_mm256_set1_pd(1.0),
                                           alphaBetaGamma); // x 1-g 1-b 1-a
            __m256d alphaBeta =
                _mm256_permute2f128_pd(alphaBetaGamma, invABG, 0x20);
            __m256d weights[2][2][2];
            //     1-g          1-g        1-g         1-g
            //     1-b          1-a         b           a
            //     1-a           b          a          1-b
            // a(1-b)(1-g)  (1-a)b(1-g)  a*b(1-g)  a(1-b)(1-g)
            //     000          010        011         001
            //      g           g       g       g
            //     1-b         1-a      b       a
            //     1-a          b       a      1-b
            // (1-a)(1-b)g  (1-a)b*g  a*b*g  a(1-b)g
            //     100         110     111     101
#ifdef __AVX2__
            __m256d invGamma =
                _mm256_permute4x64_pd(invABG, _MM_SHUFFLE(2, 2, 2, 2));
            weights[1][0][0] = _mm256_mul_pd(
                alphaBeta,
                _mm256_permute4x64_pd(alphaBeta, _MM_SHUFFLE(2, 1, 0, 3)));
            weights[0][0][0] = _mm256_mul_pd(weights[1][0][0], invGamma);
            __m256d gamma =
                _mm256_permute4x64_pd(alphaBetaGamma, _MM_SHUFFLE(2, 2, 2, 2));
            weights[1][0][0] = _mm256_mul_pd(weights[1][0][0], gamma);
            weights[0][0][1] =
                _mm256_broadcastsd_pd(_mm256_castpd256_pd128(weights[0][0][0]));
            weights[0][1][1] = _mm256_permute4x64_pd(weights[0][0][0],
                                                     _MM_SHUFFLE(1, 1, 1, 1));
            weights[1][0][1] =
                _mm256_broadcastsd_pd(_mm256_castpd256_pd128(weights[1][0][0]));
            weights[1][1][1] = _mm256_permute4x64_pd(weights[1][0][0],
                                                     _MM_SHUFFLE(1, 1, 1, 1));
            weights[0][1][0] = _mm256_permute4x64_pd(weights[0][0][0],
                                                     _MM_SHUFFLE(2, 2, 2, 2));
            weights[0][0][0] = _mm256_permute4x64_pd(weights[0][0][0],
                                                     _MM_SHUFFLE(3, 3, 3, 3));
            weights[1][1][0] = _mm256_permute4x64_pd(weights[1][0][0],
                                                     _MM_SHUFFLE(2, 2, 2, 2));
            weights[1][0][0] = _mm256_permute4x64_pd(weights[1][0][0],
                                                     _MM_SHUFFLE(3, 3, 3, 3));
#else
            // Much like before, we must permute the elements of alphaBeta to
            // get them in the form we need
            __m256d invGamma = _mm256_permute2f128(
                _mm256_undefined_pd(), _mm256_movedup_pd(invABG), 0x33);
            weights[1][0][0] = _mm_mul_pd(
                alphaBeta,
                _mm256_shuffle_pd(
                    alphaBeta,
                    _mm256_permute2f128_pd(alphaBeta, alphaBeta, 0x01), 0x05));
            weights[0][0][0] = _mm_mul_pd(weights[1], invGamma);
            __m256d gamma = _mm256_permute2f128(
                _mm256_undefined_pd(), _mm256_movedup_pd(alphaBetaGamma), 0x33);
            weights[1][0][0] = _mm_mul_pd(weights[1], gamma);
            weights[0][1][0] = _mm256_movedup_pd(weights[0][0][0]);
            weights[0][0][0] = _mm256_permute_pd(weights[0][0][0], 0x0F);
            weights[1][1][0] = _mm256_movedup_pd(weights[1][0][0]);
            weights[1][0][0] = _mm256_permute_pd(weights[1][0][0], 0x0F);
            weights[0][0][1] = _mm256_permute2f128_pd(_mm256_undefined_pd(),
                                                      weights[0][1][0], 0x22);
            weights[0][1][1] = _mm256_permute2f128_pd(_mm256_undefined_pd(),
                                                      weights[0][0][0], 0x22);
            weights[1][0][1] = _mm256_permute2f128_pd(_mm256_undefined_pd(),
                                                      weights[1][1][0], 0x22);
            weights[1][1][1] = _mm256_permute2f128_pd(_mm256_undefined_pd(),
                                                      weights[1][0][0], 0x22);
            weights[0][1][0] = _mm256_permute2f128_pd(_mm256_undefined_pd(),
                                                      weights[0][1][0], 0x33);
            weights[0][0][0] = _mm256_permute2f128_pd(_mm256_undefined_pd(),
                                                      weights[0][0][0], 0x33);
            weights[1][1][0] = _mm256_permute2f128_pd(_mm256_undefined_pd(),
                                                      weights[1][1][0], 0x33);
            weights[1][0][0] = _mm256_permute2f128_pd(_mm256_undefined_pd(),
                                                      weights[1][0][0], 0x33);
#endif
#ifdef __FMA4__
            _mm_storeu_ps(
                outData,
                _mm256_cvtpd_ps(_mm256_add_pd(
                    _mm256_add_pd(
                        _mm256_macc_pd(_mm256_cvtps_pd(upLeftA),
                                       weights[0][0][0],
                                       _mm256_mul_pd(_mm256_cvtps_pd(upRightA),
                                                     weights[0][0][1])),
                        _mm256_macc_pd(_mm256_cvtps_pd(lowLeftA),
                                       weights[0][1][0],
                                       _mm256_mul_pd(_mm256_cvtps_pd(lowRightA),
                                                     weights[0][1][1]))),
                    _mm256_add_pd(
                        _mm256_macc_pd(_mm256_cvtps_pd(upLeftB),
                                       weights[1][0][0],
                                       _mm256_mul_pd(_mm256_cvtps_pd(upRightB),
                                                     weights[1][0][1])),
                        _mm256_macc_pd(_mm256_cvtps_pd(lowLeftB),
                                       weights[1][1][0],
                                       _mm256_mul_pd(_mm256_cvtps_pd(lowRightB),
                                                     weights[1][1][1]))))));
#elif defined(__FMA__)
            _mm_storeu_ps(
                outData,
                _mm256_cvtpd_ps(_mm256_add_pd(
                    _mm256_add_pd(
                        _mm256_fmadd_pd(_mm256_cvtps_pd(upLeftA),
                                        weights[0][0][0],
                                        _mm256_mul_pd(_mm256_cvtps_pd(upRightA),
                                                      weights[0][0][1])),
                        _mm256_fmadd_pd(
                            _mm256_cvtps_pd(lowLeftA), weights[0][1][0],
                            _mm256_mul_pd(_mm256_cvtps_pd(lowRightA),
                                          weights[0][1][1]))),
                    _mm256_add_pd(
                        _mm256_fmadd_pd(_mm256_cvtps_pd(upLeftB),
                                        weights[1][0][0],
                                        _mm256_mul_pd(_mm256_cvtps_pd(upRightB),
                                                      weights[1][0][1])),
                        _mm256_fmadd_pd(
                            _mm256_cvtps_pd(lowLeftB), weights[1][1][0],
                            _mm256_mul_pd(_mm256_cvtps_pd(lowRightB),
                                          weights[1][1][1]))))));
#else
            _mm_storeu_ps(
                outData,
                _mm256_cvtpd_ps(_mm256_add_pd(
                    _mm256_add_pd(
                        _mm256_add_pd(_mm256_mul_pd(_mm256_cvtps_pd(upLeftA),
                                                    weights[0][0][0]),
                                      _mm256_mul_pd(_mm256_cvtps_pd(upRightA),
                                                    weights[0][0][1])),
                        _mm256_add_pd(_mm256_mul_pd(_mm256_cvtps_pd(lowLeftA),
                                                    weights[0][1][0]),
                                      _mm256_mul_pd(_mm256_cvtps_pd(lowRightA),
                                                    weights[0][1][1]))),
                    _mm256_add_pd(
                        _mm256_add_pd(_mm256_mul_pd(_mm256_cvtps_pd(upLeftB),
                                                    weights[1][0][0]),
                                      _mm256_mul_pd(_mm256_cvtps_pd(upRightB),
                                                    weights[1][0][1])),
                        _mm256_add_pd(_mm256_mul_pd(_mm256_cvtps_pd(lowLeftB),
                                                    weights[1][1][0]),
                                      _mm256_mul_pd(_mm256_cvtps_pd(lowRightB),
                                                    weights[1][1][1]))))));
#endif
#else
            __m128d alphaBeta = _mm_cvtps_pd(fracCoord); // b a
            __m128d invAlphaBeta =
                _mm_sub_pd(_mm_set1_pd(1.0), alphaBeta); // 1-b 1-a
            __m128d gamma =
                _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), fracCoord));
#ifdef __SSE3__
            gamma = _mm_movedup_pd(gamma);
#else
            gamma = _mm_unpacklo_pd(gamma, gamma);
#endif
            __m128d invGamma = _mm_sub_pd(_mm_set1_pd(1.0), gamma);
            __m128d weights[4];

            // <1-a b> * <1-b 1-a> = <(1-a)(1-b) (1-a)b> <00 10>
            weights[0] = _mm_mul_pd(
                _mm_shuffle_pd(invAlphaBeta, alphaBeta, _MM_SHUFFLE2(1, 0)),
                invAlphaBeta);
            // <b a> * <a 1-b> = <a*b a(1-b)> <11 01>
            weights[1] = _mm_mul_pd(
                alphaBeta,
                _mm_shuffle_pd(invAlphaBeta, alphaBeta, _MM_SHUFFLE2(0, 1)));
            weights[2] = _mm_mul_pd(weights[0], gamma);
            weights[3] = _mm_mul_pd(weights[1], gamma);
            weights[0] = _mm_mul_pd(weights[0], invGamma);
            weights[1] = _mm_mul_pd(weights[1], invGamma);

            __m128d upLeftAL = _mm_cvtps_pd(upLeftA);
            __m128d upLeftAH =
                _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), upLeftA));
            __m128d upLeftBL = _mm_cvtps_pd(upLeftB);
            __m128d upLeftBH =
                _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), upLeftB));
            __m128d upRightAL = _mm_cvtps_pd(upRightA);
            __m128d upRightAH =
                _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), upRightA));
            __m128d upRightBL = _mm_cvtps_pd(upRightB);
            __m128d upRightBH =
                _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), upRightB));
            __m128d lowLeftAL = _mm_cvtps_pd(lowLeftA);
            __m128d lowLeftAH =
                _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), lowLeftA));
            __m128d lowLeftBL = _mm_cvtps_pd(lowLeftB);
            __m128d lowLeftBH =
                _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), lowLeftB));
            __m128d lowRightAL = _mm_cvtps_pd(lowRightA);
            __m128d lowRightAH =
                _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), lowRightA));
            __m128d lowRightBL = _mm_cvtps_pd(lowRightB);
            __m128d lowRightBH =
                _mm_cvtps_pd(_mm_movehl_ps(_mm_undefined_ps(), lowRightB));
#ifdef __SSE4_1__
            __m128d rg = _mm_or_pd(
                _mm_dp_pd(weights[0], _mm_unpacklo_pd(lowLeftAL, upLeftAL),
                          0x31),
                _mm_dp_pd(weights[0], _mm_unpackhi_pd(lowLeftAL, upLeftAL),
                          0x32));
            __m128d ba = _mm_or_pd(
                _mm_dp_pd(weights[0], _mm_unpacklo_pd(lowLeftAH, upLeftAH),
                          0x31),
                _mm_dp_pd(weights[0], _mm_unpackhi_pd(lowLeftAH, upLeftAH),
                          0x32));
            rg = _mm_add_pd(
                rg,
                _mm_or_pd(
                    _mm_dp_pd(weights[1],
                              _mm_unpacklo_pd(upRightAL, lowRightAL), 0x31),
                    _mm_dp_pd(weights[1],
                              _mm_unpackhi_pd(upRightAL, lowRightAL), 0x32)));
            ba = _mm_add_pd(
                ba,
                _mm_or_pd(
                    _mm_dp_pd(weights[1],
                              _mm_unpacklo_pd(upRightAH, lowRightAH), 0x31),
                    _mm_dp_pd(weights[1],
                              _mm_unpackhi_pd(upRightAH, lowRightAH), 0x32)));
            rg = _mm_add_pd(
                rg,
                _mm_or_pd(_mm_dp_pd(weights[2],
                                    _mm_unpacklo_pd(lowLeftBL, upLeftBL), 0x31),
                          _mm_dp_pd(weights[2],
                                    _mm_unpackhi_pd(lowLeftBL, upLeftBL),
                                    0x32)));
            ba = _mm_add_pd(
                ba,
                _mm_or_pd(_mm_dp_pd(weights[2],
                                    _mm_unpacklo_pd(lowLeftBH, upLeftBH), 0x31),
                          _mm_dp_pd(weights[2],
                                    _mm_unpackhi_pd(lowLeftBH, upLeftBH),
                                    0x32)));
            rg = _mm_add_pd(
                rg,
                _mm_or_pd(
                    _mm_dp_pd(weights[3],
                              _mm_unpacklo_pd(upRightBL, lowRightBL), 0x31),
                    _mm_dp_pd(weights[3],
                              _mm_unpackhi_pd(upRightBL, lowRightBL), 0x32)));
            ba = _mm_add_pd(
                ba,
                _mm_or_pd(
                    _mm_dp_pd(weights[3],
                              _mm_unpacklo_pd(upRightBH, lowRightBH), 0x31),
                    _mm_dp_pd(weights[3],
                              _mm_unpackhi_pd(upRightBH, lowRightBH), 0x32)));
#else
#ifdef __SSE3__
            __m128d weight010 = _mm_movedup_pd(weights[0]);
            __m128d weight001 = _mm_movedup_pd(weights[1]);
#else
            __m128d weight010 = _mm_unpacklo_pd(weights[0], weights[0]);
            __m128d weight001 = _mm_unpacklo_pd(weights[1], weights[1]);
#endif
            __m128d rg = _mm_add_pd(_mm_mul_pd(weight001, upRightAL),
                                    _mm_mul_pd(weight010, lowLeftAL));
            __m128d ba = _mm_add_pd(_mm_mul_pd(weight001, upRightAH),
                                    _mm_mul_pd(weight010, lowLeftAH));
            __m128d weight000 = _mm_unpackhi_pd(weights[0], weights[0]);
            __m128d weight011 = _mm_unpackhi_pd(weights[1], weights[1]);
            rg = _mm_add_pd(rg,
                            _mm_add_pd(_mm_mul_pd(weight000, upLeftAL),
                                       _mm_mul_pd(weight011, lowRightAL)));
            ba = _mm_add_pd(ba,
                            _mm_add_pd(_mm_mul_pd(weight000, upLeftAH),
                                       _mm_mul_pd(weight011, lowRightAH)));
#ifdef __SSE3__
            __m128d weight110 = _mm_movedup_pd(weights[2]);
            __m128d weight101 = _mm_movedup_pd(weights[3]);
#else
            __m128d weight110 = _mm_unpacklo_pd(weights[2], weights[2]);
            __m128d weight101 = _mm_unpacklo_pd(weights[3], weights[3]);
#endif
            rg = _mm_add_pd(rg,
                            _mm_add_pd(_mm_mul_pd(weight101, upRightBL),
                                       _mm_mul_pd(weight110, lowLeftBL)));
            ba = _mm_add_pd(rg,
                            _mm_add_pd(_mm_mul_pd(weight101, upRightBH),
                                       _mm_mul_pd(weight110, lowLeftBH)));
            __m128d weight100 = _mm_unpackhi_pd(weights[2], weights[2]);
            __m128d weight111 = _mm_unpackhi_pd(weights[3], weights[3]);
            rg = _mm_add_pd(rg,
                            _mm_add_pd(_mm_mul_pd(weight100, upLeftBL),
                                       _mm_mul_pd(weight111, lowRightBL)));
            ba = _mm_add_pd(ba,
                            _mm_add_pd(_mm_mul_pd(weight100, upLeftBH),
                                       _mm_mul_pd(weight111, lowRightBH)));
#endif
            _mm_storeu_ps(outData,
                          _mm_movelh_ps(_mm_cvtpd_ps(rg), _mm_cvtpd_ps(ba)));
#endif
            if (NULL == containsDenorms) _mm_setcsr(mxcsr);
        }

        return returnVal;
    }
}
#else
FloatPixel sample_image_pixel_float_offset(
    void *imageData, image_descriptor *imageInfo, float x, float y, float z,
    float xAddressOffset, float yAddressOffset, float zAddressOffset,
    image_sampler_data *imageSampler, float *outData, int verbose,
    int *containsDenorms, int lod)
{
    AddressFn adFn = sAddressingTable[imageSampler];
    FloatPixel returnVal;
    size_t width_lod = imageInfo->width, height_lod = imageInfo->height,
           depth_lod = imageInfo->depth;
    size_t slice_pitch_lod = 0, row_pitch_lod = 0;

    if (imageInfo->num_mip_levels > 1)
    {
        switch (imageInfo->type)
        {
            case CL_MEM_OBJECT_IMAGE3D:
                depth_lod =
                    (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1;
            case CL_MEM_OBJECT_IMAGE2D:
            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
                height_lod =
                    (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
            default:
                width_lod =
                    (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
        }
        row_pitch_lod = width_lod * get_pixel_size(imageInfo->format);
        if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
            slice_pitch_lod = row_pitch_lod;
        else if (imageInfo->type == CL_MEM_OBJECT_IMAGE3D
                 || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
            slice_pitch_lod = row_pitch_lod * height_lod;
    }
    else
    {
        slice_pitch_lod = imageInfo->slicePitch;
        row_pitch_lod = imageInfo->rowPitch;
    }

    if (containsDenorms) *containsDenorms = 0;

    if (imageSampler->normalized_coords)
    {

        // We need to unnormalize our coordinates differently depending on
        // the image type, but 'x' is always processed the same way.
        x = unnormalize_coordinate("x", x, xAddressOffset, (float)width_lod,
                                   imageSampler->addressing_mode, verbose);

        switch (imageInfo->type)
        {

                // The image array types require special care:

            case CL_MEM_OBJECT_IMAGE1D_ARRAY:
                z = 0; // don't care -- unused for 1D arrays
                break;

            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
                y = unnormalize_coordinate(
                    "y", y, yAddressOffset, (float)height_lod,
                    imageSampler->addressing_mode, verbose);
                break;

                // Everybody else:

            default:
                y = unnormalize_coordinate(
                    "y", y, yAddressOffset, (float)height_lod,
                    imageSampler->addressing_mode, verbose);
                z = unnormalize_coordinate(
                    "z", z, zAddressOffset, (float)depth_lod,
                    imageSampler->addressing_mode, verbose);
        }
    }
    else if (verbose)
    {

        switch (imageInfo->type)
        {
            case CL_MEM_OBJECT_IMAGE1D_ARRAY:
                log_info("Starting coordinate: %f, array index %f\n", x, y);
                break;
            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
                log_info("Starting coordinate: %f, %f, array index %f\n", x, y,
                         z);
                break;
            case CL_MEM_OBJECT_IMAGE1D:
            case CL_MEM_OBJECT_IMAGE1D_BUFFER:
                log_info("Starting coordinate: %f\n", x);
                break;
            case CL_MEM_OBJECT_IMAGE2D:
                log_info("Starting coordinate: %f, %f\n", x, y);
                break;
            case CL_MEM_OBJECT_IMAGE3D:
            default: log_info("Starting coordinate: %f, %f, %f\n", x, y, z);
        }
    }

    // At this point, we have unnormalized coordinates.

    if (imageSampler->filter_mode == CL_FILTER_NEAREST)
    {
        int ix, iy, iz;

        // We apply the addressing function to the now-unnormalized
        // coordinates.  Note that the array cases again require special
        // care, per section 8.4 in the OpenCL 1.2 Specification.

        ix = adFn(static_cast<int>(floorf(x)), width_lod);

        switch (imageInfo->type)
        {
            case CL_MEM_OBJECT_IMAGE1D_ARRAY:
                iy = static_cast<int>(calculate_array_index(
                    y, (float)(imageInfo->arraySize - 1)));
                iz = 0;
                if (verbose)
                {
                    log_info("\tArray index %f evaluates to %d\n", y, iy);
                }
                break;
            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
                iy = adFn(static_cast<int>(floorf(y)), height_lod);
                iz = static_cast<int>(calculate_array_index(
                    z, (float)(imageInfo->arraySize - 1)));
                if (verbose)
                {
                    log_info("\tArray index %f evaluates to %d\n", z, iz);
                }
                break;
            default:
                iy = adFn(static_cast<int>(floorf(y)), height_lod);
                if (depth_lod != 0)
                    iz = adFn(static_cast<int>(floorf(z)), depth_lod);
                else
                    iz = 0;
        }

        if (verbose)
        {
            if (iz)
                log_info(
                    "\tReference integer coords calculated: { %d, %d, %d }\n",
                    ix, iy, iz);
            else
                log_info("\tReference integer coords calculated: { %d, %d }\n",
                         ix, iy);
        }

        read_image_pixel_float(imageData, imageInfo, ix, iy, iz, outData, lod);
        check_for_denorms(outData, containsDenorms);
        for (int i = 0; i < 4; i++) returnVal.p[i] = fabsf(outData[i]);
        return returnVal;
    }
    else
    {
        // Linear filtering cases.

        size_t width = width_lod, height = height_lod, depth = depth_lod;

        // Image arrays can use 2D filtering, but require us to walk into the
        // image a certain number of slices before reading.

        if (depth == 0 || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY
            || imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
        {
            float array_index = 0;

            size_t layer_offset = 0;

            if (imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
            {
                array_index =
                    calculate_array_index(z, (float)(imageInfo->arraySize - 1));
                layer_offset = slice_pitch_lod * (size_t)array_index;
            }
            else if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
            {
                array_index =
                    calculate_array_index(y, (float)(imageInfo->arraySize - 1));
                layer_offset = slice_pitch_lod * (size_t)array_index;

                // Set up y and height so that the filtering below is correct
                // 1D filtering on a single slice.
                height = 1;
            }

            int x1 = adFn(static_cast<int>(floorf(x - 0.5f)), width);
            int y1 = 0;
            int x2 = adFn(static_cast<int>(floorf(x - 0.5f) + 1), width);
            int y2 = 0;
            if ((imageInfo->type != CL_MEM_OBJECT_IMAGE1D)
                && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
                && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_BUFFER))
            {
                y1 = adFn(static_cast<int>(floorf(y - 0.5f)), height);
                y2 = adFn(static_cast<int>(floorf(y - 0.5f) + 1), height);
            }
            else
            {
                y = 0.5f;
            }

            if (verbose)
            {
                log_info("\tActual integer coords used (i = floor(x-.5)): i0:{ "
                         "%d, %d } and i1:{ %d, %d }\n",
                         x1, y1, x2, y2);
                log_info("\tArray coordinate is %f\n", array_index);
            }

            // Walk to beginning of the 'correct' slice, if needed.
            char *imgPtr = ((char *)imageData) + layer_offset;

            float upLeft[4], upRight[4], lowLeft[4], lowRight[4];
            float maxUp[4], maxLow[4];
            read_image_pixel_float(imgPtr, imageInfo, x1, y1, 0, upLeft, lod);
            read_image_pixel_float(imgPtr, imageInfo, x2, y1, 0, upRight, lod);
            check_for_denorms(upLeft, containsDenorms);
            check_for_denorms(upRight, containsDenorms);
            pixelMax(upLeft, upRight, maxUp);
            read_image_pixel_float(imgPtr, imageInfo, x1, y2, 0, lowLeft, lod);
            read_image_pixel_float(imgPtr, imageInfo, x2, y2, 0, lowRight, lod);
            check_for_denorms(lowLeft, containsDenorms);
            check_for_denorms(lowRight, containsDenorms);
            pixelMax(lowLeft, lowRight, maxLow);
            pixelMax(maxUp, maxLow, returnVal.p);

            if (verbose)
            {
                if (NULL == containsDenorms)
                    log_info("\tSampled pixels (rgba order, denorms flushed to "
                             "zero):\n");
                else
                    log_info("\tSampled pixels (rgba order):\n");
                log_info("\t\tp00: %f, %f, %f, %f\n", upLeft[0], upLeft[1],
                         upLeft[2], upLeft[3]);
                log_info("\t\tp01: %f, %f, %f, %f\n", upRight[0], upRight[1],
                         upRight[2], upRight[3]);
                log_info("\t\tp10: %f, %f, %f, %f\n", lowLeft[0], lowLeft[1],
                         lowLeft[2], lowLeft[3]);
                log_info("\t\tp11: %f, %f, %f, %f\n", lowRight[0], lowRight[1],
                         lowRight[2], lowRight[3]);
            }

            bool printMe = false;
            if (x1 <= 0 || x2 <= 0 || x1 >= (int)width - 1
                || x2 >= (int)width - 1)
                printMe = true;
            if (y1 <= 0 || y2 <= 0 || y1 >= (int)height - 1
                || y2 >= (int)height - 1)
                printMe = true;

            double weights[2][2];

            weights[0][0] = weights[0][1] = 1.0 - frac(x - 0.5f);
            weights[1][0] = weights[1][1] = frac(x - 0.5f);
            weights[0][0] *= 1.0 - frac(y - 0.5f);
            weights[1][0] *= 1.0 - frac(y - 0.5f);
            weights[0][1] *= frac(y - 0.5f);
            weights[1][1] *= frac(y - 0.5f);

            if (verbose)
                log_info("\tfrac( x - 0.5f ) = %f,  frac( y - 0.5f ) = %f\n",
                         frac(x - 0.5f), frac(y - 0.5f));

            for (int i = 0; i < 3; i++)
            {
                outData[i] = (float)((upLeft[i] * weights[0][0])
                                     + (upRight[i] * weights[1][0])
                                     + (lowLeft[i] * weights[0][1])
                                     + (lowRight[i] * weights[1][1]));
                // flush subnormal results to zero if necessary
                if (NULL == containsDenorms && fabs(outData[i]) < FLT_MIN)
                    outData[i] = copysignf(0.0f, outData[i]);
            }
            outData[3] = (float)((upLeft[3] * weights[0][0])
                                 + (upRight[3] * weights[1][0])
                                 + (lowLeft[3] * weights[0][1])
                                 + (lowRight[3] * weights[1][1]));
            // flush subnormal results to zero if necessary
            if (NULL == containsDenorms && fabs(outData[3]) < FLT_MIN)
                outData[3] = copysignf(0.0f, outData[3]);
        }
        else
        {
            // 3D linear filtering
            int x1 = adFn(static_cast<int>(floorf(x - 0.5f)), width_lod);
            int y1 = adFn(static_cast<int>(floorf(y - 0.5f)), height_lod);
            int z1 = adFn(static_cast<int>(floorf(z - 0.5f)), depth_lod);
            int x2 = adFn(static_cast<int>(floorf(x - 0.5f) + 1), width_lod);
            int y2 = adFn(static_cast<int>(floorf(y - 0.5f) + 1), height_lod);
            int z2 = adFn(static_cast<int>(floorf(z - 0.5f) + 1), depth_lod);

            if (verbose)
                log_info("\tActual integer coords used (i = floor(x-.5)): "
                         "i0:{%d, %d, %d} and i1:{%d, %d, %d}\n",
                         x1, y1, z1, x2, y2, z2);

            float upLeftA[4], upRightA[4], lowLeftA[4], lowRightA[4];
            float upLeftB[4], upRightB[4], lowLeftB[4], lowRightB[4];
            float pixelMaxA[4], pixelMaxB[4];
            read_image_pixel_float(imageData, imageInfo, x1, y1, z1, upLeftA,
                                   lod);
            read_image_pixel_float(imageData, imageInfo, x2, y1, z1, upRightA,
                                   lod);
            check_for_denorms(upLeftA, containsDenorms);
            check_for_denorms(upRightA, containsDenorms);
            pixelMax(upLeftA, upRightA, pixelMaxA);
            read_image_pixel_float(imageData, imageInfo, x1, y2, z1, lowLeftA,
                                   lod);
            read_image_pixel_float(imageData, imageInfo, x2, y2, z1, lowRightA,
                                   lod);
            check_for_denorms(lowLeftA, containsDenorms);
            check_for_denorms(lowRightA, containsDenorms);
            pixelMax(lowLeftA, lowRightA, pixelMaxB);
            pixelMax(pixelMaxA, pixelMaxB, returnVal.p);
            read_image_pixel_float(imageData, imageInfo, x1, y1, z2, upLeftB,
                                   lod);
            read_image_pixel_float(imageData, imageInfo, x2, y1, z2, upRightB,
                                   lod);
            check_for_denorms(upLeftB, containsDenorms);
            check_for_denorms(upRightB, containsDenorms);
            pixelMax(upLeftB, upRightB, pixelMaxA);
            read_image_pixel_float(imageData, imageInfo, x1, y2, z2, lowLeftB,
                                   lod);
            read_image_pixel_float(imageData, imageInfo, x2, y2, z2, lowRightB,
                                   lod);
            check_for_denorms(lowLeftB, containsDenorms);
            check_for_denorms(lowRightB, containsDenorms);
            pixelMax(lowLeftB, lowRightB, pixelMaxB);
            pixelMax(pixelMaxA, pixelMaxB, pixelMaxA);
            pixelMax(pixelMaxA, returnVal.p, returnVal.p);

            if (verbose)
            {
                if (NULL == containsDenorms)
                    log_info("\tSampled pixels (rgba order, denorms flushed to "
                             "zero):\n");
                else
                    log_info("\tSampled pixels (rgba order):\n");
                log_info("\t\tp000: %f, %f, %f, %f\n", upLeftA[0], upLeftA[1],
                         upLeftA[2], upLeftA[3]);
                log_info("\t\tp001: %f, %f, %f, %f\n", upRightA[0], upRightA[1],
                         upRightA[2], upRightA[3]);
                log_info("\t\tp010: %f, %f, %f, %f\n", lowLeftA[0], lowLeftA[1],
                         lowLeftA[2], lowLeftA[3]);
                log_info("\t\tp011: %f, %f, %f, %f\n\n", lowRightA[0],
                         lowRightA[1], lowRightA[2], lowRightA[3]);
                log_info("\t\tp100: %f, %f, %f, %f\n", upLeftB[0], upLeftB[1],
                         upLeftB[2], upLeftB[3]);
                log_info("\t\tp101: %f, %f, %f, %f\n", upRightB[0], upRightB[1],
                         upRightB[2], upRightB[3]);
                log_info("\t\tp110: %f, %f, %f, %f\n", lowLeftB[0], lowLeftB[1],
                         lowLeftB[2], lowLeftB[3]);
                log_info("\t\tp111: %f, %f, %f, %f\n", lowRightB[0],
                         lowRightB[1], lowRightB[2], lowRightB[3]);
            }

            double weights[2][2][2];

            float a = frac(x - 0.5f), b = frac(y - 0.5f), c = frac(z - 0.5f);
            weights[0][0][0] = weights[0][1][0] = weights[0][0][1] =
                weights[0][1][1] = 1.f - a;
            weights[1][0][0] = weights[1][1][0] = weights[1][0][1] =
                weights[1][1][1] = a;
            weights[0][0][0] *= 1.f - b;
            weights[1][0][0] *= 1.f - b;
            weights[0][0][1] *= 1.f - b;
            weights[1][0][1] *= 1.f - b;
            weights[0][1][0] *= b;
            weights[1][1][0] *= b;
            weights[0][1][1] *= b;
            weights[1][1][1] *= b;
            weights[0][0][0] *= 1.f - c;
            weights[0][1][0] *= 1.f - c;
            weights[1][0][0] *= 1.f - c;
            weights[1][1][0] *= 1.f - c;
            weights[0][0][1] *= c;
            weights[0][1][1] *= c;
            weights[1][0][1] *= c;
            weights[1][1][1] *= c;

            if (verbose)
                log_info("\tfrac( x - 0.5f ) = %f,  frac( y - 0.5f ) = %f, "
                         "frac( z - 0.5f ) = %f\n",
                         frac(x - 0.5f), frac(y - 0.5f), frac(z - 0.5f));

            for (int i = 0; i < 3; i++)
            {
                outData[i] = (float)((upLeftA[i] * weights[0][0][0])
                                     + (upRightA[i] * weights[1][0][0])
                                     + (lowLeftA[i] * weights[0][1][0])
                                     + (lowRightA[i] * weights[1][1][0])
                                     + (upLeftB[i] * weights[0][0][1])
                                     + (upRightB[i] * weights[1][0][1])
                                     + (lowLeftB[i] * weights[0][1][1])
                                     + (lowRightB[i] * weights[1][1][1]));
                // flush subnormal results to zero if necessary
                if (NULL == containsDenorms && fabs(outData[i]) < FLT_MIN)
                    outData[i] = copysignf(0.0f, outData[i]);
            }
            outData[3] = (float)((upLeftA[3] * weights[0][0][0])
                                 + (upRightA[3] * weights[1][0][0])
                                 + (lowLeftA[3] * weights[0][1][0])
                                 + (lowRightA[3] * weights[1][1][0])
                                 + (upLeftB[3] * weights[0][0][1])
                                 + (upRightB[3] * weights[1][0][1])
                                 + (lowLeftB[3] * weights[0][1][1])
                                 + (lowRightB[3] * weights[1][1][1]));
            // flush subnormal results to zero if necessary
            if (NULL == containsDenorms && fabs(outData[3]) < FLT_MIN)
                outData[3] = copysignf(0.0f, outData[3]);
        }

        return returnVal;
    }
}
#endif

FloatPixel sample_image_pixel_float_offset(
    void *imageData, image_descriptor *imageInfo, float x, float y, float z,
    float xAddressOffset, float yAddressOffset, float zAddressOffset,
    image_sampler_data *imageSampler, float *outData, int verbose,
    int *containsDenorms)
{
    return sample_image_pixel_float_offset(
        imageData, imageInfo, x, y, z, xAddressOffset, yAddressOffset,
        zAddressOffset, imageSampler, outData, verbose, containsDenorms, 0);
}


int debug_find_vector_in_image(void *imagePtr, image_descriptor *imageInfo,
                               void *vectorToFind, size_t vectorSize, int *outX,
                               int *outY, int *outZ, size_t lod)
{
    int foundCount = 0;
    char *iPtr = (char *)imagePtr;
    size_t width;
    size_t depth;
    size_t height;
    size_t row_pitch;
    size_t slice_pitch;

    switch (imageInfo->type)
    {
        case CL_MEM_OBJECT_IMAGE1D:
            width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
            height = 1;
            depth = 1;
            break;
        case CL_MEM_OBJECT_IMAGE1D_ARRAY:
            width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
            height = 1;
            depth = imageInfo->arraySize;
            break;
        case CL_MEM_OBJECT_IMAGE2D:
            width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
            height =
                (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
            depth = 1;
            break;
        case CL_MEM_OBJECT_IMAGE2D_ARRAY:
            width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
            height =
                (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
            depth = imageInfo->arraySize;
            break;
        case CL_MEM_OBJECT_IMAGE3D:
            width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
            height =
                (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
            depth = (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1;
            break;
    }

    row_pitch = width * get_pixel_size(imageInfo->format);
    slice_pitch = row_pitch * height;

    for (size_t z = 0; z < depth; z++)
    {
        for (size_t y = 0; y < height; y++)
        {
            for (size_t x = 0; x < width; x++)
            {
                if (memcmp(iPtr, vectorToFind, vectorSize) == 0)
                {
                    if (foundCount == 0)
                    {
                        *outX = (int)x;
                        if (outY != NULL) *outY = (int)y;
                        if (outZ != NULL) *outZ = (int)z;
                    }
                    foundCount++;
                }
                iPtr += vectorSize;
            }
            iPtr += row_pitch - (width * vectorSize);
        }
        iPtr += slice_pitch - (height * row_pitch);
    }
    return foundCount;
}

int debug_find_pixel_in_image(void *imagePtr, image_descriptor *imageInfo,
                              unsigned int *valuesToFind, int *outX, int *outY,
                              int *outZ, int lod)
{
    char vectorToFind[4 * 4];
    size_t vectorSize = get_format_channel_count(imageInfo->format);


    if (imageInfo->format->image_channel_data_type == CL_UNSIGNED_INT8)
    {
        unsigned char *p = (unsigned char *)vectorToFind;
        for (unsigned int i = 0; i < vectorSize; i++)
            p[i] = (unsigned char)valuesToFind[i];
    }
    else if (imageInfo->format->image_channel_data_type == CL_UNSIGNED_INT16)
    {
        unsigned short *p = (unsigned short *)vectorToFind;
        for (unsigned int i = 0; i < vectorSize; i++)
            p[i] = (unsigned short)valuesToFind[i];
        vectorSize *= 2;
    }
    else if (imageInfo->format->image_channel_data_type == CL_UNSIGNED_INT32)
    {
        unsigned int *p = (unsigned int *)vectorToFind;
        for (unsigned int i = 0; i < vectorSize; i++)
            p[i] = (unsigned int)valuesToFind[i];
        vectorSize *= 4;
    }
    else
    {
        log_info("WARNING: Unable to search for debug pixel: invalid image "
                 "format\n");
        return false;
    }
    return debug_find_vector_in_image(imagePtr, imageInfo, vectorToFind,
                                      vectorSize, outX, outY, outZ, lod);
}

int debug_find_pixel_in_image(void *imagePtr, image_descriptor *imageInfo,
                              int *valuesToFind, int *outX, int *outY,
                              int *outZ, int lod)
{
    char vectorToFind[4 * 4];
    size_t vectorSize = get_format_channel_count(imageInfo->format);

    if (imageInfo->format->image_channel_data_type == CL_SIGNED_INT8)
    {
        char *p = (char *)vectorToFind;
        for (unsigned int i = 0; i < vectorSize; i++)
            p[i] = (char)valuesToFind[i];
    }
    else if (imageInfo->format->image_channel_data_type == CL_SIGNED_INT16)
    {
        short *p = (short *)vectorToFind;
        for (unsigned int i = 0; i < vectorSize; i++)
            p[i] = (short)valuesToFind[i];
        vectorSize *= 2;
    }
    else if (imageInfo->format->image_channel_data_type == CL_SIGNED_INT32)
    {
        int *p = (int *)vectorToFind;
        for (unsigned int i = 0; i < vectorSize; i++)
            p[i] = (int)valuesToFind[i];
        vectorSize *= 4;
    }
    else
    {
        log_info("WARNING: Unable to search for debug pixel: invalid image "
                 "format\n");
        return false;
    }
    return debug_find_vector_in_image(imagePtr, imageInfo, vectorToFind,
                                      vectorSize, outX, outY, outZ, lod);
}

int debug_find_pixel_in_image(void *imagePtr, image_descriptor *imageInfo,
                              float *valuesToFind, int *outX, int *outY,
                              int *outZ, int lod)
{
    char vectorToFind[4 * 4];
    float swizzled[4];
    memcpy(swizzled, valuesToFind, sizeof(swizzled));
    size_t vectorSize = get_pixel_size(imageInfo->format);
    pack_image_pixel(swizzled, imageInfo->format, vectorToFind);
    return debug_find_vector_in_image(imagePtr, imageInfo, vectorToFind,
                                      vectorSize, outX, outY, outZ, lod);
}

template <class T>
void swizzle_vector_for_image(T *srcVector, const cl_image_format *imageFormat)
{
    T temp;
    switch (imageFormat->image_channel_order)
    {
        case CL_A: srcVector[0] = srcVector[3]; break;
        case CL_R:
        case CL_Rx:
        case CL_RG:
        case CL_RGx:
        case CL_RGB:
        case CL_RGBx:
        case CL_RGBA:
        case CL_sRGB:
        case CL_sRGBx:
        case CL_sRGBA: break;
        case CL_RA: srcVector[1] = srcVector[3]; break;
        case CL_ARGB:
            temp = srcVector[3];
            srcVector[3] = srcVector[2];
            srcVector[2] = srcVector[1];
            srcVector[1] = srcVector[0];
            srcVector[0] = temp;
            break;
        case CL_ABGR:
            temp = srcVector[3];
            srcVector[3] = srcVector[0];
            srcVector[0] = temp;
            temp = srcVector[2];
            srcVector[2] = srcVector[1];
            srcVector[1] = temp;
            break;
        case CL_BGRA:
        case CL_sBGRA:
            temp = srcVector[0];
            srcVector[0] = srcVector[2];
            srcVector[2] = temp;
            break;
        case CL_INTENSITY:
            srcVector[3] = srcVector[0];
            srcVector[2] = srcVector[0];
            srcVector[1] = srcVector[0];
            break;
        case CL_LUMINANCE:
            srcVector[2] = srcVector[0];
            srcVector[1] = srcVector[0];
            break;
#ifdef CL_1RGB_APPLE
        case CL_1RGB_APPLE:
            temp = srcVector[3];
            srcVector[3] = srcVector[2];
            srcVector[2] = srcVector[1];
            srcVector[1] = srcVector[0];
            srcVector[0] = temp;
            break;
#endif
#ifdef CL_BGR1_APPLE
        case CL_BGR1_APPLE:
            temp = srcVector[0];
            srcVector[0] = srcVector[2];
            srcVector[2] = temp;
            break;
#endif
    }
}

#define SATURATE(v, min, max) (v < min ? min : (v > max ? max : v))

void pack_image_pixel(unsigned int *srcVector,
                      const cl_image_format *imageFormat, void *outData)
{
    swizzle_vector_for_image<unsigned int>(srcVector, imageFormat);
    size_t channelCount = get_format_channel_count(imageFormat);

    switch (imageFormat->image_channel_data_type)
    {
        case CL_UNSIGNED_INT8: {
            unsigned char *ptr = (unsigned char *)outData;
            for (unsigned int i = 0; i < channelCount; i++)
                ptr[i] = (unsigned char)SATURATE(srcVector[i], 0, 255);
            break;
        }
        case CL_UNSIGNED_INT16: {
            unsigned short *ptr = (unsigned short *)outData;
            for (unsigned int i = 0; i < channelCount; i++)
                ptr[i] = (unsigned short)SATURATE(srcVector[i], 0, 65535);
            break;
        }
        case CL_UNSIGNED_INT32: {
            unsigned int *ptr = (unsigned int *)outData;
            for (unsigned int i = 0; i < channelCount; i++)
                ptr[i] = (unsigned int)srcVector[i];
            break;
        }
        default: break;
    }
}

void pack_image_pixel(int *srcVector, const cl_image_format *imageFormat,
                      void *outData)
{
    swizzle_vector_for_image<int>(srcVector, imageFormat);
    size_t chanelCount = get_format_channel_count(imageFormat);

    switch (imageFormat->image_channel_data_type)
    {
        case CL_SIGNED_INT8: {
            char *ptr = (char *)outData;
            for (unsigned int i = 0; i < chanelCount; i++)
                ptr[i] = (char)SATURATE(srcVector[i], -128, 127);
            break;
        }
        case CL_SIGNED_INT16: {
            short *ptr = (short *)outData;
            for (unsigned int i = 0; i < chanelCount; i++)
                ptr[i] = (short)SATURATE(srcVector[i], -32768, 32767);
            break;
        }
        case CL_SIGNED_INT32: {
            int *ptr = (int *)outData;
            for (unsigned int i = 0; i < chanelCount; i++)
                ptr[i] = (int)srcVector[i];
            break;
        }
        default: break;
    }
}

cl_int round_to_even(float v)
{
    // clamp overflow
    if (v >= -(float)CL_INT_MIN) return CL_INT_MAX;
    if (v <= (float)CL_INT_MIN) return CL_INT_MIN;

    // round fractional values to integer value
    if (fabsf(v) < MAKE_HEX_FLOAT(0x1.0p23f, 0x1L, 23))
    {
        static const float magic[2] = { MAKE_HEX_FLOAT(0x1.0p23f, 0x1L, 23),
                                        MAKE_HEX_FLOAT(-0x1.0p23f, -0x1L, 23) };
        float magicVal = magic[v < 0.0f];
        v += magicVal;
        v -= magicVal;
    }

    return (cl_int)v;
}

void pack_image_pixel(float *srcVector, const cl_image_format *imageFormat,
                      void *outData)
{
    swizzle_vector_for_image<float>(srcVector, imageFormat);
    size_t channelCount = get_format_channel_count(imageFormat);
    switch (imageFormat->image_channel_data_type)
    {
        case CL_HALF_FLOAT: {
            cl_half *ptr = (cl_half *)outData;

            switch (gFloatToHalfRoundingMode)
            {
                case kRoundToNearestEven:
                    for (unsigned int i = 0; i < channelCount; i++)
                        ptr[i] = cl_half_from_float(srcVector[i], CL_HALF_RTE);
                    break;
                case kRoundTowardZero:
                    for (unsigned int i = 0; i < channelCount; i++)
                        ptr[i] = cl_half_from_float(srcVector[i], CL_HALF_RTZ);
                    break;
                default:
                    log_error("ERROR: Test internal error -- unhandled or "
                              "unknown float->half rounding mode.\n");
                    exit(-1);
                    break;
            }
            break;
        }

        case CL_FLOAT: {
            cl_float *ptr = (cl_float *)outData;
            for (unsigned int i = 0; i < channelCount; i++)
                ptr[i] = srcVector[i];
            break;
        }

        case CL_SNORM_INT8: {
            cl_char *ptr = (cl_char *)outData;
            for (unsigned int i = 0; i < channelCount; i++)
                ptr[i] =
                    (cl_char)NORMALIZE_SIGNED(srcVector[i], -127.0f, 127.f);
            break;
        }
        case CL_SNORM_INT16: {
            cl_short *ptr = (cl_short *)outData;
            for (unsigned int i = 0; i < channelCount; i++)
                ptr[i] =
                    (short)NORMALIZE_SIGNED(srcVector[i], -32767.f, 32767.f);
            break;
        }
        case CL_UNORM_INT8: {
            cl_uchar *ptr = (cl_uchar *)outData;
            if (is_sRGBA_order(imageFormat->image_channel_order))
            {
                ptr[0] = (unsigned char)(sRGBmap(srcVector[0]) + 0.5);
                ptr[1] = (unsigned char)(sRGBmap(srcVector[1]) + 0.5);
                ptr[2] = (unsigned char)(sRGBmap(srcVector[2]) + 0.5);
                if (channelCount == 4)
                    ptr[3] = (unsigned char)NORMALIZE(srcVector[3], 255.f);
            }
            else
            {
                for (unsigned int i = 0; i < channelCount; i++)
                    ptr[i] = (unsigned char)NORMALIZE(srcVector[i], 255.f);
            }
#ifdef CL_1RGB_APPLE
            if (imageFormat->image_channel_order == CL_1RGB_APPLE)
                ptr[0] = 255.0f;
#endif
#ifdef CL_BGR1_APPLE
            if (imageFormat->image_channel_order == CL_BGR1_APPLE)
                ptr[3] = 255.0f;
#endif
            break;
        }
        case CL_UNORM_INT16: {
            cl_ushort *ptr = (cl_ushort *)outData;
            for (unsigned int i = 0; i < channelCount; i++)
                ptr[i] = (unsigned short)NORMALIZE(srcVector[i], 65535.f);
            break;
        }
        case CL_UNORM_SHORT_555: {
            cl_ushort *ptr = (cl_ushort *)outData;
            ptr[0] =
                (((unsigned short)NORMALIZE(srcVector[0], 31.f) & 31) << 10)
                | (((unsigned short)NORMALIZE(srcVector[1], 31.f) & 31) << 5)
                | (((unsigned short)NORMALIZE(srcVector[2], 31.f) & 31) << 0);
            break;
        }
        case CL_UNORM_SHORT_565: {
            cl_ushort *ptr = (cl_ushort *)outData;
            ptr[0] =
                (((unsigned short)NORMALIZE(srcVector[0], 31.f) & 31) << 11)
                | (((unsigned short)NORMALIZE(srcVector[1], 63.f) & 63) << 5)
                | (((unsigned short)NORMALIZE(srcVector[2], 31.f) & 31) << 0);
            break;
        }
        case CL_UNORM_INT_101010: {
            cl_uint *ptr = (cl_uint *)outData;
            ptr[0] =
                (((unsigned int)NORMALIZE(srcVector[0], 1023.f) & 1023) << 20)
                | (((unsigned int)NORMALIZE(srcVector[1], 1023.f) & 1023) << 10)
                | (((unsigned int)NORMALIZE(srcVector[2], 1023.f) & 1023) << 0);
            break;
        }
        case CL_SIGNED_INT8: {
            cl_char *ptr = (cl_char *)outData;
            for (unsigned int i = 0; i < channelCount; i++)
                ptr[i] =
                    (cl_char)CONVERT_INT(srcVector[i], -127.0f, 127.f, 127);
            break;
        }
        case CL_SIGNED_INT16: {
            cl_short *ptr = (cl_short *)outData;
            for (unsigned int i = 0; i < channelCount; i++)
                ptr[i] =
                    (short)CONVERT_INT(srcVector[i], -32767.f, 32767.f, 32767);
            break;
        }
        case CL_SIGNED_INT32: {
            cl_int *ptr = (cl_int *)outData;
            for (unsigned int i = 0; i < channelCount; i++)
                ptr[i] = round_to_even(srcVector[i]);
            break;
        }
        case CL_UNSIGNED_INT8: {
            cl_uchar *ptr = (cl_uchar *)outData;
            for (unsigned int i = 0; i < channelCount; i++)
                ptr[i] =
                    (cl_uchar)CONVERT_UINT(srcVector[i], 255.f, CL_UCHAR_MAX);
            break;
        }
        case CL_UNSIGNED_INT16: {
            cl_ushort *ptr = (cl_ushort *)outData;
            for (unsigned int i = 0; i < channelCount; i++)
                ptr[i] = (cl_ushort)CONVERT_UINT(srcVector[i], 32767.f,
                                                 CL_USHRT_MAX);
            break;
        }
        case CL_UNSIGNED_INT32: {
            cl_uint *ptr = (cl_uint *)outData;
            for (unsigned int i = 0; i < channelCount; i++)
                ptr[i] = (cl_uint)CONVERT_UINT(
                    srcVector[i],
                    MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffe, 31 - 23),
                    CL_UINT_MAX);
            break;
        }
#ifdef CL_SFIXED14_APPLE
        case CL_SFIXED14_APPLE: {
            cl_ushort *ptr = (cl_ushort *)outData;
            for (unsigned int i = 0; i < channelCount; i++)
            {
                cl_float f = fmaxf(srcVector[i], -1.0f);
                f = fminf(f, 3.0f);
                cl_int d = rintf(f * 0x1.0p14f);
                d += 16384;
                if (d > CL_USHRT_MAX) d = CL_USHRT_MAX;
                ptr[i] = d;
            }
            break;
        }
#endif
        default:
            log_error("INTERNAL ERROR: unknown format (%d)\n",
                      imageFormat->image_channel_data_type);
            exit(-1);
            break;
    }
}

void pack_image_pixel_error(const float *srcVector,
                            const cl_image_format *imageFormat,
                            const void *results, float *errors)
{
    size_t channelCount = get_format_channel_count(imageFormat);
    switch (imageFormat->image_channel_data_type)
    {
        case CL_HALF_FLOAT: {
            const cl_half *ptr = (const cl_half *)results;

            for (unsigned int i = 0; i < channelCount; i++)
                errors[i] = Ulp_Error_Half(ptr[i], srcVector[i]);

            break;
        }

        case CL_FLOAT: {
            const cl_ushort *ptr = (const cl_ushort *)results;

            for (unsigned int i = 0; i < channelCount; i++)
                errors[i] = Ulp_Error(ptr[i], srcVector[i]);

            break;
        }

        case CL_SNORM_INT8: {
            const cl_char *ptr = (const cl_char *)results;

            for (unsigned int i = 0; i < channelCount; i++)
                errors[i] = ptr[i]
                    - NORMALIZE_SIGNED_UNROUNDED(srcVector[i], -127.0f, 127.f);

            break;
        }
        case CL_SNORM_INT16: {
            const cl_short *ptr = (const cl_short *)results;

            for (unsigned int i = 0; i < channelCount; i++)
                errors[i] = ptr[i]
                    - NORMALIZE_SIGNED_UNROUNDED(srcVector[i], -32767.f,
                                                 32767.f);

            break;
        }
        case CL_UNORM_INT8: {
            const cl_uchar *ptr = (const cl_uchar *)results;

            for (unsigned int i = 0; i < channelCount; i++)
                errors[i] = ptr[i] - NORMALIZE_UNROUNDED(srcVector[i], 255.f);

            break;
        }
        case CL_UNORM_INT16: {
            const cl_ushort *ptr = (const cl_ushort *)results;

            for (unsigned int i = 0; i < channelCount; i++)
                errors[i] = ptr[i] - NORMALIZE_UNROUNDED(srcVector[i], 65535.f);

            break;
        }
        case CL_UNORM_SHORT_555: {
            const cl_ushort *ptr = (const cl_ushort *)results;

            errors[0] =
                ((ptr[0] >> 10) & 31) - NORMALIZE_UNROUNDED(srcVector[0], 31.f);
            errors[1] =
                ((ptr[0] >> 5) & 31) - NORMALIZE_UNROUNDED(srcVector[1], 31.f);
            errors[2] =
                ((ptr[0] >> 0) & 31) - NORMALIZE_UNROUNDED(srcVector[2], 31.f);

            break;
        }
        case CL_UNORM_SHORT_565: {
            const cl_ushort *ptr = (const cl_ushort *)results;

            errors[0] =
                ((ptr[0] >> 11) & 31) - NORMALIZE_UNROUNDED(srcVector[0], 31.f);
            errors[1] =
                ((ptr[0] >> 5) & 63) - NORMALIZE_UNROUNDED(srcVector[1], 63.f);
            errors[2] =
                ((ptr[0] >> 0) & 31) - NORMALIZE_UNROUNDED(srcVector[2], 31.f);

            break;
        }
        case CL_UNORM_INT_101010: {
            const cl_uint *ptr = (const cl_uint *)results;

            errors[0] = ((ptr[0] >> 20) & 1023)
                - NORMALIZE_UNROUNDED(srcVector[0], 1023.f);
            errors[1] = ((ptr[0] >> 10) & 1023)
                - NORMALIZE_UNROUNDED(srcVector[1], 1023.f);
            errors[2] = ((ptr[0] >> 0) & 1023)
                - NORMALIZE_UNROUNDED(srcVector[2], 1023.f);

            break;
        }
        case CL_SIGNED_INT8: {
            const cl_char *ptr = (const cl_char *)results;

            for (unsigned int i = 0; i < channelCount; i++)
                errors[i] =
                    ptr[i] - CONVERT_INT(srcVector[i], -127.0f, 127.f, 127);

            break;
        }
        case CL_SIGNED_INT16: {
            const cl_short *ptr = (const cl_short *)results;
            for (unsigned int i = 0; i < channelCount; i++)
                errors[i] = ptr[i]
                    - CONVERT_INT(srcVector[i], -32767.f, 32767.f, 32767);
            break;
        }
        case CL_SIGNED_INT32: {
            const cl_int *ptr = (const cl_int *)results;
            for (unsigned int i = 0; i < channelCount; i++)
                errors[i] = (cl_float)((cl_long)ptr[i]
                                       - (cl_long)round_to_even(srcVector[i]));
            break;
        }
        case CL_UNSIGNED_INT8: {
            const cl_uchar *ptr = (const cl_uchar *)results;
            for (unsigned int i = 0; i < channelCount; i++)
                errors[i] = static_cast<float>(
                    (cl_int)ptr[i]
                    - (cl_int)CONVERT_UINT(srcVector[i], 255.f, CL_UCHAR_MAX));
            break;
        }
        case CL_UNSIGNED_INT16: {
            const cl_ushort *ptr = (const cl_ushort *)results;
            for (unsigned int i = 0; i < channelCount; i++)
                errors[i] = static_cast<float>(
                    (cl_int)ptr[i]
                    - (cl_int)CONVERT_UINT(srcVector[i], 32767.f,
                                           CL_USHRT_MAX));
            break;
        }
        case CL_UNSIGNED_INT32: {
            const cl_uint *ptr = (const cl_uint *)results;
            for (unsigned int i = 0; i < channelCount; i++)
                errors[i] = (cl_float)(
                    (cl_long)ptr[i]
                    - (cl_long)CONVERT_UINT(
                        srcVector[i],
                        MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffe, 31 - 23),
                        CL_UINT_MAX));
            break;
        }
#ifdef CL_SFIXED14_APPLE
        case CL_SFIXED14_APPLE: {
            const cl_ushort *ptr = (const cl_ushort *)results;

            for (unsigned int i = 0; i < channelCount; i++)
                errors[i] = ptr[i]
                    - NORMALIZE_SIGNED_UNROUNDED(((int)srcVector[i] - 16384),
                                                 -16384.f, 49151.f);

            break;
        }
#endif
        default:
            log_error("INTERNAL ERROR: unknown format (%d)\n",
                      imageFormat->image_channel_data_type);
            exit(-1);
            break;
    }
}


//
//  Autodetect which rounding mode is used for image writes to CL_HALF_FLOAT
//  This should be called lazily before attempting to verify image writes,
//  otherwise an error will occur.
//
int DetectFloatToHalfRoundingMode(
    cl_command_queue q) // Returns CL_SUCCESS on success
{
    cl_int err = CL_SUCCESS;

    if (gFloatToHalfRoundingMode == kDefaultRoundingMode)
    {
        // Some numbers near 0.5f, that we look at to see how the values are
        // rounded.
        static const cl_uint inData[4 * 4] = {
            0x3f000fffU, 0x3f001000U, 0x3f001001U, 0U,
            0x3f001fffU, 0x3f002000U, 0x3f002001U, 0U,
            0x3f002fffU, 0x3f003000U, 0x3f003001U, 0U,
            0x3f003fffU, 0x3f004000U, 0x3f004001U, 0U
        };
        static const size_t count = sizeof(inData) / (4 * sizeof(inData[0]));
        const float *inp = (const float *)inData;
        cl_context context = NULL;

        // Create an input buffer
        err = clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(context),
                                    &context, NULL);
        if (err)
        {
            log_error("Error:  could not get context from command queue in "
                      "DetectFloatToHalfRoundingMode  (%d)",
                      err);
            return err;
        }

        cl_mem inBuf = clCreateBuffer(context,
                                      CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR
                                          | CL_MEM_ALLOC_HOST_PTR,
                                      sizeof(inData), (void *)inData, &err);
        if (NULL == inBuf || err)
        {
            log_error("Error:  could not create input buffer in "
                      "DetectFloatToHalfRoundingMode  (err: %d)",
                      err);
            return err;
        }

        // Create a small output image
        cl_image_format fmt = { CL_RGBA, CL_HALF_FLOAT };
        cl_mem outImage = create_image_2d(context, CL_MEM_WRITE_ONLY, &fmt,
                                          count, 1, 0, NULL, &err);
        if (NULL == outImage || err)
        {
            log_error("Error:  could not create half float out image in "
                      "DetectFloatToHalfRoundingMode  (err: %d)",
                      err);
            clReleaseMemObject(inBuf);
            return err;
        }

        // Create our program, and a kernel
        const char *kernelSource[1] = {
            "kernel void detect_round( global float4 *in, write_only image2d_t "
            "out )\n"
            "{\n"
            "   write_imagef( out, (int2)(get_global_id(0),0), "
            "in[get_global_id(0)] );\n"
            "}\n"
        };

        clProgramWrapper program;
        clKernelWrapper kernel;
        err = create_single_kernel_helper(context, &program, &kernel, 1,
                                          kernelSource, "detect_round");

        if (NULL == program || err)
        {
            log_error("Error:  could not create program in "
                      "DetectFloatToHalfRoundingMode (err: %d)",
                      err);
            clReleaseMemObject(inBuf);
            clReleaseMemObject(outImage);
            return err;
        }

        cl_device_id device = NULL;
        err = clGetCommandQueueInfo(q, CL_QUEUE_DEVICE, sizeof(device), &device,
                                    NULL);
        if (err)
        {
            log_error("Error:  could not get device from command queue in "
                      "DetectFloatToHalfRoundingMode  (%d)",
                      err);
            clReleaseMemObject(inBuf);
            clReleaseMemObject(outImage);
            return err;
        }

        err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inBuf);
        if (err)
        {
            log_error("Error: could not set argument 0 of kernel in "
                      "DetectFloatToHalfRoundingMode (%d)",
                      err);
            clReleaseMemObject(inBuf);
            clReleaseMemObject(outImage);
            return err;
        }

        err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outImage);
        if (err)
        {
            log_error("Error: could not set argument 1 of kernel in "
                      "DetectFloatToHalfRoundingMode (%d)",
                      err);
            clReleaseMemObject(inBuf);
            clReleaseMemObject(outImage);
            return err;
        }

        // Run the kernel
        size_t global_work_size = count;
        err = clEnqueueNDRangeKernel(q, kernel, 1, NULL, &global_work_size,
                                     NULL, 0, NULL, NULL);
        if (err)
        {
            log_error("Error: could not enqueue kernel in "
                      "DetectFloatToHalfRoundingMode (%d)",
                      err);
            clReleaseMemObject(inBuf);
            clReleaseMemObject(outImage);
            return err;
        }

        // read the results
        cl_half outBuf[count * 4];
        memset(outBuf, -1, sizeof(outBuf));
        size_t origin[3] = { 0, 0, 0 };
        size_t region[3] = { count, 1, 1 };
        err = clEnqueueReadImage(q, outImage, CL_TRUE, origin, region, 0, 0,
                                 outBuf, 0, NULL, NULL);
        if (err)
        {
            log_error("Error: could not read output image in "
                      "DetectFloatToHalfRoundingMode (%d)",
                      err);
            clReleaseMemObject(inBuf);
            clReleaseMemObject(outImage);
            return err;
        }

        // Generate our list of reference results
        cl_half rte_ref[count * 4];
        cl_half rtz_ref[count * 4];
        for (size_t i = 0; i < 4 * count; i++)
        {
            rte_ref[i] = cl_half_from_float(inp[i], CL_HALF_RTE);
            rtz_ref[i] = cl_half_from_float(inp[i], CL_HALF_RTZ);
        }

        // Verify that we got something in either rtz or rte mode
        if (0 == memcmp(rte_ref, outBuf, sizeof(rte_ref)))
        {
            log_info("Autodetected float->half rounding mode to be rte\n");
            gFloatToHalfRoundingMode = kRoundToNearestEven;
        }
        else if (0 == memcmp(rtz_ref, outBuf, sizeof(rtz_ref)))
        {
            log_info("Autodetected float->half rounding mode to be rtz\n");
            gFloatToHalfRoundingMode = kRoundTowardZero;
        }
        else
        {
            log_error("ERROR: float to half conversions proceed with invalid "
                      "rounding mode!\n");
            log_info("\nfor:");
            for (size_t i = 0; i < count; i++)
                log_info(" {%a, %a, %a, %a},", inp[4 * i], inp[4 * i + 1],
                         inp[4 * i + 2], inp[4 * i + 3]);
            log_info("\ngot:");
            for (size_t i = 0; i < count; i++)
                log_info(" {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},",
                         outBuf[4 * i], outBuf[4 * i + 1], outBuf[4 * i + 2],
                         outBuf[4 * i + 3]);
            log_info("\nrte:");
            for (size_t i = 0; i < count; i++)
                log_info(" {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},",
                         rte_ref[4 * i], rte_ref[4 * i + 1], rte_ref[4 * i + 2],
                         rte_ref[4 * i + 3]);
            log_info("\nrtz:");
            for (size_t i = 0; i < count; i++)
                log_info(" {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},",
                         rtz_ref[4 * i], rtz_ref[4 * i + 1], rtz_ref[4 * i + 2],
                         rtz_ref[4 * i + 3]);
            log_info("\n");
            err = -1;
            gFloatToHalfRoundingMode = kRoundingModeCount; // illegal value
        }

        // clean up
        clReleaseMemObject(inBuf);
        clReleaseMemObject(outImage);
        return err;
    }

    // Make sure that the rounding mode was successfully detected, if we checked
    // earlier
    if (gFloatToHalfRoundingMode != kRoundToNearestEven
        && gFloatToHalfRoundingMode != kRoundTowardZero)
        return -2;

    return err;
}

char *create_random_image_data(ExplicitType dataType,
                               image_descriptor *imageInfo,
                               BufferOwningPtr<char> &P, MTdata d,
                               bool image2DFromBuffer)
{
    size_t allocSize, numPixels;
    if (/*gTestMipmaps*/ imageInfo->num_mip_levels > 1)
    {
        allocSize = (size_t)(compute_mipmapped_image_size(*imageInfo) * 4
                             * get_explicit_type_size(dataType))
            / get_pixel_size(imageInfo->format);
        numPixels = allocSize / (get_explicit_type_size(dataType) * 4);
    }
    else
    {
        numPixels = (image2DFromBuffer ? imageInfo->rowPitch : imageInfo->width)
            * imageInfo->height * (imageInfo->depth ? imageInfo->depth : 1)
            * (imageInfo->arraySize ? imageInfo->arraySize : 1);
        allocSize = numPixels * 4 * get_explicit_type_size(dataType);
    }

#if 0 // DEBUG
    {
      fprintf(stderr,"--- create_random_image_data:\n");
      fprintf(stderr,"allocSize = %zu\n",allocSize);
      fprintf(stderr,"numPixels = %zu\n",numPixels);
      fprintf(stderr,"width = %zu\n",imageInfo->width);
      fprintf(stderr,"height = %zu\n",imageInfo->height);
      fprintf(stderr,"depth = %zu\n",imageInfo->depth);
      fprintf(stderr,"rowPitch = %zu\n",imageInfo->rowPitch);
      fprintf(stderr,"slicePitch = %zu\n",imageInfo->slicePitch);
      fprintf(stderr,"arraySize = %zu\n",imageInfo->arraySize);
      fprintf(stderr,"explicit_type_size = %zu\n",get_explicit_type_size(dataType));
    }
#endif

#if defined(__APPLE__)
    char *data = NULL;
    if (gDeviceType == CL_DEVICE_TYPE_CPU)
    {
        size_t mapSize =
            ((allocSize + 4095L) & -4096L) + 8192; // alloc two extra pages.

        void *map = mmap(0, mapSize, PROT_READ | PROT_WRITE,
                         MAP_ANON | MAP_PRIVATE, 0, 0);
        if (map == MAP_FAILED)
        {
            perror("create_random_image_data: mmap");
            log_error("%s:%d: mmap failed, mapSize = %zu\n", __FILE__, __LINE__,
                      mapSize);
        }
        intptr_t data_end = (intptr_t)map + mapSize - 4096;
        data = (char *)(data_end - (intptr_t)allocSize);

        mprotect(map, 4096, PROT_NONE);
        mprotect((void *)((char *)map + mapSize - 4096), 4096, PROT_NONE);
        P.reset(data, map, mapSize);
    }
    else
    {
        data = (char *)malloc(allocSize);
        P.reset(data);
    }
#else
    char *data =
        (char *)align_malloc(allocSize, get_pixel_alignment(imageInfo->format));
    P.reset(data, NULL, 0, allocSize, true);
#endif

    if (data == NULL)
    {
        log_error(
            "ERROR: Unable to malloc %zu bytes for create_random_image_data\n",
            allocSize);
        return NULL;
    }

    switch (dataType)
    {
        case kFloat: {
            float *inputValues = (float *)data;
            switch (imageInfo->format->image_channel_data_type)
            {
                case CL_HALF_FLOAT: {
                    // Generate data that is (mostly) inside the range of a half
                    // float const float HALF_MIN = 5.96046448e-08f;
                    const float HALF_MAX = 65504.0f;

                    size_t i = 0;
                    inputValues[i++] = 0.f;
                    inputValues[i++] = 1.f;
                    inputValues[i++] = -1.f;
                    inputValues[i++] = 2.f;
                    for (; i < numPixels * 4; i++)
                        inputValues[i] = get_random_float(-HALF_MAX - 2.f,
                                                          HALF_MAX + 2.f, d);
                }
                break;
#ifdef CL_SFIXED14_APPLE
                case CL_SFIXED14_APPLE: {
                    size_t i = 0;
                    if (numPixels * 4 >= 8)
                    {
                        inputValues[i++] = INFINITY;
                        inputValues[i++] = 0x1.0p14f;
                        inputValues[i++] = 0x1.0p31f;
                        inputValues[i++] = 0x1.0p32f;
                        inputValues[i++] = -INFINITY;
                        inputValues[i++] = -0x1.0p14f;
                        inputValues[i++] = -0x1.0p31f;
                        inputValues[i++] = -0x1.1p31f;
                    }
                    for (; i < numPixels * 4; i++)
                        inputValues[i] = get_random_float(-1.1f, 3.1f, d);
                }
                break;
#endif
                case CL_FLOAT: {
                    size_t i = 0;
                    inputValues[i++] = INFINITY;
                    inputValues[i++] = -INFINITY;
                    inputValues[i++] = 0.0f;
                    inputValues[i++] = 0.0f;
                    cl_uint *p = (cl_uint *)data;
                    for (; i < numPixels * 4; i++) p[i] = genrand_int32(d);
                }
                break;

                default:
                    size_t i = 0;
                    if (numPixels * 4 >= 36)
                    {
                        inputValues[i++] = 0.0f;
                        inputValues[i++] = 0.5f;
                        inputValues[i++] = 31.5f;
                        inputValues[i++] = 32.0f;
                        inputValues[i++] = 127.5f;
                        inputValues[i++] = 128.0f;
                        inputValues[i++] = 255.5f;
                        inputValues[i++] = 256.0f;
                        inputValues[i++] = 1023.5f;
                        inputValues[i++] = 1024.0f;
                        inputValues[i++] = 32767.5f;
                        inputValues[i++] = 32768.0f;
                        inputValues[i++] = 65535.5f;
                        inputValues[i++] = 65536.0f;
                        inputValues[i++] = 2147483648.0f;
                        inputValues[i++] = 4294967296.0f;
                        inputValues[i++] = MAKE_HEX_FLOAT(0x1.0p63f, 1, 63);
                        inputValues[i++] = MAKE_HEX_FLOAT(0x1.0p64f, 1, 64);
                        inputValues[i++] = -0.0f;
                        inputValues[i++] = -0.5f;
                        inputValues[i++] = -31.5f;
                        inputValues[i++] = -32.0f;
                        inputValues[i++] = -127.5f;
                        inputValues[i++] = -128.0f;
                        inputValues[i++] = -255.5f;
                        inputValues[i++] = -256.0f;
                        inputValues[i++] = -1023.5f;
                        inputValues[i++] = -1024.0f;
                        inputValues[i++] = -32767.5f;
                        inputValues[i++] = -32768.0f;
                        inputValues[i++] = -65535.5f;
                        inputValues[i++] = -65536.0f;
                        inputValues[i++] = -2147483648.0f;
                        inputValues[i++] = -4294967296.0f;
                        inputValues[i++] = -MAKE_HEX_FLOAT(0x1.0p63f, 1, 63);
                        inputValues[i++] = -MAKE_HEX_FLOAT(0x1.0p64f, 1, 64);
                    }
                    if (is_format_signed(imageInfo->format))
                    {
                        for (; i < numPixels * 4; i++)
                            inputValues[i] = get_random_float(-1.1f, 1.1f, d);
                    }
                    else
                    {
                        for (; i < numPixels * 4; i++)
                            inputValues[i] = get_random_float(-0.1f, 1.1f, d);
                    }
                    break;
            }
            break;
        }

        case kInt: {
            int *imageData = (int *)data;

            // We want to generate ints (mostly) in range of the target format
            int formatMin = get_format_min_int(imageInfo->format);
            size_t formatMax = get_format_max_int(imageInfo->format);
            if (formatMin == 0)
            {
                // Unsigned values, but we are only an int, so cap the actual
                // max at the max of signed ints
                if (formatMax > 2147483647L) formatMax = 2147483647L;
            }
            // If the final format is small enough, give us a bit of room for
            // out-of-range values to test
            if (formatMax < 2147483647L) formatMax += 2;
            if (formatMin > -2147483648LL) formatMin -= 2;

            // Now gen
            for (size_t i = 0; i < numPixels * 4; i++)
            {
                imageData[i] = random_in_range(formatMin, (int)formatMax, d);
            }
            break;
        }

        case kUInt:
        case kUnsignedInt: {
            unsigned int *imageData = (unsigned int *)data;

            // We want to generate ints (mostly) in range of the target format
            int formatMin = get_format_min_int(imageInfo->format);
            size_t formatMax = get_format_max_int(imageInfo->format);
            if (formatMin < 0) formatMin = 0;
            // If the final format is small enough, give us a bit of room for
            // out-of-range values to test
            if (formatMax < 4294967295LL) formatMax += 2;

            // Now gen
            for (size_t i = 0; i < numPixels * 4; i++)
            {
                imageData[i] = random_in_range(formatMin, (int)formatMax, d);
            }
            break;
        }
        default:
            // Unsupported source format
            delete[] data;
            return NULL;
    }

    return data;
}

/*
    deprecated
bool clamp_image_coord( image_sampler_data *imageSampler, float value, size_t
max, int &outValue )
{
    int v = (int)value;

    switch(imageSampler->addressing_mode)
    {
        case CL_ADDRESS_REPEAT:
            outValue = v;
            while( v < 0 )
                v += (int)max;
            while( v >= (int)max )
                v -= (int)max;
            if( v != outValue )
            {
                outValue = v;
                return true;
            }
            return false;

        case CL_ADDRESS_MIRRORED_REPEAT:
            log_info( "ERROR: unimplemented for CL_ADDRESS_MIRRORED_REPEAT. Do
we ever use this? exit(-1);

        default:
            if( v < 0 )
            {
                outValue = 0;
                return true;
            }
            if( v >= (int)max )
            {
                outValue = (int)max - 1;
                return true;
            }
            outValue = v;
            return false;
    }

}
*/

void get_sampler_kernel_code(image_sampler_data *imageSampler, char *outLine)
{
    const char *normalized;
    const char *addressMode;
    const char *filterMode;

    if (imageSampler->addressing_mode == CL_ADDRESS_CLAMP)
        addressMode = "CLK_ADDRESS_CLAMP";
    else if (imageSampler->addressing_mode == CL_ADDRESS_CLAMP_TO_EDGE)
        addressMode = "CLK_ADDRESS_CLAMP_TO_EDGE";
    else if (imageSampler->addressing_mode == CL_ADDRESS_REPEAT)
        addressMode = "CLK_ADDRESS_REPEAT";
    else if (imageSampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT)
        addressMode = "CLK_ADDRESS_MIRRORED_REPEAT";
    else if (imageSampler->addressing_mode == CL_ADDRESS_NONE)
        addressMode = "CLK_ADDRESS_NONE";
    else
    {
        log_error("**Error: Unknown addressing mode! Aborting...\n");
        abort();
    }

    if (imageSampler->normalized_coords)
        normalized = "CLK_NORMALIZED_COORDS_TRUE";
    else
        normalized = "CLK_NORMALIZED_COORDS_FALSE";

    if (imageSampler->filter_mode == CL_FILTER_LINEAR)
        filterMode = "CLK_FILTER_LINEAR";
    else
        filterMode = "CLK_FILTER_NEAREST";

    sprintf(outLine, "    const sampler_t imageSampler = %s | %s | %s;\n",
            addressMode, filterMode, normalized);
}

void copy_image_data(image_descriptor *srcImageInfo,
                     image_descriptor *dstImageInfo, void *imageValues,
                     void *destImageValues, const size_t sourcePos[],
                     const size_t destPos[], const size_t regionSize[])
{
    //  assert( srcImageInfo->format == dstImageInfo->format );

    size_t src_mip_level_offset = 0, dst_mip_level_offset = 0;
    size_t sourcePos_lod[3], destPos_lod[3], src_lod, dst_lod;
    size_t src_row_pitch_lod, src_slice_pitch_lod;
    size_t dst_row_pitch_lod, dst_slice_pitch_lod;

    size_t pixelSize = get_pixel_size(srcImageInfo->format);

    sourcePos_lod[0] = sourcePos[0];
    sourcePos_lod[1] = sourcePos[1];
    sourcePos_lod[2] = sourcePos[2];
    destPos_lod[0] = destPos[0];
    destPos_lod[1] = destPos[1];
    destPos_lod[2] = destPos[2];
    src_row_pitch_lod = srcImageInfo->rowPitch;
    dst_row_pitch_lod = dstImageInfo->rowPitch;
    src_slice_pitch_lod = srcImageInfo->slicePitch;
    dst_slice_pitch_lod = dstImageInfo->slicePitch;

    if (srcImageInfo->num_mip_levels > 1)
    {
        size_t src_width_lod = 1 /*srcImageInfo->width*/;
        size_t src_height_lod = 1 /*srcImageInfo->height*/;
        size_t src_depth_lod = 1 /*srcImageInfo->depth*/;

        switch (srcImageInfo->type)
        {
            case CL_MEM_OBJECT_IMAGE1D:
                src_lod = sourcePos[1];
                sourcePos_lod[1] = sourcePos_lod[2] = 0;
                src_width_lod = (srcImageInfo->width >> src_lod)
                    ? (srcImageInfo->width >> src_lod)
                    : 1;
                break;
            case CL_MEM_OBJECT_IMAGE1D_ARRAY:
            case CL_MEM_OBJECT_IMAGE2D:
                src_lod = sourcePos[2];
                sourcePos_lod[1] = sourcePos[1];
                sourcePos_lod[2] = 0;
                src_width_lod = (srcImageInfo->width >> src_lod)
                    ? (srcImageInfo->width >> src_lod)
                    : 1;
                if (srcImageInfo->type == CL_MEM_OBJECT_IMAGE2D)
                    src_height_lod = (srcImageInfo->height >> src_lod)
                        ? (srcImageInfo->height >> src_lod)
                        : 1;
                break;
            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
            case CL_MEM_OBJECT_IMAGE3D:
                src_lod = sourcePos[3];
                sourcePos_lod[1] = sourcePos[1];
                sourcePos_lod[2] = sourcePos[2];
                src_width_lod = (srcImageInfo->width >> src_lod)
                    ? (srcImageInfo->width >> src_lod)
                    : 1;
                src_height_lod = (srcImageInfo->height >> src_lod)
                    ? (srcImageInfo->height >> src_lod)
                    : 1;
                if (srcImageInfo->type == CL_MEM_OBJECT_IMAGE3D)
                    src_depth_lod = (srcImageInfo->depth >> src_lod)
                        ? (srcImageInfo->depth >> src_lod)
                        : 1;
                break;
        }
        src_mip_level_offset = compute_mip_level_offset(srcImageInfo, src_lod);
        src_row_pitch_lod =
            src_width_lod * get_pixel_size(srcImageInfo->format);
        src_slice_pitch_lod = src_row_pitch_lod * src_height_lod;
    }

    if (dstImageInfo->num_mip_levels > 1)
    {
        size_t dst_width_lod = 1 /*dstImageInfo->width*/;
        size_t dst_height_lod = 1 /*dstImageInfo->height*/;
        size_t dst_depth_lod = 1 /*dstImageInfo->depth*/;
        switch (dstImageInfo->type)
        {
            case CL_MEM_OBJECT_IMAGE1D:
                dst_lod = destPos[1];
                destPos_lod[1] = destPos_lod[2] = 0;
                dst_width_lod = (dstImageInfo->width >> dst_lod)
                    ? (dstImageInfo->width >> dst_lod)
                    : 1;
                break;
            case CL_MEM_OBJECT_IMAGE1D_ARRAY:
            case CL_MEM_OBJECT_IMAGE2D:
                dst_lod = destPos[2];
                destPos_lod[1] = destPos[1];
                destPos_lod[2] = 0;
                dst_width_lod = (dstImageInfo->width >> dst_lod)
                    ? (dstImageInfo->width >> dst_lod)
                    : 1;
                if (dstImageInfo->type == CL_MEM_OBJECT_IMAGE2D)
                    dst_height_lod = (dstImageInfo->height >> dst_lod)
                        ? (dstImageInfo->height >> dst_lod)
                        : 1;
                break;
            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
            case CL_MEM_OBJECT_IMAGE3D:
                dst_lod = destPos[3];
                destPos_lod[1] = destPos[1];
                destPos_lod[2] = destPos[2];
                dst_width_lod = (dstImageInfo->width >> dst_lod)
                    ? (dstImageInfo->width >> dst_lod)
                    : 1;
                dst_height_lod = (dstImageInfo->height >> dst_lod)
                    ? (dstImageInfo->height >> dst_lod)
                    : 1;
                if (dstImageInfo->type == CL_MEM_OBJECT_IMAGE3D)
                    dst_depth_lod = (dstImageInfo->depth >> dst_lod)
                        ? (dstImageInfo->depth >> dst_lod)
                        : 1;
                break;
        }
        dst_mip_level_offset = compute_mip_level_offset(dstImageInfo, dst_lod);
        dst_row_pitch_lod =
            dst_width_lod * get_pixel_size(dstImageInfo->format);
        dst_slice_pitch_lod = dst_row_pitch_lod * dst_height_lod;
    }

    // Get initial pointers
    char *sourcePtr = (char *)imageValues
        + sourcePos_lod[2] * src_slice_pitch_lod
        + sourcePos_lod[1] * src_row_pitch_lod + pixelSize * sourcePos_lod[0]
        + src_mip_level_offset;
    char *destPtr = (char *)destImageValues
        + destPos_lod[2] * dst_slice_pitch_lod
        + destPos_lod[1] * dst_row_pitch_lod + pixelSize * destPos_lod[0]
        + dst_mip_level_offset;

    for (size_t z = 0; z < (regionSize[2] > 0 ? regionSize[2] : 1); z++)
    {
        char *rowSourcePtr = sourcePtr;
        char *rowDestPtr = destPtr;
        for (size_t y = 0; y < regionSize[1]; y++)
        {
            memcpy(rowDestPtr, rowSourcePtr, pixelSize * regionSize[0]);
            rowSourcePtr += src_row_pitch_lod;
            rowDestPtr += dst_row_pitch_lod;
        }

        sourcePtr += src_slice_pitch_lod;
        destPtr += dst_slice_pitch_lod;
    }
}

float random_float(float low, float high, MTdata d)
{
    float t = (float)genrand_real1(d);
    return (1.0f - t) * low + t * high;
}

CoordWalker::CoordWalker(void *coords, bool useFloats, size_t vecSize)
{
    if (useFloats)
    {
        mFloatCoords = (cl_float *)coords;
        mIntCoords = NULL;
    }
    else
    {
        mFloatCoords = NULL;
        mIntCoords = (cl_int *)coords;
    }
    mVecSize = vecSize;
}

CoordWalker::~CoordWalker() {}

cl_float CoordWalker::Get(size_t idx, size_t el)
{
    if (mIntCoords != NULL)
        return (cl_float)mIntCoords[idx * mVecSize + el];
    else
        return mFloatCoords[idx * mVecSize + el];
}


void print_read_header(const cl_image_format *format,
                       image_sampler_data *sampler, bool err, int t)
{
    const char *addressMode = NULL;
    const char *normalizedNames[2] = { "UNNORMALIZED", "NORMALIZED" };

    if (sampler->addressing_mode == CL_ADDRESS_CLAMP)
        addressMode = "CL_ADDRESS_CLAMP";
    else if (sampler->addressing_mode == CL_ADDRESS_CLAMP_TO_EDGE)
        addressMode = "CL_ADDRESS_CLAMP_TO_EDGE";
    else if (sampler->addressing_mode == CL_ADDRESS_REPEAT)
        addressMode = "CL_ADDRESS_REPEAT";
    else if (sampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT)
        addressMode = "CL_ADDRESS_MIRRORED_REPEAT";
    else
        addressMode = "CL_ADDRESS_NONE";

    if (t)
    {
        if (err)
            log_error("[%-7s %-24s %d] - %s - %s - %s - %s\n",
                      GetChannelOrderName(format->image_channel_order),
                      GetChannelTypeName(format->image_channel_data_type),
                      (int)get_format_channel_count(format),
                      sampler->filter_mode == CL_FILTER_NEAREST
                          ? "CL_FILTER_NEAREST"
                          : "CL_FILTER_LINEAR",
                      addressMode,
                      normalizedNames[sampler->normalized_coords ? 1 : 0],
                      t == 1 ? "TRANSPOSED" : "NON-TRANSPOSED");
        else
            log_info("[%-7s %-24s %d] - %s - %s - %s - %s\n",
                     GetChannelOrderName(format->image_channel_order),
                     GetChannelTypeName(format->image_channel_data_type),
                     (int)get_format_channel_count(format),
                     sampler->filter_mode == CL_FILTER_NEAREST
                         ? "CL_FILTER_NEAREST"
                         : "CL_FILTER_LINEAR",
                     addressMode,
                     normalizedNames[sampler->normalized_coords ? 1 : 0],
                     t == 1 ? "TRANSPOSED" : "NON-TRANSPOSED");
    }
    else
    {
        if (err)
            log_error("[%-7s %-24s %d] - %s - %s - %s\n",
                      GetChannelOrderName(format->image_channel_order),
                      GetChannelTypeName(format->image_channel_data_type),
                      (int)get_format_channel_count(format),
                      sampler->filter_mode == CL_FILTER_NEAREST
                          ? "CL_FILTER_NEAREST"
                          : "CL_FILTER_LINEAR",
                      addressMode,
                      normalizedNames[sampler->normalized_coords ? 1 : 0]);
        else
            log_info("[%-7s %-24s %d] - %s - %s - %s\n",
                     GetChannelOrderName(format->image_channel_order),
                     GetChannelTypeName(format->image_channel_data_type),
                     (int)get_format_channel_count(format),
                     sampler->filter_mode == CL_FILTER_NEAREST
                         ? "CL_FILTER_NEAREST"
                         : "CL_FILTER_LINEAR",
                     addressMode,
                     normalizedNames[sampler->normalized_coords ? 1 : 0]);
    }
}

void print_write_header(const cl_image_format *format, bool err = false)
{
    if (err)
        log_error("[%-7s %-24s %d]\n",
                  GetChannelOrderName(format->image_channel_order),
                  GetChannelTypeName(format->image_channel_data_type),
                  (int)get_format_channel_count(format));
    else
        log_info("[%-7s %-24s %d]\n",
                 GetChannelOrderName(format->image_channel_order),
                 GetChannelTypeName(format->image_channel_data_type),
                 (int)get_format_channel_count(format));
}


void print_header(const cl_image_format *format, bool err = false)
{
    if (err)
    {
        log_error("[%-7s %-24s %d]\n",
                  GetChannelOrderName(format->image_channel_order),
                  GetChannelTypeName(format->image_channel_data_type),
                  (int)get_format_channel_count(format));
    }
    else
    {
        log_info("[%-7s %-24s %d]\n",
                 GetChannelOrderName(format->image_channel_order),
                 GetChannelTypeName(format->image_channel_data_type),
                 (int)get_format_channel_count(format));
    }
}

bool find_format(cl_image_format *formatList, unsigned int numFormats,
                 cl_image_format *formatToFind)
{
    for (unsigned int i = 0; i < numFormats; i++)
    {
        if (formatList[i].image_channel_order
                == formatToFind->image_channel_order
            && formatList[i].image_channel_data_type
                == formatToFind->image_channel_data_type)
            return true;
    }
    return false;
}

void build_required_image_formats(
    cl_mem_flags flags, cl_mem_object_type image_type, cl_device_id device,
    std::vector<cl_image_format> &formatsToSupport)
{
    formatsToSupport.clear();

    // Minimum list of supported image formats for reading or writing (embedded
    // profile)
    static std::vector<cl_image_format> embeddedProfile_readOrWrite{
        // clang-format off
        { CL_RGBA, CL_UNORM_INT8 },
        { CL_RGBA, CL_UNORM_INT16 },
        { CL_RGBA, CL_SIGNED_INT8 },
        { CL_RGBA, CL_SIGNED_INT16 },
        { CL_RGBA, CL_SIGNED_INT32 },
        { CL_RGBA, CL_UNSIGNED_INT8 },
        { CL_RGBA, CL_UNSIGNED_INT16 },
        { CL_RGBA, CL_UNSIGNED_INT32 },
        { CL_RGBA, CL_HALF_FLOAT },
        { CL_RGBA, CL_FLOAT },
        // clang-format on
    };

    // Minimum list of required image formats for reading or writing
    // num_channels, for all image types.
    static std::vector<cl_image_format> fullProfile_readOrWrite{
        // clang-format off
        { CL_RGBA, CL_UNORM_INT8 },
        { CL_RGBA, CL_UNORM_INT16 },
        { CL_RGBA, CL_SIGNED_INT8 },
        { CL_RGBA, CL_SIGNED_INT16 },
        { CL_RGBA, CL_SIGNED_INT32 },
        { CL_RGBA, CL_UNSIGNED_INT8 },
        { CL_RGBA, CL_UNSIGNED_INT16 },
        { CL_RGBA, CL_UNSIGNED_INT32 },
        { CL_RGBA, CL_HALF_FLOAT },
        { CL_RGBA, CL_FLOAT },
        { CL_BGRA, CL_UNORM_INT8 },
        // clang-format on
    };

    // Minimum list of supported image formats for reading or writing
    // (OpenCL 2.0, 2.1, or 2.2), for all image types.
    static std::vector<cl_image_format> fullProfile_2x_readOrWrite{
        // clang-format off
        { CL_R, CL_UNORM_INT8 },
        { CL_R, CL_UNORM_INT16 },
        { CL_R, CL_SNORM_INT8 },
        { CL_R, CL_SNORM_INT16 },
        { CL_R, CL_SIGNED_INT8 },
        { CL_R, CL_SIGNED_INT16 },
        { CL_R, CL_SIGNED_INT32 },
        { CL_R, CL_UNSIGNED_INT8 },
        { CL_R, CL_UNSIGNED_INT16 },
        { CL_R, CL_UNSIGNED_INT32 },
        { CL_R, CL_HALF_FLOAT },
        { CL_R, CL_FLOAT },
        { CL_RG, CL_UNORM_INT8 },
        { CL_RG, CL_UNORM_INT16 },
        { CL_RG, CL_SNORM_INT8 },
        { CL_RG, CL_SNORM_INT16 },
        { CL_RG, CL_SIGNED_INT8 },
        { CL_RG, CL_SIGNED_INT16 },
        { CL_RG, CL_SIGNED_INT32 },
        { CL_RG, CL_UNSIGNED_INT8 },
        { CL_RG, CL_UNSIGNED_INT16 },
        { CL_RG, CL_UNSIGNED_INT32 },
        { CL_RG, CL_HALF_FLOAT },
        { CL_RG, CL_FLOAT },
        { CL_RGBA, CL_UNORM_INT8 },
        { CL_RGBA, CL_UNORM_INT16 },
        { CL_RGBA, CL_SNORM_INT8 },
        { CL_RGBA, CL_SNORM_INT16 },
        { CL_RGBA, CL_SIGNED_INT8 },
        { CL_RGBA, CL_SIGNED_INT16 },
        { CL_RGBA, CL_SIGNED_INT32 },
        { CL_RGBA, CL_UNSIGNED_INT8 },
        { CL_RGBA, CL_UNSIGNED_INT16 },
        { CL_RGBA, CL_UNSIGNED_INT32 },
        { CL_RGBA, CL_HALF_FLOAT },
        { CL_RGBA, CL_FLOAT },
        { CL_BGRA, CL_UNORM_INT8 },
        // clang-format on
    };

    // Conditional addition to the 2x readOrWrite table:
    // Support for the CL_DEPTH image channel order is required only for 2D
    // images and 2D image arrays.
    static std::vector<cl_image_format> fullProfile_2x_readOrWrite_Depth{
        // clang-format off
        { CL_DEPTH, CL_UNORM_INT16 },
        { CL_DEPTH, CL_FLOAT },
        // clang-format on
    };

    // Conditional addition to the 2x readOrWrite table:
    // Support for reading from the CL_sRGBA image channel order is optional for
    // 1D image buffers. Support for writing to the CL_sRGBA image channel order
    // is optional for all image types.
    static std::vector<cl_image_format> fullProfile_2x_readOrWrite_srgb{
        { CL_sRGBA, CL_UNORM_INT8 },
    };

    // Minimum list of required image formats for reading and writing.
    static std::vector<cl_image_format> fullProfile_readAndWrite{
        // clang-format off
        { CL_R, CL_UNORM_INT8 },
        { CL_R, CL_SIGNED_INT8 },
        { CL_R, CL_SIGNED_INT16 },
        { CL_R, CL_SIGNED_INT32 },
        { CL_R, CL_UNSIGNED_INT8 },
        { CL_R, CL_UNSIGNED_INT16 },
        { CL_R, CL_UNSIGNED_INT32 },
        { CL_R, CL_HALF_FLOAT },
        { CL_R, CL_FLOAT },
        { CL_RGBA, CL_UNORM_INT8 },
        { CL_RGBA, CL_SIGNED_INT8 },
        { CL_RGBA, CL_SIGNED_INT16 },
        { CL_RGBA, CL_SIGNED_INT32 },
        { CL_RGBA, CL_UNSIGNED_INT8 },
        { CL_RGBA, CL_UNSIGNED_INT16 },
        { CL_RGBA, CL_UNSIGNED_INT32 },
        { CL_RGBA, CL_HALF_FLOAT },
        { CL_RGBA, CL_FLOAT },
        // clang-format on
    };

    // Embedded profile
    if (gIsEmbedded)
    {
        copy(embeddedProfile_readOrWrite.begin(),
             embeddedProfile_readOrWrite.end(),
             back_inserter(formatsToSupport));
    }
    // Full profile
    else
    {
        Version version = get_device_cl_version(device);
        if (version < Version(2, 0) || version >= Version(3, 0))
        {
            // Full profile, OpenCL 1.2 or 3.0.
            if (flags & CL_MEM_KERNEL_READ_AND_WRITE)
            {
                // Note: assumes that read-write images are supported!
                copy(fullProfile_readAndWrite.begin(),
                     fullProfile_readAndWrite.end(),
                     back_inserter(formatsToSupport));
            }
            else
            {
                copy(fullProfile_readOrWrite.begin(),
                     fullProfile_readOrWrite.end(),
                     back_inserter(formatsToSupport));
            }
        }
        else
        {
            // Full profile, OpenCL 2.0, 2.1, 2.2.
            if (flags & CL_MEM_KERNEL_READ_AND_WRITE)
            {
                copy(fullProfile_readAndWrite.begin(),
                     fullProfile_readAndWrite.end(),
                     back_inserter(formatsToSupport));
            }
            else
            {
                copy(fullProfile_2x_readOrWrite.begin(),
                     fullProfile_2x_readOrWrite.end(),
                     back_inserter(formatsToSupport));

                // Support for the CL_DEPTH image channel order is required only
                // for 2D images and 2D image arrays.
                if (image_type == CL_MEM_OBJECT_IMAGE2D
                    || image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
                {
                    copy(fullProfile_2x_readOrWrite_Depth.begin(),
                         fullProfile_2x_readOrWrite_Depth.end(),
                         back_inserter(formatsToSupport));
                }

                // Support for reading from the CL_sRGBA image channel order is
                // optional for 1D image buffers. Support for writing to the
                // CL_sRGBA image channel order is optional for all image types.
                if (image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER
                    && flags == CL_MEM_READ_ONLY)
                {
                    copy(fullProfile_2x_readOrWrite_srgb.begin(),
                         fullProfile_2x_readOrWrite_srgb.end(),
                         back_inserter(formatsToSupport));
                }
            }
        }
    }
}

bool is_image_format_required(cl_image_format format, cl_mem_flags flags,
                              cl_mem_object_type image_type,
                              cl_device_id device)
{
    std::vector<cl_image_format> formatsToSupport;
    build_required_image_formats(flags, image_type, device, formatsToSupport);

    for (auto &formatItr : formatsToSupport)
    {
        if (formatItr.image_channel_order == format.image_channel_order
            && formatItr.image_channel_data_type
                == format.image_channel_data_type)
        {
            return true;
        }
    }

    return false;
}

cl_uint compute_max_mip_levels(size_t width, size_t height, size_t depth)
{
    cl_uint retMaxMipLevels = 0;
    size_t max_dim = 0;

    max_dim = width;
    max_dim = height > max_dim ? height : max_dim;
    max_dim = depth > max_dim ? depth : max_dim;

    while (max_dim)
    {
        retMaxMipLevels++;
        max_dim >>= 1;
    }
    return retMaxMipLevels;
}

cl_ulong compute_mipmapped_image_size(image_descriptor imageInfo)
{
    cl_ulong retSize = 0;
    size_t curr_width, curr_height, curr_depth, curr_array_size;
    curr_width = imageInfo.width;
    curr_height = imageInfo.height;
    curr_depth = imageInfo.depth;
    curr_array_size = imageInfo.arraySize;

    for (int i = 0; i < (int)imageInfo.num_mip_levels; i++)
    {
        switch (imageInfo.type)
        {
            case CL_MEM_OBJECT_IMAGE3D:
                retSize += (cl_ulong)curr_width * curr_height * curr_depth
                    * get_pixel_size(imageInfo.format);
                break;
            case CL_MEM_OBJECT_IMAGE2D:
                retSize += (cl_ulong)curr_width * curr_height
                    * get_pixel_size(imageInfo.format);
                break;
            case CL_MEM_OBJECT_IMAGE1D:
                retSize +=
                    (cl_ulong)curr_width * get_pixel_size(imageInfo.format);
                break;
            case CL_MEM_OBJECT_IMAGE1D_ARRAY:
                retSize += (cl_ulong)curr_width * curr_array_size
                    * get_pixel_size(imageInfo.format);
                break;
            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
                retSize += (cl_ulong)curr_width * curr_height * curr_array_size
                    * get_pixel_size(imageInfo.format);
                break;
        }

        switch (imageInfo.type)
        {
            case CL_MEM_OBJECT_IMAGE3D:
                curr_depth = curr_depth >> 1 ? curr_depth >> 1 : 1;
            case CL_MEM_OBJECT_IMAGE2D:
            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
                curr_height = curr_height >> 1 ? curr_height >> 1 : 1;
            case CL_MEM_OBJECT_IMAGE1D:
            case CL_MEM_OBJECT_IMAGE1D_ARRAY:
                curr_width = curr_width >> 1 ? curr_width >> 1 : 1;
        }
    }

    return retSize;
}

size_t compute_mip_level_offset(image_descriptor *imageInfo, size_t lod)
{
    size_t retOffset = 0;
    size_t width, height, depth;
    width = imageInfo->width;
    height = imageInfo->height;
    depth = imageInfo->depth;

    for (size_t i = 0; i < lod; i++)
    {
        switch (imageInfo->type)
        {
            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
                retOffset += (size_t)width * height * imageInfo->arraySize
                    * get_pixel_size(imageInfo->format);
                break;
            case CL_MEM_OBJECT_IMAGE3D:
                retOffset += (size_t)width * height * depth
                    * get_pixel_size(imageInfo->format);
                break;
            case CL_MEM_OBJECT_IMAGE1D_ARRAY:
                retOffset += (size_t)width * imageInfo->arraySize
                    * get_pixel_size(imageInfo->format);
                break;
            case CL_MEM_OBJECT_IMAGE2D:
                retOffset +=
                    (size_t)width * height * get_pixel_size(imageInfo->format);
                break;
            case CL_MEM_OBJECT_IMAGE1D:
                retOffset += (size_t)width * get_pixel_size(imageInfo->format);
                break;
        }

        // Compute next lod dimensions
        switch (imageInfo->type)
        {
            case CL_MEM_OBJECT_IMAGE3D: depth = (depth >> 1) ? (depth >> 1) : 1;
            case CL_MEM_OBJECT_IMAGE2D:
            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
                height = (height >> 1) ? (height >> 1) : 1;
            case CL_MEM_OBJECT_IMAGE1D_ARRAY:
            case CL_MEM_OBJECT_IMAGE1D: width = (width >> 1) ? (width >> 1) : 1;
        }
    }
    return retOffset;
}

const char *convert_image_type_to_string(cl_mem_object_type image_type)
{
    switch (image_type)
    {
        case CL_MEM_OBJECT_IMAGE1D: return "1D";
        case CL_MEM_OBJECT_IMAGE2D: return "2D";
        case CL_MEM_OBJECT_IMAGE3D: return "3D";
        case CL_MEM_OBJECT_IMAGE1D_ARRAY: return "1D array";
        case CL_MEM_OBJECT_IMAGE2D_ARRAY: return "2D array";
        case CL_MEM_OBJECT_IMAGE1D_BUFFER: return "1D image buffer";
        default: return "unrecognized object type";
    }
}