OpenCL-CTS/test_conformance/images/image_helpers.cpp

//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "image_helpers.h"
#include <math.h>
#include <limits.h>
#include <float.h>
#if defined( __APPLE__ )
#include <sys/mman.h>
#endif


// Global test counters; both start at zero.
// NOTE(review): presumably incremented by the individual image tests - confirm against callers.
int gTestCount = 0;
int gTestFailure = 0;
// Rounding mode consulted by convert_float_to_half() to pick the float->half conversion routine.
RoundingMode gFloatToHalfRoundingMode = kDefaultRoundingMode;

// Forward declarations of the two float->half converters defined later in this file.
static cl_ushort float2half_rte( float f );
static cl_ushort float2half_rtz( float f );


// Translates a channel data type name into the matching cl_channel_type.
// The name is accepted either in full ("CL_FLOAT") or with the first three
// characters (the "CL_" prefix) left off ("FLOAT").
// Returns (cl_channel_type)-1 when no table entry matches.
cl_channel_type  get_channel_type_from_name( const char *name )
{
    struct ChannelTypeEntry
    {
        cl_channel_type type;
        const char *name;
    };

    ChannelTypeEntry knownTypes[] = {
        { CL_SNORM_INT8, "CL_SNORM_INT8" },
        { CL_SNORM_INT16, "CL_SNORM_INT16" },
        { CL_UNORM_INT8, "CL_UNORM_INT8" },
        { CL_UNORM_INT16, "CL_UNORM_INT16" },
        { CL_UNORM_SHORT_565, "CL_UNORM_SHORT_565" },
        { CL_UNORM_SHORT_555, "CL_UNORM_SHORT_555" },
        { CL_UNORM_INT_101010, "CL_UNORM_INT_101010" },
        { CL_SIGNED_INT8, "CL_SIGNED_INT8" },
        { CL_SIGNED_INT16, "CL_SIGNED_INT16" },
        { CL_SIGNED_INT32, "CL_SIGNED_INT32" },
        { CL_UNSIGNED_INT8, "CL_UNSIGNED_INT8" },
        { CL_UNSIGNED_INT16, "CL_UNSIGNED_INT16" },
        { CL_UNSIGNED_INT32, "CL_UNSIGNED_INT32" },
        { CL_HALF_FLOAT, "CL_HALF_FLOAT" },
        { CL_FLOAT, "CL_FLOAT" },
#ifdef CL_SFIXED14_APPLE
        { CL_SFIXED14_APPLE, "CL_SFIXED14_APPLE" }
#endif
    };

    const size_t entryCount = sizeof( knownTypes ) / sizeof( knownTypes[ 0 ] );
    for( size_t idx = 0; idx < entryCount; ++idx )
    {
        const char *fullName  = knownTypes[ idx ].name;
        const char *shortName = fullName + 3;   // same name without "CL_"

        if( strcmp( fullName, name ) != 0 && strcmp( shortName, name ) != 0 )
            continue;

        return knownTypes[ idx ].type;
    }

    // No entry matched
    return (cl_channel_type)-1;
}

// Translates a channel order name into the matching cl_channel_order.
// The name is accepted either in full ("CL_RGBA") or with the first three
// characters (the "CL_" prefix) left off ("RGBA").
// Returns (cl_channel_order)-1 when no table entry matches.
cl_channel_order  get_channel_order_from_name( const char *name )
{
    struct ChannelOrderEntry
    {
        cl_channel_order    order;
        const char          *name;
    };

    const ChannelOrderEntry knownOrders[] =
    {
        { CL_R, "CL_R" },
        { CL_A, "CL_A" },
        { CL_Rx, "CL_Rx" },
        { CL_RG, "CL_RG" },
        { CL_RA, "CL_RA" },
        { CL_RGx, "CL_RGx" },
        { CL_RGB, "CL_RGB" },
        { CL_RGBx, "CL_RGBx" },
        { CL_RGBA, "CL_RGBA" },
        { CL_BGRA, "CL_BGRA" },
        { CL_ARGB, "CL_ARGB" },
        { CL_INTENSITY, "CL_INTENSITY"},
        { CL_LUMINANCE, "CL_LUMINANCE"},
#ifdef CL_1RGB_APPLE
        { CL_1RGB_APPLE, "CL_1RGB_APPLE" },
#endif
#ifdef CL_BGR1_APPLE
        { CL_BGR1_APPLE, "CL_BGR1_APPLE" },
#endif
    };

    const size_t entryCount = sizeof( knownOrders ) / sizeof( knownOrders[ 0 ] );
    for( size_t idx = 0; idx < entryCount; ++idx )
    {
        const char *fullName  = knownOrders[ idx ].name;
        const char *shortName = fullName + 3;   // same name without "CL_"

        if( strcmp( fullName, name ) != 0 && strcmp( shortName, name ) != 0 )
            continue;

        return knownOrders[ idx ].order;
    }

    // No entry matched
    return (cl_channel_order)-1;
}

// Returns a pseudo-random integer in [minV, maxV].
// A uniform value u in [0, 1] is drawn from the generator and mapped through
// log2(1 + u), which also lies in [0, 1], before being scaled to the range.
//   minV, maxV - inclusive bounds of the result
//   d          - random number generator state passed to genrand_int32()
int random_log_in_range( int minV, int maxV, MTdata d  )
{
    // genrand_int32() produces a 32-bit value, so normalise by the full
    // 32-bit maximum.  Dividing by 0x7fffffff (as was done previously) lets
    // u reach 2 and the result overshoot maxV.
    double u = (double)genrand_int32(d) / (double)0xffffffffU;
    double v = log2( u + 1 );
    int iv = (int)( (float)( maxV - minV ) * v );
    return iv + minV;
}


// Define the addressing functions
// Every integer addressing function takes a coordinate and the image extent
// along that axis and returns the adjusted coordinate.
typedef int (*AddressFn)( int value, size_t maxValue );

// Identity addressing: the coordinate is passed through untouched.
int         NoAddressFn( int value, size_t maxValue )
{
    (void)maxValue;     // the extent plays no part here
    return value;
}
// Repeat addressing for integer coordinates: a coordinate that falls off
// either end of [0, maxValue) is shifted by one extent (a single wrap only).
int         RepeatAddressFn( int value, size_t maxValue )
{
    const int extent = (int)maxValue;

    if( value < 0 )
        return value + extent;
    if( value >= extent )
        return value - extent;
    return value;
}
// Integer stage of mirrored-repeat addressing: the coordinate is limited to
// [0, maxValue - 1].  (The mirroring itself is done on the normalized
// coordinate by MirroredRepeatNormalizedAddressFn.)
int         MirroredRepeatAddressFn( int value, size_t maxValue )
{
    if( value < 0 )
        return 0;

    return ( (size_t) value >= maxValue ) ? (int) (maxValue - 1) : value;
}
// Clamp addressing: limits the coordinate to [-1, maxValue], i.e. one texel
// beyond each edge of the image is still representable.
int         ClampAddressFn( int value, size_t maxValue )
{
    if( value < -1 )
        return -1;
    if( value > (cl_long) maxValue )
        return (int)maxValue;
    return value;
}
// Clamp-to-edge addressing: limits the coordinate to [0, maxValue - 1].
int         ClampToEdgeNearestFn( int value, size_t maxValue )
{
    if( value < 0 )
        return 0;
    if( (size_t)value > maxValue - 1 )
        return (int)maxValue - 1;
    return value;
}
// Clamp-to-edge with linear filtering reuses the nearest-filter implementation.
AddressFn   ClampToEdgeLinearFn                                                 = ClampToEdgeNearestFn;

// Note: normalized coords get repeated in normalized space, not unnormalized space! hence the special case here
// Volatile scratch variable: the x86 path below stores the fractional part
// here so the intermediate is written out as a float before the multiply.
volatile float gFloatHome;
// Repeat addressing for a normalized coordinate: takes the fractional part of
// fValue (fValue - floorf(fValue)) and scales it by maxValue, returning the
// un-normalized coordinate as a float.
float           RepeatNormalizedAddressFn( float fValue, size_t maxValue )
{
#if !defined( __i386__ ) && !defined( __x86_64__ ) // Use original if not the x86 compiler.
    // General computation for repeat
    return (fValue - floorf( fValue )) * (float) maxValue; // Reduce to [0, 1.f]
#else // Otherwise, use this instead:
    // Home the subtraction to a float to break up the sequence of x87
    // instructions emitted by the VS compiler.
    gFloatHome = fValue - floorf(fValue);
    return gFloatHome * (float)maxValue;
#endif
}

// Mirrored-repeat addressing for a normalized coordinate.  The coordinate is
// folded onto [0, 1] by measuring its distance from the nearest even integer
// and is then scaled by maxValue to give the un-normalized coordinate.
float           MirroredRepeatNormalizedAddressFn( float fValue, size_t maxValue )
{
    // Nearest multiple of two.  Exact halfway inputs may round either way
    // under round-to-nearest-even, but both choices give the same distance.
    const float nearestEven = 2.0f * rintf( fValue * 0.5f );

    // Distance from that even integer: the difference lies in [-1, 1] and the
    // absolute value applies the mirroring, leaving [0, 1].
    const float mirrored = fabsf( fValue - nearestEven );

    // Back to texel space
    return mirrored * (float) maxValue;
}

// Lookup table that maps a sampler's (addressing mode, filter mode) pair to
// the integer addressing function used for it.  Both dimensions are indexed
// by "enum value minus the first enum value" (CL_ADDRESS_NONE and
// CL_FILTER_NEAREST respectively).
struct AddressingTable
{
    AddressingTable()
    {
        // The largest index used in each dimension must fit the table.  The
        // filter check has to use LINEAR - NEAREST, the same expression used
        // as the index below; with the operands reversed it would not check
        // that index at all.
        ct_assert( ( CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE < 6 ) );
        ct_assert( CL_FILTER_LINEAR - CL_FILTER_NEAREST < 2 );

        // Give every slot a defined value first, so that a row with no
        // assignment below reads as NULL no matter how the object is created.
        for( size_t mode = 0; mode < sizeof( mTable ) / sizeof( mTable[ 0 ] ); mode++ )
            for( size_t filter = 0; filter < sizeof( mTable[ 0 ] ) / sizeof( mTable[ 0 ][ 0 ] ); filter++ )
                mTable[ mode ][ filter ] = NULL;

        mTable[ CL_ADDRESS_NONE - CL_ADDRESS_NONE ][ CL_FILTER_NEAREST - CL_FILTER_NEAREST ]            = NoAddressFn;
        mTable[ CL_ADDRESS_NONE - CL_ADDRESS_NONE ][ CL_FILTER_LINEAR - CL_FILTER_NEAREST ]             = NoAddressFn;
        mTable[ CL_ADDRESS_REPEAT - CL_ADDRESS_NONE ][ CL_FILTER_NEAREST - CL_FILTER_NEAREST ]          = RepeatAddressFn;
        mTable[ CL_ADDRESS_REPEAT - CL_ADDRESS_NONE ][ CL_FILTER_LINEAR - CL_FILTER_NEAREST ]           = RepeatAddressFn;
        mTable[ CL_ADDRESS_CLAMP_TO_EDGE - CL_ADDRESS_NONE ][ CL_FILTER_NEAREST - CL_FILTER_NEAREST ]   = ClampToEdgeNearestFn;
        mTable[ CL_ADDRESS_CLAMP_TO_EDGE - CL_ADDRESS_NONE ][ CL_FILTER_LINEAR - CL_FILTER_NEAREST ]    = ClampToEdgeLinearFn;
        mTable[ CL_ADDRESS_CLAMP - CL_ADDRESS_NONE ][ CL_FILTER_NEAREST - CL_FILTER_NEAREST ]           = ClampAddressFn;
        mTable[ CL_ADDRESS_CLAMP - CL_ADDRESS_NONE ][ CL_FILTER_LINEAR - CL_FILTER_NEAREST ]            = ClampAddressFn;
        mTable[ CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE ][ CL_FILTER_NEAREST - CL_FILTER_NEAREST ] = MirroredRepeatAddressFn;
        mTable[ CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE ][ CL_FILTER_LINEAR - CL_FILTER_NEAREST ]  = MirroredRepeatAddressFn;
    }

    // Returns the addressing function for the sampler's addressing/filter
    // modes.  No range check is performed on either mode.
    AddressFn operator[]( image_sampler_data *sampler )
    {
        return mTable[ (int)sampler->addressing_mode - CL_ADDRESS_NONE ][ (int)sampler->filter_mode - CL_FILTER_NEAREST ];
    }

    // [addressing mode index][filter mode index]
    AddressFn mTable[ 6 ][ 2 ];
};

// Single file-local instance of the table.
static AddressingTable  sAddressingTable;


// Format helpers

// Returns 1 for the channel orders this helper treats as having an alpha
// component and 0 for the others.  An unrecognised channel order is logged
// as an error and reported as 0.
int has_alpha(cl_image_format *format) {
    const cl_channel_order order = format->image_channel_order;

    switch (order) {
        // Orders reported as having no alpha
        case CL_R:
        case CL_RG:
        case CL_RGB:
        case CL_LUMINANCE:
            return 0;

        // Orders reported as having alpha
        case CL_A:
        case CL_Rx:
        case CL_RA:
        case CL_RGx:
        case CL_RGBx:
        case CL_RGBA:
        case CL_BGRA:
        case CL_ARGB:
        case CL_INTENSITY:
#ifdef CL_BGR1_APPLE
        case CL_BGR1_APPLE:
#endif
#ifdef CL_1RGB_APPLE
        case CL_1RGB_APPLE:
#endif
            return 1;

        default:
            break;
    }

    log_error("Invalid image channel order: %d\n", order);
    return 0;
}

// Debug switch, defined as 0 (off).
// NOTE(review): not referenced in the portion of the file reviewed here - confirm it is still needed.
#define PRINT_MAX_SIZE_LOGIC 0

// XOR swap of two integer lvalues.  The arguments are expanded without
// parentheses, so pass plain variables only; if both arguments name the same
// object the result is zero rather than a no-op.
#define SWAP( _a, _b )      do{ _a ^= _b; _b ^= _a; _a ^= _b; }while(0)
// Larger of two values.  The winning argument is evaluated twice, so avoid
// arguments with side effects.  Only defined here if no MAX exists already.
#ifndef MAX
    #define MAX( _a, _b )   ((_a) > (_b) ? (_a) : (_b))
#endif

void get_max_sizes(size_t *numberOfSizes, const int maxNumberOfSizes,
                   size_t sizes[][3], size_t maxWidth, size_t maxHeight, size_t maxDepth, size_t maxArraySize,
                   const cl_ulong maxIndividualAllocSize,       // CL_DEVICE_MAX_MEM_ALLOC_SIZE
                   const cl_ulong maxTotalAllocSize,            // CL_DEVICE_GLOBAL_MEM_SIZE
                   cl_mem_object_type image_type, cl_image_format *format) {

    bool is3D = (image_type == CL_MEM_OBJECT_IMAGE3D);
    bool isArray = (image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY);

    // Validate we have a reasonable max depth for 3D
    if (is3D && maxDepth < 2) {
        log_error("ERROR: Requesting max image sizes for 3D images when max depth is < 2.\n");
        *numberOfSizes = 0;
        return;
    }
    // Validate we have a reasonable max array size for 1D & 2D image arrays
    if (isArray && maxArraySize < 2) {
        log_error("ERROR: Requesting max image sizes for an image array when max array size is < 1.\n");
        *numberOfSizes = 0;
        return;
    }

    // Reduce the maximum because we are trying to test the max image dimensions, not the memory allocation
    cl_ulong adjustedMaxTotalAllocSize = maxTotalAllocSize / 4;
    cl_ulong adjustedMaxIndividualAllocSize = maxIndividualAllocSize / 4;
    log_info("Note: max individual allocation adjusted down from %gMB to %gMB and max total allocation adjusted down from %gMB to %gMB.\n",
             maxIndividualAllocSize/(1024.0*1024.0), adjustedMaxIndividualAllocSize/(1024.0*1024.0),
             maxTotalAllocSize/(1024.0*1024.0), adjustedMaxTotalAllocSize/(1024.0*1024.0));

    // Cap our max allocation to 1.5GB.
    // FIXME -- why?  In the interest of not taking a long time?  We should still test this stuff...
    if (adjustedMaxTotalAllocSize > (cl_ulong)2048*1024*1024) {
        adjustedMaxTotalAllocSize = (cl_ulong)2048*1024*1024;
        log_info("Limiting max total allocation size to %gMB (down from %gMB) for test.\n",
                 adjustedMaxTotalAllocSize/(1024.0*1024.0), maxTotalAllocSize/(1024.0*1024.0));
    }

    cl_ulong maxAllocSize = adjustedMaxIndividualAllocSize;
    if (adjustedMaxTotalAllocSize < adjustedMaxIndividualAllocSize*2)
        maxAllocSize = adjustedMaxTotalAllocSize/2;

    size_t raw_pixel_size = get_pixel_size(format);
    size_t max_pixels = (size_t)maxAllocSize / raw_pixel_size;

    log_info("Maximums: [%ld x %ld x %ld], raw pixel size %lu bytes, per-allocation limit %gMB.\n",
             maxWidth, maxHeight, isArray ? maxArraySize : maxDepth, raw_pixel_size, (maxAllocSize/(1024.0*1024.0)));

  // Keep track of the maximum sizes for each dimension
  size_t maximum_sizes[] = { maxWidth, maxHeight, maxDepth };

  switch (image_type) {
    case CL_MEM_OBJECT_IMAGE1D_ARRAY:
      maximum_sizes[1] = maxArraySize;
      maximum_sizes[2] = 1;
      break;
    case CL_MEM_OBJECT_IMAGE2D_ARRAY:
      maximum_sizes[2] = maxArraySize;
      break;
  }


  // Given one fixed sized dimension, this code finds one or two other dimensions,
  // both with very small size, such that the size does not exceed the maximum
  // passed to this function

  size_t other_sizes[] = { 2, 7, 13, 18, 21, 29, 33, 36 };
  static size_t other_size = 0;
  enum { num_other_sizes = sizeof(other_sizes)/sizeof(size_t) };

  (*numberOfSizes) = 0;

  if (image_type == CL_MEM_OBJECT_IMAGE1D) {

    double M = maximum_sizes[0];

    // Store the size
    sizes[(*numberOfSizes)][0] = (size_t)M;
    sizes[(*numberOfSizes)][1] = 1;
    sizes[(*numberOfSizes)][2] = 1;
    ++(*numberOfSizes);
  }

  else if (image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || image_type == CL_MEM_OBJECT_IMAGE2D) {

    for (int fixed_dim=0;fixed_dim<2;++fixed_dim) {

      // Determine the size of the fixed dimension
      double M = maximum_sizes[fixed_dim];
      double A = max_pixels;

      int x0_dim = !fixed_dim;
      double x0  = fmin(fmin(other_sizes[(other_size++)%num_other_sizes],A/M), maximum_sizes[x0_dim]);

      // Store the size
      sizes[(*numberOfSizes)][fixed_dim] = (size_t)M;
      sizes[(*numberOfSizes)][x0_dim]    = (size_t)x0;
      sizes[(*numberOfSizes)][2]         = 1;
      ++(*numberOfSizes);
    }
  }

  else if (image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY || image_type == CL_MEM_OBJECT_IMAGE3D) {

    // Iterate over dimensions, finding sizes for the non-fixed dimension
    for (int fixed_dim=0;fixed_dim<3;++fixed_dim) {

      // Determine the size of the fixed dimension
      double M = maximum_sizes[fixed_dim];
      double A = max_pixels;

      // Find two other dimensions, x0 and x1
      int x0_dim = (fixed_dim == 0) ? 1 : 0;
      int x1_dim = (fixed_dim == 2) ? 1 : 2;

      // Choose two other sizes for these dimensions
      double x0 = fmin(fmin(A/M,maximum_sizes[x0_dim]),other_sizes[(other_size++)%num_other_sizes]);
      // GPUs have certain restrictions on minimum width (row alignment) of images which has given us issues
      // testing small widths in this test (say we set width to 3 for testing, and compute size based on this width and decide
      // it fits within vram ... but GPU driver decides that, due to row alignment requirements, it has to use
      // width of 16 which doesnt fit in vram). For this purpose we are not testing width < 16 for this test.
      if(x0_dim == 0 && x0 < 16)
        x0 = 16;
      double x1 = fmin(fmin(A/M/x0,maximum_sizes[x1_dim]),other_sizes[(other_size++)%num_other_sizes]);
      // Store the size
      sizes[(*numberOfSizes)][fixed_dim] = (size_t)M;
      sizes[(*numberOfSizes)][x0_dim]    = (size_t)x0;
      sizes[(*numberOfSizes)][x1_dim]    = (size_t)x1;
      ++(*numberOfSizes);
    }
  }

  // Log the results
  for (int j=0; j<(int)(*numberOfSizes); j++) {
    switch (image_type) {
      case CL_MEM_OBJECT_IMAGE1D:
        log_info(" size[%d] = [%ld] (%g MB image)\n",
                 j, sizes[j][0], raw_pixel_size*sizes[j][0]*sizes[j][1]*sizes[j][2]/(1024.0*1024.0));
        break;
      case CL_MEM_OBJECT_IMAGE1D_ARRAY:
      case CL_MEM_OBJECT_IMAGE2D:
        log_info(" size[%d] = [%ld %ld] (%g MB image)\n",
                 j, sizes[j][0], sizes[j][1], raw_pixel_size*sizes[j][0]*sizes[j][1]*sizes[j][2]/(1024.0*1024.0));
        break;
      case CL_MEM_OBJECT_IMAGE2D_ARRAY:
      case CL_MEM_OBJECT_IMAGE3D:
        log_info(" size[%d] = [%ld %ld %ld] (%g MB image)\n",
                 j, sizes[j][0], sizes[j][1], sizes[j][2], raw_pixel_size*sizes[j][0]*sizes[j][1]*sizes[j][2]/(1024.0*1024.0));
        break;
    }
  }
}

// Returns the absolute error tolerated when comparing filtered image reads.
// Nearest filtering tolerates no absolute error.  For other filter modes the
// tolerance depends on the channel data type; types not listed get 0.
float get_max_absolute_error( cl_image_format *format, image_sampler_data *sampler) {
    float tolerance = 0.0f;

    if (sampler->filter_mode != CL_FILTER_NEAREST) {
        switch (format->image_channel_data_type) {
            case CL_SNORM_INT8:
                tolerance = 1.0f/127.0f;
                break;
            case CL_UNORM_INT8:
                tolerance = 1.0f/255.0f;
                break;
            case CL_UNORM_INT16:
                tolerance = 1.0f/65535.0f;
                break;
            case CL_SNORM_INT16:
                tolerance = 1.0f/32767.0f;
                break;
            case CL_FLOAT:
                tolerance = CL_FLT_MIN;
                break;
#ifdef  CL_SFIXED14_APPLE
            case CL_SFIXED14_APPLE:
                tolerance = 0x1.0p-14f;
                break;
#endif
            default:
                break;
        }
    }

    return tolerance;
}

// Returns the relative error tolerated when comparing image read results.
//   format         - image format; the channel data type selects the base tolerance
//   sampler        - sampler settings; addressing mode, filter mode and
//                    normalized_coords decide whether sampling error is added
//   is3D           - non-zero for 3D images (8 samples per linear fetch instead of 4)
//   isLinearFilter - non-zero if linear filtering is in use
// The result is built in three steps: a per-format term, a sampling term
// (platform dependent, see the #if below) and a fixed allowance for the
// error calculation itself.
float get_max_relative_error( cl_image_format *format, image_sampler_data *sampler, int is3D, int isLinearFilter )
{
    float maxError = 0.0f;
    // Number of texels contributing to one filtered result
    float sampleCount = 1.0f;
    if( isLinearFilter )
        sampleCount =  is3D ? 8.0f : 4.0f;

    // Note that the ULP is defined here as the unit in the last place of the maximum
    // magnitude sample used for filtering.

    // Section 8.3
    // Data types not listed in this switch contribute no per-format error.
    switch( format->image_channel_data_type )
    {
            // The spec allows 2 ulps of error for normalized formats
        case CL_SNORM_INT8:
        case CL_UNORM_INT8:
        case CL_SNORM_INT16:
        case CL_UNORM_INT16:
        case CL_UNORM_SHORT_565:
        case CL_UNORM_SHORT_555:
        case CL_UNORM_INT_101010:
            maxError = 2*FLT_EPSILON*sampleCount;       // Maximum sampling error for round to zero normalization based on multiplication
            // by reciprocal (using reciprocal generated in round to +inf mode, so that 1.0 matches spec)
            break;

            // If the implementation supports these formats then it will have to allow rounding error here too,
            // because not all 32-bit ints are exactly representable in float
        case CL_SIGNED_INT32:
        case CL_UNSIGNED_INT32:
            maxError = 1*FLT_EPSILON;
            break;
    }


    // Section 8.2
    // The braced block that follows this condition is selected by the
    // preprocessor: one variant for Apple platforms, one for everything else.
    if( sampler->addressing_mode == CL_ADDRESS_REPEAT || sampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT || sampler->filter_mode != CL_FILTER_NEAREST || sampler->normalized_coords )
#if defined( __APPLE__ )
    {
        if( sampler->filter_mode != CL_FILTER_NEAREST )
        {
            extern cl_device_type   gDeviceType;
            // The maximum
            if( gDeviceType == CL_DEVICE_TYPE_GPU )
                maxError += MAKE_HEX_FLOAT(0x1.0p-4f, 0x1L, -4);              // Some GPUs ain't so accurate
            else
                // The standard method of 2d linear filtering delivers 4.0 ulps of error in round to nearest (8 in rtz).
                maxError += 4.0f * FLT_EPSILON;
        }
        else
            maxError += 4.0f * FLT_EPSILON;    // normalized coordinates will introduce some error into the fractional part of the address, affecting results
    }
#else
    {
#if !defined(_WIN32)
#warning Implementations will likely wish to pick a max allowable sampling error policy here that is better than the spec
#endif
        // The spec allows linear filters to return any result most of the time.
        // That's fine for implementations but a problem for testing. After all
        // users aren't going to like garbage images.  We have "picked a number"
        // here that we are going to attempt to conform to. Implementations are
        // free to pick another number, like infinity, if they like.
        // We picked a number for you, to provide /some/ sanity
        // Note: this is an assignment, so it replaces the per-format term
        // computed above rather than adding to it.
        maxError = MAKE_HEX_FLOAT(0x1.0p-7f, 0x1L, -7);
        // ...but this is what the spec allows:
        // maxError = INFINITY;
        // Please feel free to pick any positive number. (NaN wont work.)
    }
#endif

    // The error calculation itself can introduce error
    maxError += FLT_EPSILON * 2;

    return maxError;
}

// Returns the largest integer value associated with the format's channel
// data type (for example 255 for 8-bit unsigned types).  CL_HALF_FLOAT
// reports 1<<10, and channel data types not listed report 0.
size_t get_format_max_int( cl_image_format *format )
{
    size_t maxInt = 0;

    switch( format->image_channel_data_type )
    {
        // 8-bit types
        case CL_SNORM_INT8:
        case CL_SIGNED_INT8:
            maxInt = 127;
            break;
        case CL_UNORM_INT8:
        case CL_UNSIGNED_INT8:
            maxInt = 255;
            break;

        // 16-bit types
        case CL_SNORM_INT16:
        case CL_SIGNED_INT16:
            maxInt = 32767;
            break;
        case CL_UNORM_INT16:
        case CL_UNSIGNED_INT16:
            maxInt = 65535;
            break;

        // 32-bit types
        case CL_SIGNED_INT32:
            maxInt = 2147483647L;
            break;
        case CL_UNSIGNED_INT32:
            maxInt = 4294967295LL;
            break;

        // Packed types
        case CL_UNORM_SHORT_565:
        case CL_UNORM_SHORT_555:
            maxInt = 31;
            break;
        case CL_UNORM_INT_101010:
            maxInt = 1023;
            break;

        case CL_HALF_FLOAT:
            maxInt = 1<<10;
            break;

#ifdef CL_SFIXED14_APPLE
        case CL_SFIXED14_APPLE:
            maxInt = 16384;
            break;
#endif
        default:
            break;
    }

    return maxInt;
}

// Returns the smallest integer value associated with the format's channel
// data type: the negative limit for signed types, 0 for unsigned and packed
// types.  CL_HALF_FLOAT reports -(1<<10), and channel data types not listed
// report 0.
int get_format_min_int( cl_image_format *format )
{
    switch( format->image_channel_data_type )
    {
        case CL_SNORM_INT8:
        case CL_SIGNED_INT8:
            return -128;
        case CL_UNORM_INT8:
        case CL_UNSIGNED_INT8:
            return 0;

        case CL_SNORM_INT16:
        case CL_SIGNED_INT16:
            return -32768;

        case CL_UNORM_INT16:
        case CL_UNSIGNED_INT16:
            return 0;

        case CL_SIGNED_INT32:
            return -2147483648LL;

        case CL_UNSIGNED_INT32:
            return 0;

        case CL_UNORM_SHORT_565:
        case CL_UNORM_SHORT_555:
        case CL_UNORM_INT_101010:
            return 0;

        case CL_HALF_FLOAT:
            // Negate after shifting: "-1<<10" parses as (-1)<<10, and
            // left-shifting a negative value is undefined behavior.
            return -( 1 << 10 );

#ifdef CL_SFIXED14_APPLE
        case CL_SFIXED14_APPLE:
            return -16384;
#endif

        default:
            return 0;
    }
}

// Converts an IEEE 754 half-precision value, given as its 16-bit pattern, to
// the float with the same value.  Zeros keep their sign, half denormals are
// renormalized, and infinities/NaNs keep their sign and mantissa bits.
float convert_half_to_float( unsigned short halfValue )
{
    // We have to take care of a few special cases, but in general, we just extract
    // the same components from the half that exist in the float and re-stuff them
    // For a description of the actual half format, see http://en.wikipedia.org/wiki/Half_precision
    // Note: sign and mantissa are kept unsigned because they get shifted up
    // to bit 31 of the result; doing that on a signed int is undefined.
    unsigned int sign     = ( halfValue >> 15 ) & 0x0001u;
    int          exponent = ( halfValue >> 10 ) & 0x001f;
    unsigned int mantissa = ( halfValue )       & 0x03ffu;

    // Note: we use a union here to be able to access the bits of a float directly
    union
    {
        unsigned int bits;
        float floatValue;
    } outFloat;

    // Special cases first
    if( exponent == 0 )
    {
        if( mantissa == 0 )
        {
            // If both exponent and mantissa are 0, the number is +/- 0
            outFloat.bits  = sign << 31;
            return outFloat.floatValue; // Already done!
        }

        // If exponent is 0, it's a denormalized number, so we renormalize it:
        // shift the mantissa up until its leading 1 reaches the implicit-bit
        // position (bit 10), lowering the exponent once per shift.
        while( ( mantissa & 0x00000400u ) == 0 )
        {
            mantissa <<= 1;
            exponent--;
        }

        // The first bit is implicit, so we take it off and inc the exponent accordingly
        exponent++;
        mantissa &= ~0x00000400u;
    }
    else if( exponent == 31 ) // Special-case "numbers"
    {
        // If the exponent is 31, it's a special case number (+/- infinity or NAN).
        // If the mantissa is 0, it's infinity, else it's NAN, but in either case, the packing
        // method is the same
        outFloat.bits = ( sign << 31 ) | 0x7f800000u | ( mantissa << 13 );
        return outFloat.floatValue;
    }

    // Plain ol' normalized number, so adjust to the ranges a 32-bit float expects and repack.
    // After re-biasing the exponent is always positive (at least 103).
    exponent += ( 127 - 15 );
    mantissa <<= 13;

    outFloat.bits = ( sign << 31 ) | ( (unsigned int)exponent << 23 ) | mantissa;
    return outFloat.floatValue;
}


// Converts a float to a half-precision bit pattern using whichever rounding
// mode gFloatToHalfRoundingMode currently selects.  Any other mode is treated
// as an internal test error: it is logged and the process exits.
cl_ushort convert_float_to_half( float f )
{
    const RoundingMode mode = gFloatToHalfRoundingMode;

    if( mode == kRoundToNearestEven )
        return float2half_rte( f );

    if( mode == kRoundTowardZero )
        return float2half_rtz( f );

    log_error( "ERROR: Test internal error -- unhandled or unknown float->half rounding mode.\n" );
    exit(-1);
    return 0xffff;  // not reached; keeps every path returning a value
}

// Converts a float to a half-precision bit pattern, rounding to nearest
// (ties to even).  The sign is handled separately: the magnitude x is
// classified against a series of thresholds and the sign bit is OR-ed back
// into the result.
// NOTE(review): MAKE_HEX_FLOAT is defined elsewhere; its first argument shows
// the intended constant as a hex float literal - confirm the macro's fallback form.
cl_ushort float2half_rte( float f )
    {
    union{ float f; cl_uint u; } u = {f};
    // Float sign bit (bit 31) moved to the half sign position (bit 15)
    cl_uint sign = (u.u >> 16) & 0x8000;
    float x = fabsf(f);

    //Nan
    if( x != x )
    {
        // Drop the 13 mantissa bits half cannot hold and keep the low 15 bits
        u.u >>= (24-11);
        u.u &= 0x7fff;
        u.u |= 0x0200;      //silence the NaN
        return u.u | sign;
                }

    // overflow
    // At or above this threshold the result is infinity (0x7c00)
    if( x >= MAKE_HEX_FLOAT(0x1.ffep15f, 0x1ffeL, 3) )
        return 0x7c00 | sign;

    // underflow
    if( x <= MAKE_HEX_FLOAT(0x1.0p-25f, 0x1L, -25) )
        return sign;    // The halfway case can return 0x0001 or 0. 0 is even.

    // very small
    // Above the underflow threshold but below 1.5 * 2^-24: the smallest half denormal
    if( x < MAKE_HEX_FLOAT(0x1.8p-24f, 0x18L, -28) )
        return sign | 1;

    // half denormal
    if( x < MAKE_HEX_FLOAT(0x1.0p-14f, 0x1L, -14) )
    {
        // Scaling by 2^-125 puts the value in float's denormal range, where
        // the bit pattern counts units of 2^-149; that count equals x * 2^24,
        // rounded by the floating-point multiply.
        u.f = x * MAKE_HEX_FLOAT(0x1.0p-125f, 0x1L, -125);
        return sign | u.u;
        }

    // Normal range: build a power of two 2^13 times larger than x's own
    // (mantissa masked off), then add and subtract it so the float arithmetic
    // rounds away the 13 low mantissa bits half cannot hold.
    u.f *= MAKE_HEX_FLOAT(0x1.0p13f, 0x1L, 13);
    u.u &= 0x7f800000;
    x += u.f;
    u.f = x - u.f;
    // Re-bias the exponent from float (127) to half (15)
    u.f *= MAKE_HEX_FLOAT(0x1.0p-112f, 0x1L, -112);

    return (u.u >> (24-11)) | sign;
    }

// Converts a float to a half-precision bit pattern, rounding toward zero
// (excess precision is truncated).  The sign is handled separately: the
// magnitude x is classified and the sign bit is OR-ed back into the result.
// NOTE(review): MAKE_HEX_FLOAT is defined elsewhere; its first argument shows
// the intended constant as a hex float literal - confirm the macro's fallback form.
cl_ushort float2half_rtz( float f )
    {
    union{ float f; cl_uint u; } u = {f};
    // Float sign bit (bit 31) moved to the half sign position (bit 15)
    cl_uint sign = (u.u >> 16) & 0x8000;
    float x = fabsf(f);

    //Nan
    if( x != x )
        {
        // Drop the 13 mantissa bits half cannot hold and keep the low 15 bits
        u.u >>= (24-11);
        u.u &= 0x7fff;
        u.u |= 0x0200;      //silence the NaN
        return u.u | sign;
        }

    // overflow
    if( x >= MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) )
        {
        // Only a true infinity stays infinite; any finite value this large
        // becomes the largest finite half (0x7bff).
        if( x == INFINITY )
            return 0x7c00 | sign;

        return 0x7bff | sign;
        }

    // underflow
    // Anything below the smallest half denormal (2^-24) truncates to zero.
    if( x < MAKE_HEX_FLOAT(0x1.0p-24f, 0x1L, -24) )
        return sign;    // The halfway case can return 0x0001 or 0. 0 is even.

    // half denormal
    if( x < MAKE_HEX_FLOAT(0x1.0p-14f, 0x1L, -14) )
    {
        // Express x in units of 2^-24; the int conversion drops the fraction
        x *= MAKE_HEX_FLOAT(0x1.0p24f, 0x1L, 24);
        return (cl_ushort)((int) x | sign);
    }

    // Normal range: clear the 13 low mantissa bits and re-bias the exponent
    // from float (127) to half (15) by subtracting 112 << 23.
    u.u &= 0xFFFFE000U;
    u.u -= 0x38000000U;

    return (u.u >> (24-11)) | sign;
}

// Helper class whose only member is a constructor (defined further down).
class TEST
{
public:
    TEST();
};

// File-local instance: its constructor runs during static initialization.
static TEST t;
// Converts f to a half-precision bit pattern and stores it at p[index].
// Used as the "control" implementation that TEST::TEST() compares
// convert_float_to_half() against.
// NOTE(review): the name indicates round-to-nearest-even; the rounding comes
// from the float add/subtract below - confirm against the intended reference.
void  __vstore_half_rte(float f, size_t index, uint16_t *p)
{
    union{ unsigned int u; float f;} u;

    u.f = f;
    // Float sign bit moved to the half sign position; r accumulates the result
    unsigned short r = (u.u >> 16) & 0x8000;
    // Work on the magnitude only
    u.u &= 0x7fffffff;
    // Magnitudes below 0x33000000 (2^-25) leave r as a signed zero
    if( u.u >= 0x33000000U )
    {
        // 0x47800000 is 2^16: too large for a finite half
        if( u.u >= 0x47800000 )
        {
            if( u.u <= 0x7f800000 )
                r |= 0x7c00;    // finite overflow or infinity -> infinity
            else
            {
                // NaN: quiet bit set, upper mantissa bits carried over
                r |= 0x7e00 | ( (u.u >> 13) & 0x3ff );
            }
        }
        else
        {
            float x = u.f;
            // Choose a power of two to add and subtract so that the float
            // arithmetic rounds x to the precision half can hold: 0.5 for
            // values below 0x38800000 (2^-14, the half denormal range),
            // otherwise x's own exponent raised by 13.
            if( u.u < 0x38800000 )
                u.u = 0x3f000000;
            else
                u.u += 0x06800000;
            u.u &= 0x7f800000U;
            x += u.f;
            x -= u.f;
            // Re-bias the exponent from float (127) to half (15), then drop
            // the 13 low mantissa bits
            u.f = x * MAKE_HEX_FLOAT(0x1.0p-112f, 0x1L, -112);
            u.u >>= 13;
            r |= (unsigned short) u.u;
        }
    }

    ((unsigned short*)p)[index] = r;
}

// Self-test of convert_float_to_half() against __vstore_half_rte().
// The constructor returns immediately, so everything after the first
// statement is currently unreachable.  If the early return were removed, the
// loop would walk every 32-bit float pattern and log each value for which
// the two conversions disagree.
TEST::TEST()
{
    return;     // self-test disabled
    union
    {
        float f;
        uint32_t i;
    } test;
    uint16_t control, myval;

    log_info(" &&&&&&&&&&&&&&&&&&&&&&&&&&&& TESTING HALFS &&&&&&&&&&&&&&&&&&&&\n" );
    test.i = 0;
    do
    {
        // Progress markers: one every 2^24 values, '*' every 2^28
        if( ( test.i & 0xffffff ) == 0 )
        {
            if( ( test.i & 0xfffffff ) == 0 )
                log_info( "*" );
            else
                log_info( "." );
            fflush(stdout);
        }
        __vstore_half_rte( test.f, 0, &control );
        myval = convert_float_to_half( test.f );
        if( myval != control )
        {
            log_info( "\n******** ERROR: MyVal %04x control %04x source %12.24f\n", myval, control, test.f );
            log_info( "         source bits: %08x   %a\n", test.i, test.f );
            float t, c;
            c = convert_half_to_float( control );
            t = convert_half_to_float( myval );
            log_info( "         converted control: %12.24f myval: %12.24f\n", c, t );
        }
        test.i++;
    } while( test.i != 0 );     // stops when the counter wraps back to 0
    log_info("\n &&&&&&&&&&&&&&&&&&&&&&&&&&&& TESTING HALFS &&&&&&&&&&&&&&&&&&&&\n" );

}

// Defined elsewhere.  When set, generate_random_image_data() fills images
// with an increasing ramp instead of random bits.
extern bool gTestRounding;
// First value of that ramp.
uint64_t gRoundingStartValue = 0;

// Allocates a host buffer large enough for the image described by imageInfo
// (using its row/slice pitches) and fills it with test data.
//   imageInfo - image type, dimensions, pitches and format
//   P         - receives ownership of the allocation via P.reset()
//   d         - random number generator state
// Returns the start of the image data, or 0 if the image type is unknown or
// the allocation fails.
//
// With gTestRounding set the buffer holds an increasing ramp starting at
// gRoundingStartValue, one step per channel-sized element.  Otherwise it holds
// random bits, altered so that no 32-bit or 16-bit word has an all-ones
// floating-point exponent (inf/NaN), with the row and slice padding set to 0xff.
char * generate_random_image_data( image_descriptor *imageInfo, BufferOwningPtr<char> &P, MTdata d )
{
    size_t allocSize;
    size_t pixelRowBytes = imageInfo->width * get_pixel_size( imageInfo->format );
    size_t i;

    switch (imageInfo->type)
    {
        case CL_MEM_OBJECT_IMAGE1D:
            allocSize = imageInfo->rowPitch;
            break;
        case CL_MEM_OBJECT_IMAGE2D:
            allocSize = imageInfo->height * imageInfo->rowPitch;
            break;
        case CL_MEM_OBJECT_IMAGE3D:
            allocSize = imageInfo->depth * imageInfo->slicePitch;
            break;
        case CL_MEM_OBJECT_IMAGE1D_ARRAY:
            allocSize = imageInfo->arraySize * imageInfo->slicePitch;
            break;
        case CL_MEM_OBJECT_IMAGE2D_ARRAY:
            allocSize = imageInfo->arraySize * imageInfo->slicePitch;
            break;
        default:
            log_error("Cannot identify image type %x", imageInfo->type);
            return 0;
    }

#if defined (__APPLE__ )
    char *data = NULL;
    if (gDeviceType == CL_DEVICE_TYPE_CPU) {
        // Round the allocation up to whole 4096-byte pages and add one guard
        // page on each side.
        size_t mapSize = ((allocSize + 4095L) & -4096L) + 8192;

        void *map = mmap(0, mapSize, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, 0, 0);
        if (map == MAP_FAILED) {
            // Treat a failed mapping like a failed malloc: data stays NULL
            // and the check below reports the error.
            P.reset(data,NULL,0,allocSize);
        } else {
            // Place the data so that it ends exactly at the trailing guard page
            intptr_t data_end = (intptr_t)map + mapSize - 4096;
            data = (char *)(data_end - (intptr_t)allocSize);

            // Make the first and last pages inaccessible
            mprotect(map, 4096, PROT_NONE);
            mprotect((void *)((char *)map + mapSize - 4096), 4096, PROT_NONE);
            P.reset(data, map, mapSize,allocSize);
        }
    } else {
        data = (char *)malloc(allocSize);
        P.reset(data,NULL,0,allocSize);
    }
#else
    char *data = (char *)malloc(allocSize);
    P.reset(data,NULL,0,allocSize);
#endif

    if (data == NULL) {
      log_error( "ERROR: Unable to malloc %lu bytes for generate_random_image_data\n", allocSize );
      return 0;
    }

    if( gTestRounding )
    {
        // Special case: fill with a ramp from 0 to the size of the type
        // (other channel sizes leave the buffer unfilled)
        size_t typeSize = get_format_type_size( imageInfo->format );
        switch( typeSize )
        {
            case 1:
            {
                char *ptr = data;
                for( i = 0; i < allocSize; i++ )
                    ptr[i] = (cl_char) (i + gRoundingStartValue);
            }
                break;
            case 2:
            {
                cl_short *ptr = (cl_short*) data;
                for( i = 0; i < allocSize / 2; i++ )
                    ptr[i] = (cl_short) (i +  gRoundingStartValue);
            }
                break;
            case 4:
            {
                // Elements are 4 bytes wide, so the buffer holds
                // allocSize / 4 of them; a larger bound writes past the end.
                cl_int *ptr = (cl_int*) data;
                for( i = 0; i < allocSize / 4; i++ )
                    ptr[i] = (cl_int) (i +  gRoundingStartValue);
            }
                break;
        }
        return data;
    }

    // Otherwise, we should be able to just fill with random bits no matter what:
    // whole 32-bit words first, then any remaining bytes one at a time
    cl_uint *p = (cl_uint*) data;
    for( i = 0; i + 4 <= allocSize; i += 4 )
        p[ i / 4 ] = genrand_int32(d);

    for( ; i < allocSize; i++ )
        data[i] = genrand_int32(d);

    // Note: inf or nan float values would cause problems, although we don't know this will
    // actually be a float, so we just know what to look for.  Flipping one
    // exponent bit turns an all-ones exponent into an ordinary one.
    unsigned int *intPtr = (unsigned int *)data;
    for( i = 0; i < allocSize >> 2; i++ )
    {
        if( ( intPtr[ i ] & 0x7F800000 ) == 0x7F800000 )
            intPtr[ i ] ^= 0x40000000;
    }

    // Ditto with half floats (16-bit numbers with the 5 not-quite-highest bits = 0x7C00 are special)
    unsigned short *shortPtr = (unsigned short *)data;
    for( i = 0; i < allocSize >> 1; i++ )
    {
        if( ( shortPtr[ i ] & 0x7C00 ) == 0x7C00 )
            shortPtr[ i ] ^= 0x4000;
    }

    // Fill unused edges with -1, NaN for float
    if (imageInfo->rowPitch > pixelRowBytes)
    {
        // Number of rows to pad; stays 0 for image types not listed
        size_t height = 0;

        switch (imageInfo->type)
        {
            case CL_MEM_OBJECT_IMAGE2D:
            case CL_MEM_OBJECT_IMAGE3D:
            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
                height = imageInfo->height;
                break;
            case CL_MEM_OBJECT_IMAGE1D_ARRAY:
                height = imageInfo->arraySize;
                break;
        }

        // Fill in the row padding regions
        for( i = 0; i < height; i++ )
        {
            size_t offset = i * imageInfo->rowPitch + pixelRowBytes;
            size_t length = imageInfo->rowPitch - pixelRowBytes;
            memset( data + offset, 0xff, length );
        }
    }

    // Fill in the slice padding regions, if necessary:

    // Number of rows that make up one slice
    size_t slice_dimension = imageInfo->height;
    if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
        slice_dimension = imageInfo->arraySize;
    }

    if (imageInfo->slicePitch > slice_dimension*imageInfo->rowPitch)
    {
        // Number of slices to pad; stays 0 for image types not listed
        size_t depth = 0;
        switch (imageInfo->type)
        {
          case CL_MEM_OBJECT_IMAGE2D:
          case CL_MEM_OBJECT_IMAGE3D:
              depth = imageInfo->depth;
              break;
          case CL_MEM_OBJECT_IMAGE1D_ARRAY:
          case CL_MEM_OBJECT_IMAGE2D_ARRAY:
              depth = imageInfo->arraySize;
              break;
        }

        for( i = 0; i < depth; i++ )
        {
            size_t offset = i * imageInfo->slicePitch + slice_dimension*imageInfo->rowPitch;
            size_t length = imageInfo->slicePitch - slice_dimension*imageInfo->rowPitch;
            memset( data + offset, 0xff, length );
        }
    }

    return data;
}

// Limits v to at most 1.0f (fminf) and then to at least -1.0f (fmaxf).
#define CLAMP_FLOAT( v ) ( fmaxf( fminf( v, 1.f ), -1.f ) )


void read_image_pixel_float( void *imageData, image_descriptor *imageInfo,
                            int x, int y, int z, float *outData )
{
    if ( x < 0 || y < 0 || z < 0 || x >= (int)imageInfo->width
               || ( imageInfo->height != 0 && y >= (int)imageInfo->height )
               || ( imageInfo->depth != 0 && z >= (int)imageInfo->depth )
               || ( imageInfo->arraySize != 0 && z >= (int)imageInfo->arraySize ) )
    {
        // Border color
        outData[ 0 ] = outData[ 1 ] = outData[ 2 ] = outData[ 3 ] = 0;
        if (!has_alpha(imageInfo->format))
            outData[3] = 1;
        return;
    }

    cl_image_format *format = imageInfo->format;

    unsigned int i;
    float tempData[ 4 ];

    // Advance to the right spot
    char *ptr = (char *)imageData;
    size_t pixelSize = get_pixel_size( format );

    ptr += z * imageInfo->slicePitch + y * imageInfo->rowPitch + x * pixelSize;

    // OpenCL only supports reading floats from certain formats
    size_t channelCount = get_format_channel_count( format );
    switch( format->image_channel_data_type )
    {
        case CL_SNORM_INT8:
        {
            char *dPtr = (char *)ptr;
            for( i = 0; i < channelCount; i++ )
                tempData[ i ] = CLAMP_FLOAT( (float)dPtr[ i ] / 127.0f );
            break;
        }

        case CL_UNORM_INT8:
        {
            unsigned char *dPtr = (unsigned char *)ptr;
            for( i = 0; i < channelCount; i++ )
                tempData[ i ] = (float)dPtr[ i ] / 255.0f;
            break;
        }

        case CL_SIGNED_INT8:
        {
            cl_char *dPtr = (cl_char *)ptr;
            for( i = 0; i < channelCount; i++ )
                tempData[ i ] =  (float)dPtr[ i ];
            break;
        }

        case CL_UNSIGNED_INT8:
        {
            cl_uchar *dPtr = (cl_uchar *)ptr;
            for( i = 0; i < channelCount; i++ )
                tempData[ i ] = (float) dPtr[ i ];
            break;
        }

        case CL_SNORM_INT16:
        {
            cl_short *dPtr = (cl_short *)ptr;
            for( i = 0; i < channelCount; i++ )
                tempData[ i ] = CLAMP_FLOAT( (float)dPtr[ i ] / 32767.0f );
            break;
        }

        case CL_UNORM_INT16:
        {
            cl_ushort *dPtr = (cl_ushort *)ptr;
            for( i = 0; i < channelCount; i++ )
                tempData[ i ] = (float)dPtr[ i ] / 65535.0f;
            break;
        }

        case CL_SIGNED_INT16:
        {
            cl_short *dPtr = (cl_short *)ptr;
            for( i = 0; i < channelCount; i++ )
                tempData[ i ] = (float)dPtr[ i ];
            break;
        }

        case CL_UNSIGNED_INT16:
        {
            cl_ushort *dPtr = (cl_ushort *)ptr;
            for( i = 0; i < channelCount; i++ )
                tempData[ i ] = (float) dPtr[ i ];
            break;
        }

        case CL_HALF_FLOAT:
        {
            cl_ushort *dPtr = (cl_ushort *)ptr;
            for( i = 0; i < channelCount; i++ )
                tempData[ i ] = convert_half_to_float( dPtr[ i ] );
            break;
        }

        case CL_SIGNED_INT32:
        {
            cl_int *dPtr = (cl_int *)ptr;
            for( i = 0; i < channelCount; i++ )
                tempData[ i ] = (float)dPtr[ i ];
            break;
        }

        case CL_UNSIGNED_INT32:
        {
            cl_uint *dPtr = (cl_uint *)ptr;
            for( i = 0; i < channelCount; i++ )
                tempData[ i ] = (float)dPtr[ i ];
            break;
        }

        case CL_UNORM_SHORT_565:
        {
            cl_ushort *dPtr = (cl_ushort *)ptr;
            tempData[ 0 ] = (float)( dPtr[ 0 ] >> 11 ) / (float)31;
            tempData[ 1 ] = (float)( ( dPtr[ 0 ] >> 5 ) & 63 ) / (float)63;
            tempData[ 2 ] = (float)( dPtr[ 0 ] & 31 ) / (float)31;
            break;
        }

        case CL_UNORM_SHORT_555:
        {
            cl_ushort *dPtr = (cl_ushort *)ptr;
            tempData[ 0 ] = (float)( ( dPtr[ 0 ] >> 10 ) & 31 ) / (float)31;
            tempData[ 1 ] = (float)( ( dPtr[ 0 ] >> 5 ) & 31 ) / (float)31;
            tempData[ 2 ] = (float)( dPtr[ 0 ] & 31 ) / (float)31;
            break;
        }

        case CL_UNORM_INT_101010:
        {
            cl_uint *dPtr = (cl_uint *)ptr;
            tempData[ 0 ] = (float)( ( dPtr[ 0 ] >> 20 ) & 0x3ff ) / (float)1023;
            tempData[ 1 ] = (float)( ( dPtr[ 0 ] >> 10 ) & 0x3ff ) / (float)1023;
            tempData[ 2 ] = (float)( dPtr[ 0 ] & 0x3ff ) / (float)1023;
            break;
        }

        case CL_FLOAT:
        {
            float *dPtr = (float *)ptr;
            for( i = 0; i < channelCount; i++ )
                tempData[ i ] = (float)dPtr[ i ];
            break;
        }
#ifdef  CL_SFIXED14_APPLE
        case CL_SFIXED14_APPLE:
        {
            cl_ushort *dPtr = (cl_ushort*) ptr;
            for( i = 0; i < channelCount; i++ )
                tempData[i] = ((int) dPtr[i] - 16384) * 0x1.0p-14f;
            break;
        }
#endif
    }


    outData[ 0 ] = outData[ 1 ] = outData[ 2 ] = 0;
    outData[ 3 ] = 1;

    switch( format->image_channel_order )
    {
        case CL_A:
            outData[ 3 ] = tempData[ 0 ];
            break;
        case CL_R:
        case CL_Rx:
            outData[ 0 ] = tempData[ 0 ];
            break;
        case CL_RA:
            outData[ 0 ] = tempData[ 0 ];
            outData[ 3 ] = tempData[ 1 ];
            break;
        case CL_RG:
        case CL_RGx:
            outData[ 0 ] = tempData[ 0 ];
            outData[ 1 ] = tempData[ 1 ];
            break;
        case CL_RGB:
        case CL_RGBx:
            outData[ 0 ] = tempData[ 0 ];
            outData[ 1 ] = tempData[ 1 ];
            outData[ 2 ] = tempData[ 2 ];
            break;
        case CL_RGBA:
            outData[ 0 ] = tempData[ 0 ];
            outData[ 1 ] = tempData[ 1 ];
            outData[ 2 ] = tempData[ 2 ];
            outData[ 3 ] = tempData[ 3 ];
            break;
        case CL_ARGB:
            outData[ 0 ] = tempData[ 1 ];
            outData[ 1 ] = tempData[ 2 ];
            outData[ 2 ] = tempData[ 3 ];
            outData[ 3 ] = tempData[ 0 ];
            break;
        case CL_BGRA:
            outData[ 0 ] = tempData[ 2 ];
            outData[ 1 ] = tempData[ 1 ];
            outData[ 2 ] = tempData[ 0 ];
            outData[ 3 ] = tempData[ 3 ];
            break;
        case CL_INTENSITY:
            outData[ 0 ] = tempData[ 0 ];
            outData[ 1 ] = tempData[ 0 ];
            outData[ 2 ] = tempData[ 0 ];
            outData[ 3 ] = tempData[ 0 ];
            break;
        case CL_LUMINANCE:
            outData[ 0 ] = tempData[ 0 ];
            outData[ 1 ] = tempData[ 0 ];
            outData[ 2 ] = tempData[ 0 ];
            break;
#ifdef CL_1RGB_APPLE
        case CL_1RGB_APPLE:
            outData[ 0 ] = tempData[ 1 ];
            outData[ 1 ] = tempData[ 2 ];
            outData[ 2 ] = tempData[ 3 ];
            outData[ 3 ] = 1.0f;
            break;
#endif
#ifdef CL_BGR1_APPLE
        case CL_BGR1_APPLE:
            outData[ 0 ] = tempData[ 2 ];
            outData[ 1 ] = tempData[ 1 ];
            outData[ 2 ] = tempData[ 0 ];
            outData[ 3 ] = 1.0f;
            break;
#endif
        default:
            log_error("Invalid format:");
            print_header(format, true);
            break;
    }
}

// Convenience form of get_integer_coords_offset() that applies no addressing
// offset on any axis; every other argument is forwarded unchanged.
bool get_integer_coords( float x, float y, float z,
                         size_t width, size_t height, size_t depth,
                         image_sampler_data *imageSampler, image_descriptor *imageInfo,
                         int &outX, int &outY, int &outZ )
{
    const float noOffset = 0.0f;

    return get_integer_coords_offset( x, y, z,
                                      noOffset, noOffset, noOffset,
                                      width, height, depth,
                                      imageSampler, imageInfo,
                                      outX, outY, outZ );
}

bool get_integer_coords_offset( float x, float y, float z, float xAddressOffset, float yAddressOffset, float zAddressOffset,
                               size_t width, size_t height, size_t depth, image_sampler_data *imageSampler, image_descriptor *imageInfo, int &outX, int &outY, int &outZ )
{
    AddressFn adFn = sAddressingTable[ imageSampler ];

    float refX = floorf( x ), refY = floorf( y ), refZ = floorf( z );

    if( imageSampler->normalized_coords )
    {
        switch (imageSampler->addressing_mode)
        {
            case CL_ADDRESS_REPEAT:
                x = RepeatNormalizedAddressFn( x, width );
                if (height != 0) {
                    if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
                        y = RepeatNormalizedAddressFn( y, height );
                }
                if (depth != 0) {
                    if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
                        z = RepeatNormalizedAddressFn( z, depth );
                }

                if (xAddressOffset != 0.0) {
                    // Add in the offset
                    x += xAddressOffset;
                    // Handle wrapping
                    if (x > width)
                        x -= (float)width;
                    if (x < 0)
                        x += (float)width;
                }
                if ( (yAddressOffset != 0.0) && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) ) {
                    // Add in the offset
                    y += yAddressOffset;
                    // Handle wrapping
                    if (y > height)
                        y -= (float)height;
                    if (y < 0)
                        y += (float)height;
                }
                if ( (zAddressOffset != 0.0) && (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY) )  {
                    // Add in the offset
                    z += zAddressOffset;
                    // Handle wrapping
                    if (z > depth)
                        z -= (float)depth;
                    if (z < 0)
                        z += (float)depth;
                }
                break;

            case CL_ADDRESS_MIRRORED_REPEAT:
                x = MirroredRepeatNormalizedAddressFn( x, width );
                if (height != 0) {
                    if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
                        y = MirroredRepeatNormalizedAddressFn( y, height );
                }
                if (depth != 0) {
                    if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
                        z = MirroredRepeatNormalizedAddressFn( z, depth );
                }

                if (xAddressOffset != 0.0)
                {
                    float temp = x + xAddressOffset;
                    if( temp > (float) width )
                        temp = (float) width - (temp - (float) width );
                    x = fabsf( temp );
                }
                if ( (yAddressOffset != 0.0) && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) ) {
                    float temp = y + yAddressOffset;
                    if( temp > (float) height )
                        temp = (float) height - (temp - (float) height );
                    y = fabsf( temp );
                }
                if ( (zAddressOffset != 0.0) && (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY) )  {
                    float temp = z + zAddressOffset;
                    if( temp > (float) depth )
                        temp = (float) depth - (temp - (float) depth );
                    z = fabsf( temp );
                }
                break;

            default:
                // Also, remultiply to the original coords. This simulates any truncation in
                // the pass to OpenCL
                x = (x * (float)width) + xAddressOffset;
                if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
                  y = (y * (float)height) + yAddressOffset;
                if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
                  z = (z * (float)depth) + zAddressOffset;
                break;
        }
    }

    // At this point, we're dealing with non-normalized coordinates.

    outX = adFn( floorf( x ), width );

    // 1D and 2D arrays require special care for the index coordinate:

    switch (imageInfo->type) {
        case CL_MEM_OBJECT_IMAGE1D_ARRAY:
            outY = calculate_array_index(y, (float)imageInfo->arraySize - 1.0f);
            outZ = 0.0f; /* don't care! */
            break;
        case CL_MEM_OBJECT_IMAGE2D_ARRAY:
            outY = adFn( floorf( y ), height );
            outZ = calculate_array_index(z, (float)imageInfo->arraySize - 1.0f);
            break;
        default:
            // legacy path:
            if (height != 0)
                outY = adFn( floorf( y ), height );
            if( depth != 0 )
                outZ = adFn( floorf( z ), depth );
    }


    return !( (int)refX == outX && (int)refY == outY && (int)refZ == outZ );
}

// Returns a minus the largest integral value not greater than a.
static float frac(float a) {
    const float wholePart = floorf(a);
    return a - wholePart;
}

// Stores, for each of the four channels, errMax() of the absolute values of
// a and b into results. results may alias a or b: each channel is read before
// it is written.
static inline void pixelMax( const float a[4], const float b[4], float *results )
{
    int channel = 0;
    while( channel < 4 )
    {
        results[channel] = errMax( fabsf(a[channel]), fabsf(b[channel]) );
        ++channel;
    }
}

// Handles pixel channels whose magnitude is below FLT_MIN (this includes
// zero).
//
// containsDenorms == NULL : every such channel of a is replaced by a zero
//                           carrying the channel's sign.
// containsDenorms != NULL : a is left untouched; *containsDenorms is set to 1
//                           if at least one such channel exists and is not
//                           written otherwise.
static inline void  check_for_denorms(float a[4], int *containsDenorms )
{
    for( int channel = 0; channel < 4; channel++ )
    {
        if( !( fabsf(a[channel]) < FLT_MIN ) )
            continue;

        if( containsDenorms != NULL )
        {
            // Recording mode: one hit is enough.
            *containsDenorms = 1;
            return;
        }

        // Flushing mode: keep the sign, drop the magnitude.
        a[channel] = copysignf( 0.0f, a[channel] );
    }
}

// Selects the layer of an image array for an index coordinate, following
// Section 8.4 of the OpenCL 1.2 spec ('Selecting an Image from an Image
// Array'):
//
//     layer_index = clamp( floor(w + 0.5f), 0.0f, max_value_for_w )
//
// coord  - the index coordinate 'w'
// extent - the largest valid layer index
inline float calculate_array_index( float coord, float extent ) {
    float layer = floorf( coord + 0.5f );

    // Upper bound first, then lower bound, so the lower bound wins when
    // extent is negative.
    if( layer > extent )
        layer = extent;
    if( layer < 0.0f )
        layer = 0.0f;

    return layer;
}

/*
 * Utility function to unnormalize a coordinate for a given addressing mode
 * and then apply an addressing offset.
 *
 * name            - the name of the coordinate, used for verbose logging only
 * coord           - the coordinate requiring unnormalization
 * offset          - an addressing offset to be added to the coordinate
 * extent          - the size of the image along this axis (e.g. width for x)
 * addressing_mode - sampler addressing mode; repeat and mirrored repeat get
 *                   dedicated handling, everything else scales by extent
 * verbose         - non-zero to log each step
 *
 * Returns the unnormalized coordinate with the offset applied.
 */
static float unnormalize_coordinate( const char* name, float coord,
    float offset, float extent, cl_addressing_mode addressing_mode, int verbose )
{
    const int offsetGiven = ( offset != 0.0 );
    float result = 0.0f;

    if ( addressing_mode == CL_ADDRESS_REPEAT )
    {
        result = RepeatNormalizedAddressFn( coord, extent );

        if ( verbose )
            log_info( "\tRepeat filter denormalizes %s (%f) to %f\n",
                name, coord, result );

        if ( offsetGiven )
        {
            // Apply the offset and wrap back once if an edge was crossed.
            result += offset;
            if ( result > extent )
                result -= extent;
            if ( result < 0.0 )
                result += extent;

            if ( verbose )
                log_info( "\tAddress offset of %f added to get %f\n", offset, result );
        }
    }
    else if ( addressing_mode == CL_ADDRESS_MIRRORED_REPEAT )
    {
        result = MirroredRepeatNormalizedAddressFn( coord, extent );

        if ( verbose )
            log_info( "\tMirrored repeat filter denormalizes %s (%f) to %f\n",
                name, coord, result );

        if ( offsetGiven )
        {
            // Reflect off the far edge, then off zero via the absolute value.
            float shifted = result + offset;
            if ( shifted > extent )
                shifted = extent - ( shifted - extent );
            result = fabsf( shifted );

            if ( verbose )
                log_info( "\tAddress offset of %f added to get %f\n", offset, result );
        }
    }
    else
    {
        result = coord * extent;

        if ( verbose )
            log_info( "\tFilter denormalizes %s (%f) to %f\n",
                name, coord, result );

        result += offset;

        if ( verbose && offsetGiven )
            log_info( "\tAddress offset of %f added to get %f\n", offset, result );
    }

    return result;
}

// Convenience form of sample_image_pixel_float_offset() that applies no
// addressing offset on any axis; every other argument is forwarded unchanged
// and the callee's result is returned as is.
FloatPixel sample_image_pixel_float( void *imageData, image_descriptor *imageInfo,
                                    float x, float y, float z,
                                    image_sampler_data *imageSampler, float *outData, int verbose, int *containsDenorms )
{
    const float noOffset = 0.0f;

    return sample_image_pixel_float_offset( imageData, imageInfo,
                                            x, y, z,
                                            noOffset, noOffset, noOffset,
                                            imageSampler, outData, verbose, containsDenorms );
}

// Computes the reference value a sampler would return for the given
// coordinates, using nearest or linear filtering on host-side image data.
//
// imageData          - start of the image storage
// imageInfo          - descriptor (type, extents, arraySize, pitches, format)
// x, y, z            - sampling coordinates; normalized when
//                      imageSampler->normalized_coords is set. For 1D arrays
//                      y is the array index, for 2D arrays z is.
// x/y/zAddressOffset - offsets applied while unnormalizing; they are only
//                      used when the coordinates are normalized
// imageSampler       - supplies normalized_coords, addressing_mode and
//                      filter_mode, and selects the addressing function
// outData            - receives the sampled RGBA value (4 floats)
// verbose            - non-zero to log the intermediate steps
// containsDenorms    - if NULL, sampled channels and filtered results with a
//                      magnitude below FLT_MIN are flushed to signed zero; if
//                      non-NULL, nothing is flushed and the int is set to 1
//                      when any sampled channel has such a magnitude, 0
//                      otherwise
//
// returns max pixel value of the pixels touched (per channel, as absolute
// values)
FloatPixel sample_image_pixel_float_offset( void *imageData, image_descriptor *imageInfo,
                                           float x, float y, float z, float xAddressOffset, float yAddressOffset, float zAddressOffset,
                                           image_sampler_data *imageSampler, float *outData, int verbose, int *containsDenorms )
{
    AddressFn adFn = sAddressingTable[ imageSampler ];
    FloatPixel returnVal;

    if( containsDenorms )
        *containsDenorms = 0;

    if( imageSampler->normalized_coords ) {

        // We need to unnormalize our coordinates differently depending on
        // the image type, but 'x' is always processed the same way.

        x = unnormalize_coordinate("x", x, xAddressOffset, (float)imageInfo->width,
            imageSampler->addressing_mode, verbose);

        switch (imageInfo->type) {

            // The image array types require special care:

            case CL_MEM_OBJECT_IMAGE1D_ARRAY:
                z = 0; // don't care -- unused for 1D arrays
                break;

            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
                y = unnormalize_coordinate("y", y, yAddressOffset, (float)imageInfo->height,
                    imageSampler->addressing_mode, verbose);
                break;

            // Everybody else:

            default:
                y = unnormalize_coordinate("y", y, yAddressOffset, (float)imageInfo->height,
                    imageSampler->addressing_mode, verbose);
                z = unnormalize_coordinate("z", z, zAddressOffset, (float)imageInfo->depth,
                    imageSampler->addressing_mode, verbose);
        }

    } else if ( verbose ) {

        switch (imageInfo->type) {
            case CL_MEM_OBJECT_IMAGE1D_ARRAY:
                log_info("Starting coordinate: %f, array index %f\n", x, y);
                break;
            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
                log_info("Starting coordinate: %f, %f, array index %f\n", x, y, z);
                break;
            case CL_MEM_OBJECT_IMAGE1D:
            case CL_MEM_OBJECT_IMAGE1D_BUFFER:
                log_info("Starting coordinate: %f\n", x);
                break;
            case CL_MEM_OBJECT_IMAGE2D:
                log_info("Starting coordinate: %f, %f\n", x, y);
                break;
            case CL_MEM_OBJECT_IMAGE3D:
            default:
                log_info("Starting coordinate: %f, %f, %f\n", x, y, z);
        }
    }

    // At this point, we have unnormalized coordinates.

    if( imageSampler->filter_mode == CL_FILTER_NEAREST )
    {
        int ix, iy, iz;

        // We apply the addressing function to the now-unnormalized
        // coordinates.  Note that the array cases again require special
        // care, per section 8.4 in the OpenCL 1.2 Specification.

        ix = adFn( floorf( x ), imageInfo->width );

        switch (imageInfo->type) {
            case CL_MEM_OBJECT_IMAGE1D_ARRAY:
                iy = calculate_array_index( y, (float)(imageInfo->arraySize - 1) );
                iz = 0;
                break;
            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
                iy = adFn( floorf( y ), imageInfo->height );
                iz = calculate_array_index( z, (float)(imageInfo->arraySize - 1) );
                break;
            default:
                iy = adFn( floorf( y ), imageInfo->height );
                if( imageInfo->depth != 0 )
                    iz = adFn( floorf( z ), imageInfo->depth );
                else
                    iz = 0;
        }

        if( verbose ) {
            if( iz )
                log_info( "\tActual integer coords used (i = floor(x)): { %d, %d, %d }\n", ix, iy, iz );
            else
                log_info( "\tActual integer coords used (i = floor(x)): { %d, %d }\n", ix, iy );
        }

        read_image_pixel_float( imageData, imageInfo, ix, iy, iz, outData );
        check_for_denorms( outData, containsDenorms );
        for( int i = 0; i < 4; i++ )
            returnVal.p[i] = fabsf( outData[i] );
        return returnVal;
    }
    else
    {
        // Linear filtering cases.

        size_t width = imageInfo->width, height = imageInfo->height, depth = imageInfo->depth;

        // Image arrays can use 2D filtering, but require us to walk into the
        // image a certain number of slices before reading.

        if( depth == 0 || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY ||
                          imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
        {
            size_t layer_offset = 0;

            if (imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
                layer_offset = imageInfo->slicePitch * (size_t)calculate_array_index(
                    z, (float)(imageInfo->arraySize - 1)
                );
            }
            else if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
                layer_offset = imageInfo->slicePitch * (size_t)calculate_array_index(
                    y, (float)(imageInfo->arraySize - 1)
                );

                // Set up y and height so that the filtering below is correct
                // 1D filtering on a single slice.
                height = 1;
            }

            int x1 = adFn( floorf( x - 0.5f ), width );
            int y1 = 0;
            int x2 = adFn( floorf( x - 0.5f ) + 1, width );
            int y2 = 0;
            if ((imageInfo->type != CL_MEM_OBJECT_IMAGE1D) &&
                (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
                (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
                y1 = adFn( floorf( y - 0.5f ), height );
                y2 = adFn( floorf( y - 0.5f ) + 1, height );
            } else {
              // 1D cases: with y = 0.5 the vertical weight frac( y - 0.5f )
              // is 0, so only the row-0 samples contribute.
              y = 0.5f;
            }

            if( verbose )
                log_info( "\tActual integer coords used (i = floor(x-.5)): i0:{%d, %d } and i1:{%d, %d }\n", x1, y1, x2, y2 );

            // Walk to beginning of the 'correct' slice, if needed.
            char* imgPtr = ((char*)imageData) + layer_offset;

            float upLeft[ 4 ], upRight[ 4 ], lowLeft[ 4 ], lowRight[ 4 ];
            float maxUp[4], maxLow[4];
            read_image_pixel_float( imgPtr, imageInfo, x1, y1, 0, upLeft );
            read_image_pixel_float( imgPtr, imageInfo, x2, y1, 0, upRight );
            check_for_denorms( upLeft, containsDenorms );
            check_for_denorms( upRight, containsDenorms );
            pixelMax( upLeft, upRight, maxUp );
            read_image_pixel_float( imgPtr, imageInfo, x1, y2, 0, lowLeft );
            read_image_pixel_float( imgPtr, imageInfo, x2, y2, 0, lowRight );
            check_for_denorms( lowLeft, containsDenorms );
            check_for_denorms( lowRight, containsDenorms );
            pixelMax( lowLeft, lowRight, maxLow );
            pixelMax( maxUp, maxLow, returnVal.p );

            if( verbose )
            {
                if( NULL == containsDenorms )
                    log_info( "\tSampled pixels (rgba order, denorms flushed to zero):\n" );
                else
                    log_info( "\tSampled pixels (rgba order):\n" );
                log_info( "\t\tp00: %f, %f, %f, %f\n", upLeft[0], upLeft[1], upLeft[2], upLeft[3] );
                log_info( "\t\tp01: %f, %f, %f, %f\n", upRight[0], upRight[1], upRight[2], upRight[3] );
                log_info( "\t\tp10: %f, %f, %f, %f\n", lowLeft[0], lowLeft[1], lowLeft[2], lowLeft[3] );
                log_info( "\t\tp11: %f, %f, %f, %f\n", lowRight[0], lowRight[1], lowRight[2], lowRight[3] );
            }

            // weights[ xSide ][ ySide ]: side 0 is the x1 / y1 sample, side 1
            // the x2 / y2 sample.
            double weights[ 2 ][ 2 ];

            weights[ 0 ][ 0 ] = weights[ 0 ][ 1 ] = 1.0 - frac( x - 0.5f );
            weights[ 1 ][ 0 ] = weights[ 1 ][ 1 ] = frac( x - 0.5f );
            weights[ 0 ][ 0 ] *= 1.0 - frac( y - 0.5f );
            weights[ 1 ][ 0 ] *= 1.0 - frac( y - 0.5f );
            weights[ 0 ][ 1 ] *= frac( y - 0.5f );
            weights[ 1 ][ 1 ] *= frac( y - 0.5f );

            if( verbose )
                log_info( "\tfrac( x - 0.5f ) = %f,  frac( y - 0.5f ) = %f\n",  frac( x - 0.5f ), frac( y - 0.5f ) );

            for( int i = 0; i < 4; i++ )
            {
                outData[ i ] = (float)( ( upLeft[ i ] * weights[ 0 ][ 0 ] ) +
                                       ( upRight[ i ] * weights[ 1 ][ 0 ] ) +
                                       ( lowLeft[ i ] * weights[ 0 ][ 1 ] ) +
                                       ( lowRight[ i ] * weights[ 1 ][ 1 ] ));

                // flush subnormal results to zero if necessary
                if( NULL == containsDenorms && fabs(outData[i]) < FLT_MIN )
                    outData[i] = copysignf( 0.0f, outData[i] );
            }
        }
        else
        {
            // 3D linear filtering
            int x1 = adFn( floorf( x - 0.5f ), width );
            int y1 = adFn( floorf( y - 0.5f ), height );
            int z1 = adFn( floorf( z - 0.5f ), depth );
            int x2 = adFn( floorf( x - 0.5f ) + 1, width );
            int y2 = adFn( floorf( y - 0.5f ) + 1, height );
            int z2 = adFn( floorf( z - 0.5f ) + 1, depth );

            if( verbose )
                log_info( "\tActual integer coords used (i = floor(x-.5)): i0:{%d, %d, %d} and i1:{%d, %d, %d}\n", x1, y1, z1, x2, y2, z2 );

            // The 'A' samples lie in slice z1, the 'B' samples in slice z2.
            float upLeftA[ 4 ], upRightA[ 4 ], lowLeftA[ 4 ], lowRightA[ 4 ];
            float upLeftB[ 4 ], upRightB[ 4 ], lowLeftB[ 4 ], lowRightB[ 4 ];
            float pixelMaxA[4], pixelMaxB[4];
            read_image_pixel_float( imageData, imageInfo, x1, y1, z1, upLeftA );
            read_image_pixel_float( imageData, imageInfo, x2, y1, z1, upRightA );
            check_for_denorms( upLeftA, containsDenorms );
            check_for_denorms( upRightA, containsDenorms );
            pixelMax( upLeftA, upRightA, pixelMaxA );
            read_image_pixel_float( imageData, imageInfo, x1, y2, z1, lowLeftA );
            read_image_pixel_float( imageData, imageInfo, x2, y2, z1, lowRightA );
            check_for_denorms( lowLeftA, containsDenorms );
            check_for_denorms( lowRightA, containsDenorms );
            pixelMax( lowLeftA, lowRightA, pixelMaxB );
            pixelMax( pixelMaxA, pixelMaxB, returnVal.p);
            read_image_pixel_float( imageData, imageInfo, x1, y1, z2, upLeftB );
            read_image_pixel_float( imageData, imageInfo, x2, y1, z2, upRightB );
            check_for_denorms( upLeftB, containsDenorms );
            check_for_denorms( upRightB, containsDenorms );
            pixelMax( upLeftB, upRightB, pixelMaxA );
            read_image_pixel_float( imageData, imageInfo, x1, y2, z2, lowLeftB );
            read_image_pixel_float( imageData, imageInfo, x2, y2, z2, lowRightB );
            check_for_denorms( lowLeftB, containsDenorms );
            check_for_denorms( lowRightB, containsDenorms );
            pixelMax( lowLeftB, lowRightB, pixelMaxB );
            pixelMax( pixelMaxA, pixelMaxB, pixelMaxA);
            pixelMax( pixelMaxA, returnVal.p, returnVal.p );

            if( verbose )
            {
                if( NULL == containsDenorms )
                    log_info( "\tSampled pixels (rgba order, denorms flushed to zero):\n" );
                else
                    log_info( "\tSampled pixels (rgba order):\n" );
                log_info( "\t\tp000: %f, %f, %f, %f\n", upLeftA[0], upLeftA[1], upLeftA[2], upLeftA[3] );
                log_info( "\t\tp001: %f, %f, %f, %f\n", upRightA[0], upRightA[1], upRightA[2], upRightA[3] );
                log_info( "\t\tp010: %f, %f, %f, %f\n", lowLeftA[0], lowLeftA[1], lowLeftA[2], lowLeftA[3] );
                log_info( "\t\tp011: %f, %f, %f, %f\n\n", lowRightA[0], lowRightA[1], lowRightA[2], lowRightA[3] );
                log_info( "\t\tp100: %f, %f, %f, %f\n", upLeftB[0], upLeftB[1], upLeftB[2], upLeftB[3] );
                log_info( "\t\tp101: %f, %f, %f, %f\n", upRightB[0], upRightB[1], upRightB[2], upRightB[3] );
                log_info( "\t\tp110: %f, %f, %f, %f\n", lowLeftB[0], lowLeftB[1], lowLeftB[2], lowLeftB[3] );
                log_info( "\t\tp111: %f, %f, %f, %f\n", lowRightB[0], lowRightB[1], lowRightB[2], lowRightB[3] );
            }

            // weights[ xSide ][ ySide ][ zSide ]: side 0 is the x1 / y1 / z1
            // sample, side 1 the x2 / y2 / z2 sample.
            double weights[ 2 ][ 2 ][ 2 ];

            float a = frac( x - 0.5f ), b = frac( y - 0.5f ), c = frac( z - 0.5f );
            weights[ 0 ][ 0 ][ 0 ] = weights[ 0 ][ 1 ][ 0 ] = weights[ 0 ][ 0 ][ 1 ] = weights[ 0 ][ 1 ][ 1 ] = 1.f - a;
            weights[ 1 ][ 0 ][ 0 ] = weights[ 1 ][ 1 ][ 0 ] = weights[ 1 ][ 0 ][ 1 ] = weights[ 1 ][ 1 ][ 1 ] = a;
            weights[ 0 ][ 0 ][ 0 ] *= 1.f - b;
            weights[ 1 ][ 0 ][ 0 ] *= 1.f - b;
            weights[ 0 ][ 0 ][ 1 ] *= 1.f - b;
            weights[ 1 ][ 0 ][ 1 ] *= 1.f - b;
            weights[ 0 ][ 1 ][ 0 ] *= b;
            weights[ 1 ][ 1 ][ 0 ] *= b;
            weights[ 0 ][ 1 ][ 1 ] *= b;
            weights[ 1 ][ 1 ][ 1 ] *= b;
            weights[ 0 ][ 0 ][ 0 ] *= 1.f - c;
            weights[ 0 ][ 1 ][ 0 ] *= 1.f - c;
            weights[ 1 ][ 0 ][ 0 ] *= 1.f - c;
            weights[ 1 ][ 1 ][ 0 ] *= 1.f - c;
            weights[ 0 ][ 0 ][ 1 ] *= c;
            weights[ 0 ][ 1 ][ 1 ] *= c;
            weights[ 1 ][ 0 ][ 1 ] *= c;
            weights[ 1 ][ 1 ][ 1 ] *= c;

            if( verbose )
                log_info( "\tfrac( x - 0.5f ) = %f,  frac( y - 0.5f ) = %f, frac( z - 0.5f ) = %f\n",
                         frac( x - 0.5f ), frac( y - 0.5f ), frac( z - 0.5f )  );

            for( int i = 0; i < 4; i++ )
            {
                outData[ i ] = (float)( ( upLeftA[ i ] * weights[ 0 ][ 0 ][ 0 ] ) +
                                       ( upRightA[ i ] * weights[ 1 ][ 0 ][ 0 ] ) +
                                       ( lowLeftA[ i ] * weights[ 0 ][ 1 ][ 0 ] ) +
                                       ( lowRightA[ i ] * weights[ 1 ][ 1 ][ 0 ] ) +
                                       ( upLeftB[ i ] * weights[ 0 ][ 0 ][ 1 ] ) +
                                       ( upRightB[ i ] * weights[ 1 ][ 0 ][ 1 ] ) +
                                       ( lowLeftB[ i ] * weights[ 0 ][ 1 ][ 1 ] ) +
                                       ( lowRightB[ i ] * weights[ 1 ][ 1 ][ 1 ] ));

                // flush subnormal results to zero if necessary
                if( NULL == containsDenorms && fabs(outData[i]) < FLT_MIN )
                    outData[i] = copysignf( 0.0f, outData[i] );
            }
        }

        return returnVal;
    }
}


int debug_find_vector_in_image( void *imagePtr, image_descriptor *imageInfo,
                               void *vectorToFind, size_t vectorSize, int *outX, int *outY, int *outZ )
{
    int foundCount = 0;
    char *iPtr = (char *)imagePtr;
    size_t width;
    size_t depth;
    size_t height;

    switch (imageInfo->type)
    {
        case CL_MEM_OBJECT_IMAGE1D:
            width = imageInfo->width; height = 1; depth = 1;
            break;
        case CL_MEM_OBJECT_IMAGE1D_ARRAY:
            width = imageInfo->width; height = 1; depth = imageInfo->arraySize;
            break;
        case CL_MEM_OBJECT_IMAGE2D:
            width = imageInfo->width; height = imageInfo->height; depth = 1;
            break;
        case CL_MEM_OBJECT_IMAGE2D_ARRAY:
            width = imageInfo->width; height = imageInfo->height; depth = imageInfo->arraySize;
            break;
        case CL_MEM_OBJECT_IMAGE3D:
            width = imageInfo->width; height = imageInfo->height; depth = imageInfo->depth;
            break;
    }

    for( size_t z = 0; z < depth; z++ )
    {
        for( size_t y = 0; y < height; y++ )
        {
            for( size_t x = 0; x < imageInfo->width; x++)
            {
                if( memcmp( iPtr, vectorToFind, vectorSize ) == 0 )
                {
                    if( foundCount == 0 )
                    {
                        *outX = (int)x;
                        if (outY != NULL)
                            *outY = (int)y;
                        if( outZ != NULL )
                            *outZ = (int)z;
                    }
                    foundCount++;
                }
                iPtr += vectorSize;
            }
            iPtr += imageInfo->rowPitch - ( width * vectorSize );
        }
        iPtr += imageInfo->slicePitch - ( height * imageInfo->rowPitch );
    }
    return foundCount;
}

// Debug helper: looks for a pixel with the given unsigned integer channel values.
// The values are narrowed to the image's channel data type (8, 16 or 32 bit
// unsigned) and the resulting bytes are searched for with
// debug_find_vector_in_image. Returns the match count, or 0 (after logging a
// warning) when the image does not use an unsigned integer channel type.
int debug_find_pixel_in_image( void *imagePtr, image_descriptor *imageInfo,
                              unsigned int *valuesToFind, int *outX, int *outY, int *outZ )
{
    // Scratch space for one packed pixel: room for 4 channels of 4 bytes each.
    char packed[ 4 * 4 ];
    const size_t channels = get_format_channel_count( imageInfo->format );
    size_t packedSize = 0;

    switch( imageInfo->format->image_channel_data_type )
    {
        case CL_UNSIGNED_INT8:
        {
            unsigned char *dst = (unsigned char *)packed;
            for( size_t c = 0; c < channels; c++ )
                dst[ c ] = (unsigned char)valuesToFind[ c ];
            packedSize = channels;
            break;
        }
        case CL_UNSIGNED_INT16:
        {
            unsigned short *dst = (unsigned short *)packed;
            for( size_t c = 0; c < channels; c++ )
                dst[ c ] = (unsigned short)valuesToFind[ c ];
            packedSize = channels * 2;
            break;
        }
        case CL_UNSIGNED_INT32:
        {
            unsigned int *dst = (unsigned int *)packed;
            for( size_t c = 0; c < channels; c++ )
                dst[ c ] = (unsigned int)valuesToFind[ c ];
            packedSize = channels * 4;
            break;
        }
        default:
            log_info( "WARNING: Unable to search for debug pixel: invalid image format\n" );
            return 0;
    }

    return debug_find_vector_in_image( imagePtr, imageInfo, packed, packedSize, outX, outY, outZ );
}

// Debug helper: looks for a pixel with the given signed integer channel values.
// The values are narrowed to the image's channel data type (8, 16 or 32 bit
// signed) and the resulting bytes are searched for with
// debug_find_vector_in_image. Returns the match count, or 0 (after logging a
// warning) when the image does not use a signed integer channel type.
int debug_find_pixel_in_image( void *imagePtr, image_descriptor *imageInfo,
                              int *valuesToFind, int *outX, int *outY, int *outZ )
{
    // Scratch space for one packed pixel: room for 4 channels of 4 bytes each.
    char packed[ 4 * 4 ];
    const size_t channels = get_format_channel_count( imageInfo->format );
    size_t packedSize = 0;

    switch( imageInfo->format->image_channel_data_type )
    {
        case CL_SIGNED_INT8:
        {
            char *dst = (char *)packed;
            for( size_t c = 0; c < channels; c++ )
                dst[ c ] = (char)valuesToFind[ c ];
            packedSize = channels;
            break;
        }
        case CL_SIGNED_INT16:
        {
            short *dst = (short *)packed;
            for( size_t c = 0; c < channels; c++ )
                dst[ c ] = (short)valuesToFind[ c ];
            packedSize = channels * 2;
            break;
        }
        case CL_SIGNED_INT32:
        {
            int *dst = (int *)packed;
            for( size_t c = 0; c < channels; c++ )
                dst[ c ] = (int)valuesToFind[ c ];
            packedSize = channels * 4;
            break;
        }
        default:
            log_info( "WARNING: Unable to search for debug pixel: invalid image format\n" );
            return 0;
    }

    return debug_find_vector_in_image( imagePtr, imageInfo, packed, packedSize, outX, outY, outZ );
}

// Debug helper: looks for a pixel with the given float channel values.
// Four floats are copied from valuesToFind into a private buffer (packing
// swizzles its source in place, so the caller's array is left untouched),
// packed into the image's format, and the packed bytes are then searched for
// with debug_find_vector_in_image. Returns the number of matching pixels.
int debug_find_pixel_in_image( void *imagePtr, image_descriptor *imageInfo,
                              float *valuesToFind, int *outX, int *outY, int *outZ )
{
    float channelValues[ 4 ];
    char packedPixel[ 4 * 4 ];

    memcpy( channelValues, valuesToFind, sizeof( channelValues ) );

    const size_t pixelBytes = get_pixel_size( imageInfo->format );
    pack_image_pixel( channelValues, imageInfo->format, packedPixel );

    return debug_find_vector_in_image( imagePtr, imageInfo, packedPixel, pixelBytes, outX, outY, outZ );
}

// Rearranges a 4-element vector in place according to the channel order of
// imageFormat. Orders that need no rearrangement (and any order not listed
// below) leave the vector untouched.
template <class T> void swizzle_vector_for_image( T *srcVector, const cl_image_format *imageFormat )
{
    switch( imageFormat->image_channel_order )
    {
        // Element 3 moves to the front.
        case CL_A:
            srcVector[ 0 ] = srcVector[ 3 ];
            break;

        // Element 3 moves to the second slot.
        case CL_RA:
            srcVector[ 1 ] = srcVector[ 3 ];
            break;

        // Rotate by one: element 3 goes first, the others shift up a slot.
#ifdef CL_1RGB_APPLE
        case CL_1RGB_APPLE:
#endif
        case CL_ARGB:
        {
            const T last = srcVector[ 3 ];
            for( int slot = 3; slot > 0; slot-- )
                srcVector[ slot ] = srcVector[ slot - 1 ];
            srcVector[ 0 ] = last;
            break;
        }

        // Exchange elements 0 and 2.
#ifdef CL_BGR1_APPLE
        case CL_BGR1_APPLE:
#endif
        case CL_BGRA:
        {
            const T first = srcVector[ 0 ];
            srcVector[ 0 ] = srcVector[ 2 ];
            srcVector[ 2 ] = first;
            break;
        }

        // Element 0 is replicated into elements 1..3.
        case CL_INTENSITY:
            for( int slot = 3; slot >= 1; slot-- )
                srcVector[ slot ] = srcVector[ 0 ];
            break;

        // Element 0 is replicated into elements 1..2; element 3 is kept.
        case CL_LUMINANCE:
            for( int slot = 2; slot >= 1; slot-- )
                srcVector[ slot ] = srcVector[ 0 ];
            break;

        // Already in storage order.
        case CL_R:
        case CL_Rx:
        case CL_RG:
        case CL_RGx:
        case CL_RGB:
        case CL_RGBx:
        case CL_RGBA:
        default:
            break;
    }
}

// Clamps v to the inclusive range [min, max].
// Every argument is parenthesised so that expressions containing
// lower-precedence operators are clamped as a whole. Arguments may be
// evaluated more than once, so do not pass expressions with side effects.
#define SATURATE( v, min, max ) ( (v) < (min) ? (min) : ( (v) > (max) ? (max) : (v) ) )

void pack_image_pixel( unsigned int *srcVector, const cl_image_format *imageFormat, void *outData )
{
    swizzle_vector_for_image<unsigned int>( srcVector, imageFormat );
    size_t channelCount = get_format_channel_count( imageFormat );

    switch( imageFormat->image_channel_data_type )
    {
        case CL_UNSIGNED_INT8:
        {
            unsigned char *ptr = (unsigned char *)outData;
            for( unsigned int i = 0; i < channelCount; i++ )
                ptr[ i ] = (unsigned char)SATURATE( srcVector[ i ], 0, 255 );
            break;
        }
        case CL_UNSIGNED_INT16:
        {
            unsigned short *ptr = (unsigned short *)outData;
            for( unsigned int i = 0; i < channelCount; i++ )
                ptr[ i ] = (unsigned short)SATURATE( srcVector[ i ], 0, 65535 );
            break;
        }
        case CL_UNSIGNED_INT32:
        {
            unsigned int *ptr = (unsigned int *)outData;
            for( unsigned int i = 0; i < channelCount; i++ )
                ptr[ i ] = (unsigned int)srcVector[ i ];
            break;
        }
        default:
            break;
    }
}

void pack_image_pixel( int *srcVector, const cl_image_format *imageFormat, void *outData )
{
    swizzle_vector_for_image<int>( srcVector, imageFormat );
    size_t chanelCount = get_format_channel_count( imageFormat );

    switch( imageFormat->image_channel_data_type )
    {
        case CL_SIGNED_INT8:
        {
            char *ptr = (char *)outData;
            for( unsigned int i = 0; i < chanelCount; i++ )
                ptr[ i ] = (char)SATURATE( srcVector[ i ], -128, 127 );
            break;
        }
        case CL_SIGNED_INT16:
        {
            short *ptr = (short *)outData;
            for( unsigned int i = 0; i < chanelCount; i++ )
                ptr[ i ] = (short)SATURATE( srcVector[ i ], -32768, 32767 );
            break;
        }
        case CL_SIGNED_INT32:
        {
            int *ptr = (int *)outData;
            for( unsigned int i = 0; i < chanelCount; i++ )
                ptr[ i ] = (int)srcVector[ i ];
            break;
        }
        default:
            break;
    }
}

// Converts a float to the nearest int, with exact halves going to the even
// neighbour (under the default round-to-nearest-even floating-point mode).
//
//   - NaN yields 0 (converting NaN to int directly would be undefined behaviour).
//   - Values at or beyond the int range are clamped to INT_MAX / INT_MIN.
int round_to_even( float v )
{
    // NaN fails every ordered comparison below, so catch it explicitly.
    if( v != v )
        return 0;

    // clamp overflow
    if( v >= - (float) INT_MIN )
        return INT_MAX;
    if( v <= (float) INT_MIN )
        return INT_MIN;

    // 2^23: from this magnitude upwards a float has no fractional bits.
    static const float kTwoTo23 = 8388608.0f;

    // round fractional values to integer value
    if( fabsf( v ) < kTwoTo23 )
    {
        // Adding and then removing +/-2^23 makes the FPU drop the fraction
        // bits using its own rounding rule.
        const float magicVal = ( v < 0.0f ) ? -kTwoTo23 : kTwoTo23;
        v += magicVal;
        v -= magicVal;
    }

    return (int) v;
}

// Float -> integer channel conversion helpers.
// All arguments are parenthesised so that compound expressions are handled as
// a whole. Arguments may be evaluated more than once, so do not pass
// expressions with side effects.
//
// NORMALIZE / NORMALIZE_UNROUNDED: map v from [0, 1] onto [0, max]; values below
//   0 give 0 and values above 1 give max. The rounded form goes through round_to_even.
// NORMALIZE_SIGNED / NORMALIZE_SIGNED_UNROUNDED: map v from [-1, 1] onto
//   [-max, max]; values below -1 give min and values above 1 give max.
// CONVERT_INT: round v to an integer; values below min give min and values
//   above max give max_val.
// CONVERT_UINT: round v to an integer; values below 0 give 0 and values above
//   max give max_val.
#define NORMALIZE( v, max ) ( (v) < 0 ? 0 : ( (v) > 1.f ? (max) : round_to_even( (v) * (max) ) ) )
#define NORMALIZE_UNROUNDED( v, max ) ( (v) < 0 ? 0 : ( (v) > 1.f ? (max) :  (v) * (max) ) )
#define NORMALIZE_SIGNED( v, min, max ) ( (v)  < -1.0f ? (min) : ( (v) > 1.f ? (max) : round_to_even( (v) * (max) ) ) )
#define NORMALIZE_SIGNED_UNROUNDED( v, min, max ) ( (v)  < -1.0f ? (min) : ( (v) > 1.f ? (max) : (v) * (max) ) )
#define CONVERT_INT( v, min, max, max_val)  ( (v) < (min) ? (min) : ( (v) > (max) ? (max_val) : round_to_even( (v) ) ) )
#define CONVERT_UINT( v, max, max_val)  ( (v) < 0 ? 0 : ( (v) > (max) ? (max_val) : round_to_even( (v) ) ) )

void pack_image_pixel( float *srcVector, const cl_image_format *imageFormat, void *outData )
{
    swizzle_vector_for_image<float>( srcVector, imageFormat );
    size_t channelCount = get_format_channel_count( imageFormat );
    switch( imageFormat->image_channel_data_type )
    {
        case CL_HALF_FLOAT:
        {
            cl_ushort *ptr = (cl_ushort *)outData;

            switch( gFloatToHalfRoundingMode )
            {
                case kRoundToNearestEven:
            for( unsigned int i = 0; i < channelCount; i++ )
                        ptr[ i ] = float2half_rte( srcVector[ i ] );
            break;
                case kRoundTowardZero:
                    for( unsigned int i = 0; i < channelCount; i++ )
                        ptr[ i ] = float2half_rtz( srcVector[ i ] );
                    break;
                default:
                    log_error( "ERROR: Test internal error -- unhandled or unknown float->half rounding mode.\n" );
                    exit(-1);
                    break;
        }
            break;
        }

        case CL_FLOAT:
        {
            cl_float *ptr = (cl_float *)outData;
            for( unsigned int i = 0; i < channelCount; i++ )
                ptr[ i ] = srcVector[ i ];
            break;
        }

        case CL_SNORM_INT8:
        {
            cl_char *ptr = (cl_char *)outData;
            for( unsigned int i = 0; i < channelCount; i++ )
                ptr[ i ] = (char)NORMALIZE_SIGNED( srcVector[ i ], -127.0f, 127.f );
            break;
        }
        case CL_SNORM_INT16:
        {
            cl_short *ptr = (cl_short *)outData;
            for( unsigned int i = 0; i < channelCount; i++ )
                ptr[ i ] = (short)NORMALIZE_SIGNED( srcVector[ i ], -32767.f, 32767.f  );
            break;
        }
        case CL_UNORM_INT8:
        {
            cl_uchar *ptr = (cl_uchar *)outData;
            for( unsigned int i = 0; i < channelCount; i++ )
                ptr[ i ] = (unsigned char)NORMALIZE( srcVector[ i ], 255.f );
#ifdef CL_1RGB_APPLE
            if( imageFormat->image_channel_order == CL_1RGB_APPLE )
                ptr[0] = 255.0f;
#endif
#ifdef CL_BGR1_APPLE
            if( imageFormat->image_channel_order == CL_BGR1_APPLE )
                ptr[3] = 255.0f;
#endif
            break;
        }
        case CL_UNORM_INT16:
        {
            cl_ushort *ptr = (cl_ushort *)outData;
            for( unsigned int i = 0; i < channelCount; i++ )
                ptr[ i ] = (unsigned short)NORMALIZE( srcVector[ i ], 65535.f );
            break;
        }
        case CL_UNORM_SHORT_555:
        {
            cl_ushort *ptr = (cl_ushort *)outData;
            ptr[ 0 ] = ( ( (unsigned short)NORMALIZE( srcVector[ 0 ], 31.f ) & 31 ) << 10 ) |
            ( ( (unsigned short)NORMALIZE( srcVector[ 1 ], 31.f ) & 31 ) << 5 ) |
            ( ( (unsigned short)NORMALIZE( srcVector[ 2 ], 31.f ) & 31 ) << 0 );
            break;
        }
        case CL_UNORM_SHORT_565:
        {
            cl_ushort *ptr = (cl_ushort *)outData;
            ptr[ 0 ] = ( ( (unsigned short)NORMALIZE( srcVector[ 0 ], 31.f ) & 31 ) << 11 ) |
            ( ( (unsigned short)NORMALIZE( srcVector[ 1 ], 63.f ) & 63 ) << 5 ) |
            ( ( (unsigned short)NORMALIZE( srcVector[ 2 ], 31.f ) & 31 ) << 0 );
            break;
        }
        case CL_UNORM_INT_101010:
        {
            cl_uint *ptr = (cl_uint *)outData;
            ptr[ 0 ] = ( ( (unsigned int)NORMALIZE( srcVector[ 0 ], 1023.f ) & 1023 ) << 20 ) |
            ( ( (unsigned int)NORMALIZE( srcVector[ 1 ], 1023.f ) & 1023 ) << 10 ) |
            ( ( (unsigned int)NORMALIZE( srcVector[ 2 ], 1023.f ) & 1023 ) << 0 );
            break;
        }
        case CL_SIGNED_INT8:
        {
            cl_char *ptr = (cl_char *)outData;
            for( unsigned int i = 0; i < channelCount; i++ )
                ptr[ i ] = (char)CONVERT_INT( srcVector[ i ], -127.0f, 127.f, 127 );
            break;
        }
        case CL_SIGNED_INT16:
        {
            cl_short *ptr = (cl_short *)outData;
            for( unsigned int i = 0; i < channelCount; i++ )
                ptr[ i ] = (short)CONVERT_INT( srcVector[ i ], -32767.f, 32767.f, 32767  );
            break;
        }
        case CL_SIGNED_INT32:
        {
            cl_int *ptr = (cl_int *)outData;
            for( unsigned int i = 0; i < channelCount; i++ )
                ptr[ i ] = (int)CONVERT_INT( srcVector[ i ], MAKE_HEX_FLOAT( -0x1.0p31f, -1, 31), MAKE_HEX_FLOAT( 0x1.fffffep30f, 0x1fffffe, 30-23), CL_INT_MAX  );
            break;
        }
        case CL_UNSIGNED_INT8:
        {
            cl_uchar *ptr = (cl_uchar *)outData;
            for( unsigned int i = 0; i < channelCount; i++ )
                ptr[ i ] = (cl_uchar)CONVERT_UINT( srcVector[ i ], 255.f, CL_UCHAR_MAX );
            break;
        }
        case CL_UNSIGNED_INT16:
        {
            cl_ushort *ptr = (cl_ushort *)outData;
            for( unsigned int i = 0; i < channelCount; i++ )
                ptr[ i ] = (cl_ushort)CONVERT_UINT( srcVector[ i ], 32767.f, CL_USHRT_MAX );
            break;
        }
        case CL_UNSIGNED_INT32:
        {
            cl_uint *ptr = (cl_uint *)outData;
            for( unsigned int i = 0; i < channelCount; i++ )
                ptr[ i ] = (cl_uint)CONVERT_UINT( srcVector[ i ], MAKE_HEX_FLOAT( 0x1.fffffep31f, 0x1fffffe, 31-23), CL_UINT_MAX  );
            break;
        }
#ifdef CL_SFIXED14_APPLE
        case CL_SFIXED14_APPLE:
        {
            cl_ushort *ptr = (cl_ushort*)outData;
            for( unsigned int i = 0; i < channelCount; i++ )
            {
                cl_float f = fmaxf( srcVector[i], -1.0f );
                f = fminf( f, 3.0f );
                cl_int d = rintf(f * 0x1.0p14f);
                d += 16384;
                if( d > CL_USHRT_MAX )
                    d = CL_USHRT_MAX;
                ptr[i] = d;
            }
            break;
        }
#endif
        default:
            log_error( "INTERNAL ERROR: unknown format (%d)\n", imageFormat->image_channel_data_type);
            exit(-1);
            break;
    }
}

void pack_image_pixel_error( const float *srcVector, const cl_image_format *imageFormat, const void *results, float *errors )
{
    size_t channelCount = get_format_channel_count( imageFormat );
    switch( imageFormat->image_channel_data_type )
    {
        case CL_HALF_FLOAT:
        {
            const cl_ushort *ptr = (const cl_ushort *)results;

            for( unsigned int i = 0; i < channelCount; i++ )
                errors[i] = Ulp_Error_Half( ptr[i], srcVector[i] );

            break;
        }

        case CL_FLOAT:
        {
            const cl_ushort *ptr = (const cl_ushort *)results;

            for( unsigned int i = 0; i < channelCount; i++ )
                errors[i] = Ulp_Error( ptr[i], srcVector[i] );

            break;
        }

        case CL_SNORM_INT8:
        {
            const cl_char *ptr = (const cl_char *)results;

            for( unsigned int i = 0; i < channelCount; i++ )
                errors[i] = ptr[i] - NORMALIZE_SIGNED_UNROUNDED( srcVector[ i ], -127.0f, 127.f );

            break;
        }
        case CL_SNORM_INT16:
        {
            const cl_short *ptr = (const cl_short *)results;

            for( unsigned int i = 0; i < channelCount; i++ )
                errors[i] = ptr[i] - NORMALIZE_SIGNED_UNROUNDED( srcVector[ i ], -32767.f, 32767.f  );

            break;
        }
        case CL_UNORM_INT8:
        {
            const cl_uchar *ptr = (const cl_uchar *)results;

            for( unsigned int i = 0; i < channelCount; i++ )
                errors[i] = ptr[i] - NORMALIZE_UNROUNDED( srcVector[ i ], 255.f  );

            break;
        }
        case CL_UNORM_INT16:
        {
            const cl_ushort *ptr = (const cl_ushort *)results;

            for( unsigned int i = 0; i < channelCount; i++ )
                errors[i] = ptr[i] - NORMALIZE_UNROUNDED( srcVector[ i ], 65535.f  );

            break;
        }
        case CL_UNORM_SHORT_555:
        {
            const cl_ushort *ptr = (const cl_ushort *)results;

            errors[0] = ((ptr[0] >> 10) & 31) - NORMALIZE_UNROUNDED( srcVector[ 0 ], 31.f );
            errors[1] = ((ptr[0] >>  5) & 31) - NORMALIZE_UNROUNDED( srcVector[ 1 ], 31.f );
            errors[2] = ((ptr[0] >>  0) & 31) - NORMALIZE_UNROUNDED( srcVector[ 2 ], 31.f );

            break;
        }
        case CL_UNORM_SHORT_565:
        {
            const cl_ushort *ptr = (const cl_ushort *)results;

            errors[0] = ((ptr[0] >> 11) & 31) - NORMALIZE_UNROUNDED( srcVector[ 0 ], 31.f );
            errors[1] = ((ptr[0] >>  5) & 63) - NORMALIZE_UNROUNDED( srcVector[ 1 ], 63.f );
            errors[2] = ((ptr[0] >>  0) & 31) - NORMALIZE_UNROUNDED( srcVector[ 2 ], 31.f );

            break;
        }
        case CL_UNORM_INT_101010:
        {
            const cl_uint *ptr = (const cl_uint *)results;

            errors[0] = ((ptr[0] >> 20) & 1023) - NORMALIZE_UNROUNDED( srcVector[ 0 ], 1023.f );
            errors[1] = ((ptr[0] >> 10) & 1023) - NORMALIZE_UNROUNDED( srcVector[ 1 ], 1023.f );
            errors[2] = ((ptr[0] >>  0) & 1023) - NORMALIZE_UNROUNDED( srcVector[ 2 ], 1023.f );

            break;
        }
        case CL_SIGNED_INT8:
        {
            const cl_char *ptr = (const cl_char *)results;

            for( unsigned int i = 0; i < channelCount; i++ )
                errors[ i ] = ptr[i] - CONVERT_INT( srcVector[ i ], -127.0f, 127.f, 127 );

            break;
        }
        case CL_SIGNED_INT16:
        {
            const cl_short *ptr = (const cl_short *)results;
            for( unsigned int i = 0; i < channelCount; i++ )
                errors[i] = ptr[ i ] - CONVERT_INT( srcVector[ i ], -32767.f, 32767.f, 32767  );
            break;
        }
        case CL_SIGNED_INT32:
        {
            const cl_int *ptr = (const cl_int *)results;
            for( unsigned int i = 0; i < channelCount; i++ )
                errors[i] = (cl_float)((cl_long) ptr[ i ] - (cl_long) CONVERT_INT( srcVector[ i ], MAKE_HEX_FLOAT( -0x1.0p31f, -1, 31), MAKE_HEX_FLOAT( 0x1.fffffep30f, 0x1fffffe, 30-23), CL_INT_MAX  ));
            break;
        }
        case CL_UNSIGNED_INT8:
        {
            const cl_uchar *ptr = (const cl_uchar *)results;
            for( unsigned int i = 0; i < channelCount; i++ )
                errors[i] = (cl_int) ptr[ i ] - (cl_int) CONVERT_UINT( srcVector[ i ], 255.f, CL_UCHAR_MAX );
            break;
        }
        case CL_UNSIGNED_INT16:
        {
            const cl_ushort *ptr = (const cl_ushort *)results;
            for( unsigned int i = 0; i < channelCount; i++ )
                errors[i] = (cl_int) ptr[ i ] - (cl_int) CONVERT_UINT( srcVector[ i ], 32767.f, CL_USHRT_MAX );
            break;
        }
        case CL_UNSIGNED_INT32:
        {
            const cl_uint *ptr = (const cl_uint *)results;
            for( unsigned int i = 0; i < channelCount; i++ )
                errors[i] = (cl_float)((cl_long) ptr[ i ] - (cl_long)CONVERT_UINT( srcVector[ i ], MAKE_HEX_FLOAT( 0x1.fffffep31f, 0x1fffffe, 31-23), CL_UINT_MAX  ));
            break;
        }
#ifdef CL_SFIXED14_APPLE
        case CL_SFIXED14_APPLE:
        {
            const cl_ushort *ptr = (const cl_ushort *)results;

            for( unsigned int i = 0; i < channelCount; i++ )
                errors[i] = ptr[i] - NORMALIZE_SIGNED_UNROUNDED( ((int) srcVector[ i ] - 16384), -16384.f, 49151.f  );

            break;
        }
#endif
        default:
            log_error( "INTERNAL ERROR: unknown format (%d)\n", imageFormat->image_channel_data_type);
            exit(-1);
            break;
    }
}


//
//  Autodetect which rounding mode is used for image writes to CL_HALF_FLOAT
//  This should be called lazily before attempting to verify image writes, otherwise an error will occur.
//
int  DetectFloatToHalfRoundingMode( cl_command_queue q )  // Returns CL_SUCCESS on success
{
    cl_int err = CL_SUCCESS;

    if( gFloatToHalfRoundingMode == kDefaultRoundingMode )
    {
        // Some numbers near 0.5f, that we look at to see how the values are rounded.
        static const cl_uint  inData[4*4] = {   0x3f000fffU, 0x3f001000U, 0x3f001001U, 0U, 0x3f001fffU, 0x3f002000U, 0x3f002001U, 0U,
                                                0x3f002fffU, 0x3f003000U, 0x3f003001U, 0U, 0x3f003fffU, 0x3f004000U, 0x3f004001U, 0U    };
        static const size_t count = sizeof( inData ) / (4*sizeof( inData[0] ));
        const float *inp = (const float*) inData;
        cl_context context = NULL;

    // Create an input buffer
        err = clGetCommandQueueInfo( q, CL_QUEUE_CONTEXT, sizeof(context), &context, NULL );
        if( err )
        {
            log_error( "Error:  could not get context from command queue in DetectFloatToHalfRoundingMode  (%d)", err );
            return err;
        }

        cl_mem inBuf = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR, sizeof( inData ), (void*) inData, &err );
        if( NULL == inBuf || err )
        {
            log_error( "Error:  could not create input buffer in DetectFloatToHalfRoundingMode  (err: %d)", err );
            return err;
        }

    // Create a small output image
        cl_image_format fmt = { CL_RGBA, CL_HALF_FLOAT };
        cl_mem outImage = create_image_2d( context, CL_MEM_WRITE_ONLY, &fmt, count, 1, 0, NULL, &err );
        if( NULL == outImage || err )
        {
            log_error( "Error:  could not create half float out image in DetectFloatToHalfRoundingMode  (err: %d)", err );
            clReleaseMemObject( inBuf );
            return err;
        }

    // Create our program, and a kernel
        const char *kernel[1] = {
            "kernel void detect_round( global float4 *in, write_only image2d_t out )\n"
            "{\n"
            "   write_imagef( out, (int2)(get_global_id(0),0), in[get_global_id(0)] );\n"
            "}\n" };
        cl_program program = clCreateProgramWithSource( context, 1, kernel, NULL, &err );
        if( NULL == program || err )
        {
            log_error( "Error:  could not create program in DetectFloatToHalfRoundingMode (err: %d)", err );
            clReleaseMemObject( inBuf );
            clReleaseMemObject( outImage );
            return err;
        }

        cl_device_id device = NULL;
        err = clGetCommandQueueInfo( q, CL_QUEUE_DEVICE, sizeof(device), &device, NULL );
        if( err )
        {
            log_error( "Error:  could not get device from command queue in DetectFloatToHalfRoundingMode  (%d)", err );
            clReleaseMemObject( inBuf );
            clReleaseMemObject( outImage );
            clReleaseProgram( program );
            return err;
        }

        err = clBuildProgram( program, 1, &device, "", NULL, NULL );
        if( err )
        {
            log_error( "Error:  could not build program in DetectFloatToHalfRoundingMode  (%d)", err );
            clReleaseMemObject( inBuf );
            clReleaseMemObject( outImage );
            clReleaseProgram( program );
            return err;
        }

        cl_kernel k = clCreateKernel( program, "detect_round", &err );
        if( NULL == k || err )
        {
            log_error( "Error:  could not create kernel in DetectFloatToHalfRoundingMode  (%d)", err );
            clReleaseMemObject( inBuf );
            clReleaseMemObject( outImage );
            clReleaseProgram( program );
            return err;
        }

        err = clSetKernelArg( k, 0, sizeof( cl_mem ), &inBuf );
        if( err )
        {
            log_error( "Error: could not set argument 0 of kernel in DetectFloatToHalfRoundingMode (%d)", err );
            clReleaseMemObject( inBuf );
            clReleaseMemObject( outImage );
            clReleaseProgram( program );
            clReleaseKernel( k );
            return err;
        }

        err = clSetKernelArg( k, 1, sizeof( cl_mem ), &outImage );
        if( err )
        {
            log_error( "Error: could not set argument 1 of kernel in DetectFloatToHalfRoundingMode (%d)", err );
            clReleaseMemObject( inBuf );
            clReleaseMemObject( outImage );
            clReleaseProgram( program );
            clReleaseKernel( k );
            return err;
        }

    // Run the kernel
        size_t global_work_size = count;
        err = clEnqueueNDRangeKernel( q, k, 1, NULL, &global_work_size, NULL, 0, NULL, NULL );
        if( err )
        {
            log_error( "Error: could not enqueue kernel in DetectFloatToHalfRoundingMode (%d)", err );
            clReleaseMemObject( inBuf );
            clReleaseMemObject( outImage );
            clReleaseProgram( program );
            clReleaseKernel( k );
            return err;
        }

    // read the results
        cl_ushort outBuf[count*4];
        memset( outBuf, -1, sizeof( outBuf ) );
        size_t origin[3] = {0,0,0};
        size_t region[3] = {count,1,1};
        err = clEnqueueReadImage( q, outImage, CL_TRUE, origin, region, 0, 0, outBuf, 0, NULL, NULL );
        if( err )
        {
            log_error( "Error: could not read output image in DetectFloatToHalfRoundingMode (%d)", err );
            clReleaseMemObject( inBuf );
            clReleaseMemObject( outImage );
            clReleaseProgram( program );
            clReleaseKernel( k );
            return err;
        }

    // Generate our list of reference results
        cl_ushort rte_ref[count*4];
        cl_ushort rtz_ref[count*4];
        for( size_t i = 0; i < 4 * count; i++ )
        {
            rte_ref[i] = float2half_rte( inp[i] );
            rtz_ref[i] = float2half_rtz( inp[i] );
        }

    // Verify that we got something in either rtz or rte mode
        if( 0 == memcmp( rte_ref, outBuf, sizeof( rte_ref )) )
        {
            log_info( "Autodetected float->half rounding mode to be rte\n" );
            gFloatToHalfRoundingMode = kRoundToNearestEven;
        }
        else if ( 0 == memcmp( rtz_ref, outBuf, sizeof( rtz_ref )) )
        {
            log_info( "Autodetected float->half rounding mode to be rtz\n" );
            gFloatToHalfRoundingMode = kRoundTowardZero;
        }
        else
        {
            log_error( "ERROR: float to half conversions proceed with invalid rounding mode!\n" );
            log_info( "\nfor:" );
            for( size_t i = 0; i < count; i++ )
                log_info( " {%a, %a, %a, %a},", inp[4*i], inp[4*i+1], inp[4*i+2], inp[4*i+3] );
            log_info( "\ngot:" );
            for( size_t i = 0; i < count; i++ )
                log_info( " {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},", outBuf[4*i], outBuf[4*i+1], outBuf[4*i+2], outBuf[4*i+3] );
            log_info( "\nrte:" );
            for( size_t i = 0; i < count; i++ )
                log_info( " {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},", rte_ref[4*i], rte_ref[4*i+1], rte_ref[4*i+2], rte_ref[4*i+3] );
            log_info( "\nrtz:" );
            for( size_t i = 0; i < count; i++ )
                log_info( " {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},", rtz_ref[4*i], rtz_ref[4*i+1], rtz_ref[4*i+2], rtz_ref[4*i+3] );
            log_info( "\n" );
            err = -1;
            gFloatToHalfRoundingMode = kRoundingModeCount;  // illegal value
        }

    // clean up
        clReleaseMemObject( inBuf );
        clReleaseMemObject( outImage );
        clReleaseProgram( program );
        clReleaseKernel( k );
        return err;
    }

    // Make sure that the rounding mode was successfully detected, if we checked earlier
    if( gFloatToHalfRoundingMode != kRoundToNearestEven && gFloatToHalfRoundingMode != kRoundTowardZero)
        return -2;

    return err;
}

// Allocates a buffer holding four values of `dataType` for every pixel
// described by `imageInfo` and fills it with test data for the image's format.
//
//   dataType  - element type of the generated values (kFloat, kInt, kUInt or
//               kUnsignedInt; anything else is rejected).
//   imageInfo - supplies width, height, depth, arraySize and the image format.
//   P         - owning pointer that takes ownership of the new buffer.
//   d         - random number generator state.
//
// Returns the start of the generated data (the same buffer that P now owns),
// or NULL when the allocation fails or `dataType` is not supported.
char *create_random_image_data( ExplicitType dataType, image_descriptor *imageInfo, BufferOwningPtr<char> &P, MTdata d )
{
    size_t numPixels = imageInfo->width * imageInfo->height
      * (imageInfo->depth ? imageInfo->depth : 1)
      * (imageInfo->arraySize ? imageInfo->arraySize : 1);
    // Four values are generated per pixel.
    const size_t numValues = numPixels * 4;
    size_t allocSize = numValues * get_explicit_type_size( dataType );

#if defined( __APPLE__ )
    char *data = NULL;
    if (gDeviceType == CL_DEVICE_TYPE_CPU) {
        // Round up to whole 4K pages and add one extra page on each side.
        size_t mapSize = ((allocSize + 4095L) & -4096L) + 8192;

        void *map = mmap(0, mapSize, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, 0, 0);
        if( map == MAP_FAILED )
        {
            log_error( "ERROR: Unable to map %lu bytes for random image data\n", (unsigned long)mapSize );
            P.reset( (char *)NULL );
            return NULL;
        }

        // Place the data so that it ends exactly where the last page begins.
        intptr_t data_end = (intptr_t)map + mapSize - 4096;
        data = (char *)(data_end - (intptr_t)allocSize);

        // Make the first and last pages inaccessible.
        mprotect(map, 4096, PROT_NONE);
        mprotect((void *)((char *)map + mapSize - 4096), 4096, PROT_NONE);
        P.reset(data, map, mapSize);
    } else {
        data = (char *)malloc(allocSize);
        P.reset(data);
    }
#else
    char *data = (char *)malloc(allocSize);
    P.reset(data);
#endif

    if( data == NULL )
    {
        log_error( "ERROR: Unable to allocate %lu bytes for random image data\n", (unsigned long)allocSize );
        return NULL;
    }

    switch( dataType )
    {
        case kFloat:
        {
            float *inputValues = (float *)data;
            switch (imageInfo->format->image_channel_data_type)
            {
                case CL_HALF_FLOAT:
                    {
                        // Generate data that is (mostly) inside the range of a half float
                        // const float HALF_MIN = 5.96046448e-08f;
                        const float HALF_MAX = 65504.0f;

                        size_t i = 0;
                        if( numValues >= 4 )
                        {
                            inputValues[ i++ ] = 0.f;
                            inputValues[ i++ ] = 1.f;
                            inputValues[ i++ ] = -1.f;
                            inputValues[ i++ ] = 2.f;
                        }
                        for( ; i < numValues; i++ )
                            inputValues[ i ] = get_random_float( -HALF_MAX - 2.f, HALF_MAX + 2.f, d );
                    }
                    break;
#ifdef CL_SFIXED14_APPLE
                case CL_SFIXED14_APPLE:
                    {
                        size_t i = 0;
                        if( numValues >= 8 )
                        {
                            inputValues[ i++ ] = INFINITY;
                            inputValues[ i++ ] = 0x1.0p14f;
                            inputValues[ i++ ] = 0x1.0p31f;
                            inputValues[ i++ ] = 0x1.0p32f;
                            inputValues[ i++ ] = -INFINITY;
                            inputValues[ i++ ] = -0x1.0p14f;
                            inputValues[ i++ ] = -0x1.0p31f;
                            inputValues[ i++ ] = -0x1.1p31f;
                        }
                        for( ; i < numValues; i++ )
                            inputValues[ i ] = get_random_float( -1.1f, 3.1f, d );
                    }
                    break;
#endif
                case CL_FLOAT:
                    {
                        size_t i = 0;
                        if( numValues >= 4 )
                        {
                            inputValues[ i++ ] = INFINITY;
                            inputValues[ i++ ] = -INFINITY;
                            inputValues[ i++ ] = 0.0f;
                            inputValues[ i++ ] = 0.0f;
                        }
                        // The remaining floats are filled with random 32-bit patterns.
                        cl_uint *p = (cl_uint *)data;
                        for( ; i < numValues; i++ )
                            p[ i ] = genrand_int32(d);
                    }
                    break;

                default:
                    {
                        // Fixed values written first when the buffer is large enough to hold them all.
                        const float edgeValues[] = {
                            0.0f, 0.5f, 31.5f, 32.0f, 127.5f, 128.0f, 255.5f, 256.0f,
                            1023.5f, 1024.0f, 32767.5f, 32768.0f, 65535.5f, 65536.0f,
                            2147483648.0f, 4294967296.0f,
                            MAKE_HEX_FLOAT( 0x1.0p63f, 1, 63 ),
                            MAKE_HEX_FLOAT( 0x1.0p64f, 1, 64 ),
                            -0.0f, -0.5f, -31.5f, -32.0f, -127.5f, -128.0f, -255.5f, -256.0f,
                            -1023.5f, -1024.0f, -32767.5f, -32768.0f, -65535.5f, -65536.0f,
                            -2147483648.0f, -4294967296.0f,
                            -MAKE_HEX_FLOAT( 0x1.0p63f, 1, 63 ),
                            -MAKE_HEX_FLOAT( 0x1.0p64f, 1, 64 )
                        };
                        const size_t edgeCount = sizeof( edgeValues ) / sizeof( edgeValues[ 0 ] );

                        size_t i = 0;
                        if( numValues >= edgeCount )
                        {
                            for( ; i < edgeCount; i++ )
                                inputValues[ i ] = edgeValues[ i ];
                        }

                        // The random range extends 0.1 past each end of [-1, 1] for signed
                        // formats and of [0, 1] for the others.
                        const float low = is_format_signed(imageInfo->format) ? -1.1f : -0.1f;
                        for( ; i < numValues; i++ )
                            inputValues[ i ] = get_random_float( low, 1.1f, d );
                    }
                    break;
            }
            // Without this break the float data would fall through into the
            // kInt case and be overwritten with random integers.
            break;
        }

        case kInt:
        {
            int *imageData = (int *)data;

            // We want to generate ints (mostly) in range of the target format
            int formatMin = get_format_min_int( imageInfo->format );
            size_t formatMax = get_format_max_int( imageInfo->format );
            if( formatMin == 0 )
            {
                // Unsigned values, but we are only an int, so cap the actual max at the max of signed ints
                if( formatMax > 2147483647L )
                    formatMax = 2147483647L;
            }
            // If the final format is small enough, give us a bit of room for out-of-range values to test
            if( formatMax < 2147483647L )
                formatMax += 2;
            if( formatMin > -2147483648LL )
                formatMin -= 2;

            // Now gen
            for( size_t i = 0; i < numValues; i++ )
            {
                imageData[ i ] = random_in_range( formatMin, (int)formatMax, d );
            }
            break;
        }

        case kUInt:
        case kUnsignedInt:
        {
            unsigned int *imageData = (unsigned int *)data;

            // We want to generate ints (mostly) in range of the target format
            int formatMin = get_format_min_int( imageInfo->format );
            size_t formatMax = get_format_max_int( imageInfo->format );
            if( formatMin < 0 )
                formatMin = 0;
            // If the final format is small enough, give us a bit of room for out-of-range values to test
            if( formatMax < 4294967295LL )
                formatMax += 2;

            // Now gen
            // NOTE(review): formatMax is narrowed to int here, so a maximum above
            // INT_MAX would not survive the cast - confirm against random_in_range().
            for( size_t i = 0; i < numValues; i++ )
            {
                imageData[ i ] = random_in_range( formatMin, (int)formatMax, d );
            }
            break;
        }
        default:
            // Unsupported source format. The buffer came from malloc/mmap and is
            // owned by P, so it must be released through P; delete[] would be a
            // mismatched deallocation followed by a second free from P.
            P.reset( (char *)NULL );
            return NULL;
    }

    return data;
}

/*
    deprecated
bool clamp_image_coord( image_sampler_data *imageSampler, float value, size_t max, int &outValue )
{
    int v = (int)value;

    switch(imageSampler->addressing_mode)
    {
        case CL_ADDRESS_REPEAT:
            outValue = v;
            while( v < 0 )
                v += (int)max;
            while( v >= (int)max )
                v -= (int)max;
            if( v != outValue )
            {
                outValue = v;
                return true;
            }
            return false;

        case CL_ADDRESS_MIRRORED_REPEAT:
            log_info( "ERROR: unimplemented for CL_ADDRESS_MIRRORED_REPEAT. Do we ever use this?\n" );
            exit(-1);

        default:
            if( v < 0 )
            {
                outValue = 0;
                return true;
            }
            if( v >= (int)max )
            {
                outValue = (int)max - 1;
                return true;
            }
            outValue = v;
            return false;
    }

}
*/

// Writes into `outLine` the OpenCL C source line that declares a constant
// sampler named `imageSampler` matching the settings in `imageSampler`.
// Logs an error and aborts when the addressing mode is not one of the five
// known values. `outLine` is written with sprintf, so it must be large enough
// for the formatted line.
void get_sampler_kernel_code( image_sampler_data *imageSampler, char *outLine )
{
    const char *addressName;

    switch( imageSampler->addressing_mode )
    {
        case CL_ADDRESS_CLAMP:
            addressName = "CLK_ADDRESS_CLAMP";
            break;
        case CL_ADDRESS_CLAMP_TO_EDGE:
            addressName = "CLK_ADDRESS_CLAMP_TO_EDGE";
            break;
        case CL_ADDRESS_REPEAT:
            addressName = "CLK_ADDRESS_REPEAT";
            break;
        case CL_ADDRESS_MIRRORED_REPEAT:
            addressName = "CLK_ADDRESS_MIRRORED_REPEAT";
            break;
        case CL_ADDRESS_NONE:
            addressName = "CLK_ADDRESS_NONE";
            break;
        default:
            log_error( "**Error: Unknown addressing mode! Aborting...\n" );
            abort();
    }

    const char *coordsName = imageSampler->normalized_coords
        ? "CLK_NORMALIZED_COORDS_TRUE"
        : "CLK_NORMALIZED_COORDS_FALSE";

    // Anything other than linear filtering is emitted as nearest.
    const char *filterName = ( imageSampler->filter_mode == CL_FILTER_LINEAR )
        ? "CLK_FILTER_LINEAR"
        : "CLK_FILTER_NEAREST";

    sprintf( outLine, "    const sampler_t imageSampler = %s | %s | %s;\n", addressName, filterName, coordsName );
}

// Copies a region of pixels from one host-side image buffer to another.
//
//   srcImageInfo / dstImageInfo - supply the row and slice pitches of each
//                                 buffer; the pixel size is taken from the
//                                 source format only.
//   imageValues / destImageValues - start of the source / destination data.
//   sourcePos / destPos         - (x, y, z) origin of the region in each image.
//   regionSize                  - (width, height, depth) of the region; a
//                                 depth of 0 is treated as a single slice.
void copy_image_data( image_descriptor *srcImageInfo, image_descriptor *dstImageInfo, void *imageValues, void *destImageValues,
                     const size_t sourcePos[], const size_t destPos[], const size_t regionSize[] )
{
    //  assert( srcImageInfo->format == dstImageInfo->format );

    const size_t bytesPerPixel = get_pixel_size( srcImageInfo->format );
    const size_t bytesPerRow = bytesPerPixel * regionSize[ 0 ];
    const size_t sliceCount = ( regionSize[ 2 ] > 0 ) ? regionSize[ 2 ] : 1;

    // Address of the first pixel of the region in each buffer
    char *srcSlice = (char *)imageValues
        + sourcePos[ 2 ] * srcImageInfo->slicePitch
        + sourcePos[ 1 ] * srcImageInfo->rowPitch
        + bytesPerPixel * sourcePos[ 0 ];
    char *dstSlice = (char *)destImageValues
        + destPos[ 2 ] * dstImageInfo->slicePitch
        + destPos[ 1 ] * dstImageInfo->rowPitch
        + bytesPerPixel * destPos[ 0 ];

    for( size_t slice = 0; slice < sliceCount; ++slice )
    {
        char *srcRow = srcSlice;
        char *dstRow = dstSlice;

        for( size_t row = 0; row < regionSize[ 1 ]; ++row )
        {
            memcpy( dstRow, srcRow, bytesPerRow );
            srcRow += srcImageInfo->rowPitch;
            dstRow += dstImageInfo->rowPitch;
        }

        srcSlice += srcImageInfo->slicePitch;
        dstSlice += dstImageInfo->slicePitch;
    }
}

// Returns a value interpolated between `low` and `high`, using the number
// drawn from genrand_real1(d) as the interpolation weight.
float random_float(float low, float high, MTdata d)
{
    const float weight = (float) genrand_real1(d);

    return (1.0f - weight) * low + weight * high;
}

// Wraps a coordinate array so it can be read uniformly as floats.
//
//   coords    - the coordinate data, interpreted as cl_float when `useFloats`
//               is true and as cl_int otherwise.
//   useFloats - selects which of the two typed pointers is set; the other one
//               is left NULL.
//   vecSize   - number of elements stored per coordinate.
CoordWalker::CoordWalker( void * coords, bool useFloats, size_t vecSize )
{
    mVecSize = vecSize;
    mFloatCoords = useFloats ? (cl_float *)coords : NULL;
    mIntCoords = useFloats ? NULL : (cl_int *)coords;
}

// Nothing to release here: the destructor does not free the coordinate array.
CoordWalker::~CoordWalker() {}

// Returns element `el` of coordinate `idx` as a cl_float, converting from
// cl_int when the walker holds integer coordinates.
cl_float CoordWalker::Get( size_t idx, size_t el )
{
    const size_t offset = idx * mVecSize + el;

    if( mIntCoords == NULL )
        return mFloatCoords[ offset ];

    return (cl_float)mIntCoords[ offset ];
}


// Logs a one-line description of an image read test: channel order, channel
// type, channel count, filter mode, addressing mode and whether coordinates
// are normalized.
//
//   format  - image format being tested.
//   sampler - sampler settings being tested.
//   err     - true sends the line to log_error, false to log_info.
//   t       - when non-zero a transposition label is appended: "TRANSPOSED"
//             for 1, "NON-TRANSPOSED" for any other non-zero value.
void print_read_header( cl_image_format *format, image_sampler_data *sampler, bool err, int t )
{
    const char *addressName;
    switch( sampler->addressing_mode )
    {
        case CL_ADDRESS_CLAMP:
            addressName = "CL_ADDRESS_CLAMP";
            break;
        case CL_ADDRESS_CLAMP_TO_EDGE:
            addressName = "CL_ADDRESS_CLAMP_TO_EDGE";
            break;
        case CL_ADDRESS_REPEAT:
            addressName = "CL_ADDRESS_REPEAT";
            break;
        case CL_ADDRESS_MIRRORED_REPEAT:
            addressName = "CL_ADDRESS_MIRRORED_REPEAT";
            break;
        default:
            // Every other value is reported as CL_ADDRESS_NONE
            addressName = "CL_ADDRESS_NONE";
            break;
    }

    const char *orderName = GetChannelOrderName( format->image_channel_order );
    const char *typeName = GetChannelTypeName( format->image_channel_data_type );
    const int channelCount = (int)get_format_channel_count( format );
    const char *filterName = ( sampler->filter_mode == CL_FILTER_NEAREST ) ? "CL_FILTER_NEAREST" : "CL_FILTER_LINEAR";
    const char *coordsName = sampler->normalized_coords ? "NORMALIZED" : "UNNORMALIZED";

    if( t != 0 )
    {
        const char *transposeName = ( t == 1 ) ? "TRANSPOSED" : "NON-TRANSPOSED";

        if( err )
        {
            log_error( "[%-7s %-24s %d] - %s - %s - %s - %s\n", orderName, typeName, channelCount,
                      filterName, addressName, coordsName, transposeName );
        }
        else
        {
            log_info( "[%-7s %-24s %d] - %s - %s - %s - %s\n", orderName, typeName, channelCount,
                     filterName, addressName, coordsName, transposeName );
        }
        return;
    }

    if( err )
    {
        log_error( "[%-7s %-24s %d] - %s - %s - %s\n", orderName, typeName, channelCount,
                  filterName, addressName, coordsName );
    }
    else
    {
        log_info( "[%-7s %-24s %d] - %s - %s - %s\n", orderName, typeName, channelCount,
                 filterName, addressName, coordsName );
    }
}

// Logs the channel order, channel type and channel count of `format` on one
// line, through log_error when `err` is true and log_info otherwise.
void print_write_header( cl_image_format *format, bool err = false)
{
    const char *orderName = GetChannelOrderName( format->image_channel_order );
    const char *typeName = GetChannelTypeName( format->image_channel_data_type );
    const int channelCount = (int)get_format_channel_count( format );

    if( !err )
    {
        log_info( "[%-7s %-24s %d]\n", orderName, typeName, channelCount );
        return;
    }

    log_error( "[%-7s %-24s %d]\n", orderName, typeName, channelCount );
}


// Logs "[order type channel-count]" for `format`; the line goes to log_error
// when `err` is true and to log_info otherwise.
void print_header( cl_image_format *format, bool err = false )
{
    const char *orderName = GetChannelOrderName( format->image_channel_order );
    const char *typeName = GetChannelTypeName( format->image_channel_data_type );
    const int channelCount = (int)get_format_channel_count( format );

    if( err )
    {
        log_error( "[%-7s %-24s %d]\n", orderName, typeName, channelCount );
    }
    else
    {
        log_info( "[%-7s %-24s %d]\n", orderName, typeName, channelCount );
    }
}

// Returns true when `formatList` (of `numFormats` entries) contains an entry
// whose channel order and channel data type both match `formatToFind`.
bool find_format( cl_image_format *formatList, unsigned int numFormats, cl_image_format *formatToFind )
{
    const cl_image_format *const listEnd = formatList + numFormats;

    for( const cl_image_format *candidate = formatList; candidate != listEnd; ++candidate )
    {
        if( candidate->image_channel_order != formatToFind->image_channel_order )
            continue;
        if( candidate->image_channel_data_type != formatToFind->image_channel_data_type )
            continue;
        return true;
    }
    return false;
}

// Checks that `formatList` contains every format in the required list for the
// given memory flags: the read list when `flags` equals CL_MEM_READ_ONLY, the
// write list for anything else. Each missing format is logged as an error and
// increments both gTestCount and gTestFailure.
//
// Returns true only when no required format is missing.
bool check_minimum_supported( cl_image_format *formatList, unsigned int numFormats, cl_mem_flags flags )
{
    cl_image_format readFormatsToSupport[] = {
        { CL_RGBA, CL_UNORM_INT8 },
        { CL_RGBA, CL_UNORM_INT16 },
        { CL_RGBA, CL_SIGNED_INT8 },
        { CL_RGBA, CL_SIGNED_INT16 },
        { CL_RGBA, CL_SIGNED_INT32 },
        { CL_RGBA, CL_UNSIGNED_INT8 },
        { CL_RGBA, CL_UNSIGNED_INT16 },
        { CL_RGBA, CL_UNSIGNED_INT32 },
        { CL_RGBA, CL_HALF_FLOAT },
        { CL_RGBA, CL_FLOAT },
        { CL_BGRA, CL_UNORM_INT8 }
    };

    cl_image_format writeFormatsToSupport[] = {
        { CL_RGBA, CL_UNORM_INT8 },
        { CL_RGBA, CL_UNORM_INT16 },
        { CL_RGBA, CL_SIGNED_INT8 },
        { CL_RGBA, CL_SIGNED_INT16 },
        { CL_RGBA, CL_SIGNED_INT32 },
        { CL_RGBA, CL_UNSIGNED_INT8 },
        { CL_RGBA, CL_UNSIGNED_INT16 },
        { CL_RGBA, CL_UNSIGNED_INT32 },
        { CL_RGBA, CL_HALF_FLOAT },
        { CL_RGBA, CL_FLOAT },
        { CL_BGRA, CL_UNORM_INT8 }
    };

    const bool readOnly = ( flags == CL_MEM_READ_ONLY );
    cl_image_format *required = readOnly ? readFormatsToSupport : writeFormatsToSupport;
    const unsigned int requiredCount = readOnly
        ? (unsigned int)( sizeof( readFormatsToSupport ) / sizeof( readFormatsToSupport[ 0 ] ) )
        : (unsigned int)( sizeof( writeFormatsToSupport ) / sizeof( writeFormatsToSupport[ 0 ] ) );

    bool allFound = true;
    for( unsigned int idx = 0; idx < requiredCount; ++idx )
    {
        if( find_format( formatList, numFormats, &required[ idx ] ) )
            continue;

        log_error( "ERROR: Format required by OpenCL 1.0 is not supported: " );
        print_header( &required[ idx ], true );
        gTestCount++;
        gTestFailure++;
        allFound = false;
    }
    return allFound;
}