Add fp16 testing to conversions and bruteforce (#1975)

Merge the `fp16-staging` branch into `main`, adding fp16 (`half`) testing to the conversions and math bruteforce tests. --------- Signed-off-by: Ahmed Hesham <ahmed.hesham@arm.com> Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com> Signed-off-by: Guo, Yilong <yilong.guo@intel.com> Signed-off-by: John Kesapides <john.kesapides@arm.com> Co-authored-by: Marcin Hajder <marcin.hajder@gmail.com> Co-authored-by: Ewan Crawford <ewan@codeplay.com> Co-authored-by: Wawiorko, Grzegorz <grzegorz.wawiorko@intel.com> Co-authored-by: Sreelakshmi Haridas Maruthur <sharidas@quicinc.com> Co-authored-by: Harald van Dijk <harald@gigawatt.nl> Co-authored-by: Ben Ashbaugh <ben.ashbaugh@intel.com> Co-authored-by: Haonan Yang <haonan.yang@intel.com> Co-authored-by: Ahmed Hesham <117350656+ahesham-arm@users.noreply.github.com> Co-authored-by: niranjanjoshi121 <43807392+niranjanjoshi121@users.noreply.github.com> Co-authored-by: Wenwan Xing <wenwan.xing@intel.com> Co-authored-by: Yilong Guo <yilong.guo@intel.com> Co-authored-by: Romaric Jodin <89833130+rjodinchr@users.noreply.github.com> Co-authored-by: joshqti <127994991+joshqti@users.noreply.github.com> Co-authored-by: Pekka Jääskeläinen <pekka.jaaskelainen@tuni.fi> Co-authored-by: imilenkovic00 <155085410+imilenkovic00@users.noreply.github.com> Co-authored-by: John Kesapides <46718829+JohnKesapidesARM@users.noreply.github.com> Co-authored-by: Aharon Abramson <aharon.abramson@mobileye.com>
2026-03-19 06:09:01 +00:00 · 2024-06-18 18:43:11 +02:00
parent b3c89ebde0
commit b6941b6c61
30 changed files with 7149 additions and 350 deletions
--- a/test_conformance/conversions/basic_test_conversions.cpp
+++ b/test_conformance/conversions/basic_test_conversions.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
+// Copyright (c) 2017-2024 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -48,6 +48,7 @@

 #include <vector>
 #include <type_traits>
+#include <cmath>

 #include "basic_test_conversions.h"

@@ -86,9 +87,13 @@ int gWimpyReductionFactor = 128;
 int gSkipTesting = 0;
 int gForceFTZ = 0;
 int gIsRTZ = 0;
+int gForceHalfFTZ = 0;
+int gIsHalfRTZ = 0;
 uint32_t gSimdSize = 1;
 int gHasDouble = 0;
 int gTestDouble = 1;
+int gHasHalfs = 0;
+int gTestHalfs = 1;
 const char *sizeNames[] = { "", "", "2", "3", "4", "8", "16" };
 int vectorSizes[] = { 1, 1, 2, 3, 4, 8, 16 };
 int gMinVectorSize = 0;
@@ -100,6 +105,8 @@ int argCount = 0;

 double SubtractTime(uint64_t endTime, uint64_t startTime);

+// Storage for the half rounding-mode settings. Both start as CL_HALF_RTE;
+// ConversionsTest::SetUp() overwrites them from the device's half FP config,
+// and PrepareReference() updates halfRoundingMode for half-output conversions.
+cl_half_rounding_mode DataInitInfo::halfRoundingMode = CL_HALF_RTE;
+cl_half_rounding_mode ConversionsTest::defaultHalfRoundingMode = CL_HALF_RTE;

 // clang-format off
 // for readability sake keep this section unformatted
@@ -256,8 +263,30 @@ std::vector<double> DataInitInfo::specialValuesDouble = {
    MAKE_HEX_DOUBLE(0x1.fffffffefffffp62, 0x1fffffffefffffLL, 10), MAKE_HEX_DOUBLE(0x1.ffffffffp62, 0x1ffffffffLL, 30),
    MAKE_HEX_DOUBLE(0x1.ffffffff00001p62, 0x1ffffffff00001LL, 10),
 };
-// clang-format on

+// A table of more difficult cases to get right.
+// Each entry is a raw 16-bit half encoding: 1 sign bit, 5 exponent bits and
+// 10 mantissa bits.
+std::vector<cl_half> DataInitInfo::specialValuesHalf = {
+    0xffff, /* NaN (sign bit set, exponent and mantissa all ones) */
+    0x0000, /* +0 */
+    0x0001, /* Smallest positive denormal */
+    0x7c00, /*INFINITY*/
+    0xfc00, /*-INFINITY*/
+    0x8000, /*-0*/
+    0x7bff, /*HALF_MAX*/
+    0x0400, /*HALF_MIN (smallest positive normal)*/
+    0x03ff, /* Largest denormal */
+    0x3c00, /* 1 */
+    0xbc00, /* -1 */
+    0x3555, /*nearest value to 1/3*/
+    0x3bff, /*largest number less than one*/
+    0xc000, /* -2 */
+    0xfbff, /* -HALF_MAX */
+    0x8400, /* -HALF_MIN */
+    0x4248, /* M_PI_H */
+    0xc248, /* -M_PI_H */
+    0xbbff, /* Negative fraction of largest magnitude (just above -1) */
+};
+// clang-format on

 // Windows (since long double got deprecated) sets the x87 to 53-bit precision
 // (that's x87 default state).  This causes problems with the tests that
@@ -282,15 +311,32 @@ static inline void Force64BitFPUPrecision(void)
 #endif
 }

-
-template <typename InType, typename OutType>
-int CalcRefValsPat<InType, OutType>::check_result(void *test, uint32_t count,
-                                                  int vectorSize)
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+int CalcRefValsPat<InType, OutType, InFP, OutFP>::check_result(void *test,
+                                                               uint32_t count,
+                                                               int vectorSize)
 {
    const cl_uchar *a = (const cl_uchar *)gAllowZ;

-    if (std::is_integral<OutType>::value)
-    { // char/uchar/short/ushort/int/uint/long/ulong
+    if (is_half<OutType, OutFP>())
+    {
+        const cl_half *t = (const cl_half *)test;
+        const cl_half *c = (const cl_half *)gRef;
+
+        for (uint32_t i = 0; i < count; i++)
+            if (t[i] != c[i] &&
+                // Allow nan's to be binary different
+                !((t[i] & 0x7fff) > 0x7C00 && (c[i] & 0x7fff) > 0x7C00)
+                && !(a[i] != (cl_uchar)0 && t[i] == (c[i] & 0x8000)))
+            {
+                vlog(
+                    "\nError for vector size %d found at 0x%8.8x:  *%a vs %a\n",
+                    vectorSize, i, HTF(c[i]), HTF(t[i]));
+                return i + 1;
+            }
+    }
+    else if (std::is_integral<OutType>::value)
+    { // char/uchar/short/ushort/half/int/uint/long/ulong
        const OutType *t = (const OutType *)test;
        const OutType *c = (const OutType *)gRef;
        for (uint32_t i = 0; i < count; i++)
@@ -388,6 +434,20 @@ cl_int CustomConversionsTest::Run()
            continue;
        }

+        // skip half if we don't have it
+        if (!gTestHalfs && (inType == khalf || outType == khalf))
+        {
+            if (gHasHalfs)
+            {
+                vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
+                           gTypeNames[outType], gSaturationNames[sat],
+                           gRoundingModeNames[round], gTypeNames[inType]);
+                vlog("\t\tcl_khr_fp16 enabled, but half testing turned "
+                     "off.\n");
+            }
+            continue;
+        }
+
        // skip longs on embedded
        if (!gHasLong
            && (inType == klong || outType == klong || inType == kulong
@@ -427,8 +487,8 @@ ConversionsTest::ConversionsTest(cl_device_id device, cl_context context,
                                 cl_command_queue queue)
    : context(context), device(device), queue(queue), num_elements(0),
      typeIterator({ cl_uchar(0), cl_char(0), cl_ushort(0), cl_short(0),
-                     cl_uint(0), cl_int(0), cl_float(0), cl_double(0),
-                     cl_ulong(0), cl_long(0) })
+                     cl_uint(0), cl_int(0), cl_half(0), cl_float(0),
+                     cl_double(0), cl_ulong(0), cl_long(0) })
 {}


@@ -445,11 +505,31 @@ cl_int ConversionsTest::Run()
 cl_int ConversionsTest::SetUp(int elements)
 {
    num_elements = elements;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        const cl_device_fp_config fpConfigHalf =
+            get_default_rounding_mode(device, CL_DEVICE_HALF_FP_CONFIG);
+        if ((fpConfigHalf & CL_FP_ROUND_TO_NEAREST) != 0)
+        {
+            DataInitInfo::halfRoundingMode = CL_HALF_RTE;
+            ConversionsTest::defaultHalfRoundingMode = CL_HALF_RTE;
+        }
+        else if ((fpConfigHalf & CL_FP_ROUND_TO_ZERO) != 0)
+        {
+            DataInitInfo::halfRoundingMode = CL_HALF_RTZ;
+            ConversionsTest::defaultHalfRoundingMode = CL_HALF_RTZ;
+        }
+        else
+        {
+            log_error("Error while acquiring half rounding mode");
+            return TEST_FAIL;
+        }
+    }
+
    return CL_SUCCESS;
 }

-
-template <typename InType, typename OutType>
+template <typename InType, typename OutType, bool InFP, bool OutFP>
 void ConversionsTest::TestTypesConversion(const Type &inType,
                                          const Type &outType, int &testNumber,
                                          int startMinVectorSize)
@@ -470,7 +550,8 @@ void ConversionsTest::TestTypesConversion(const Type &inType,
         sat = (SaturationMode)(sat + 1))
    {
        // skip illegal saturated conversions to float type
-        if (kSaturated == sat && (outType == kfloat || outType == kdouble))
+        if (kSaturated == sat
+            && (outType == kfloat || outType == kdouble || outType == khalf))
        {
            continue;
        }
@@ -507,6 +588,20 @@ void ConversionsTest::TestTypesConversion(const Type &inType,
                continue;
            }

+            // skip half if we don't have it
+            if (!gTestHalfs && (inType == khalf || outType == khalf))
+            {
+                if (gHasHalfs)
+                {
+                    vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
+                               gTypeNames[outType], gSaturationNames[sat],
+                               gRoundingModeNames[round], gTypeNames[inType]);
+                    vlog("\t\tcl_khr_fp16 enabled, but half testing turned "
+                         "off.\n");
+                }
+                continue;
+            }
+
            // Skip the implicit converts if the rounding mode is
            // not default or test is saturated
            if (0 == startMinVectorSize)
@@ -517,7 +612,8 @@ void ConversionsTest::TestTypesConversion(const Type &inType,
                    gMinVectorSize = 0;
            }

-            if ((error = DoTest<InType, OutType>(outType, inType, sat, round)))
+            if ((error = DoTest<InType, OutType, InFP, OutFP>(outType, inType,
+                                                              sat, round)))
            {
                vlog_error("\t *** %d) convert_%sn%s%s( %sn ) "
                           "FAILED ** \n",
@@ -529,8 +625,7 @@ void ConversionsTest::TestTypesConversion(const Type &inType,
    }
 }

-
-template <typename InType, typename OutType>
+template <typename InType, typename OutType, bool InFP, bool OutFP>
 int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat,
                            RoundingMode round)
 {
@@ -541,7 +636,7 @@ int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat,
    cl_uint threads = GetThreadCount();

    DataInitInfo info = { 0, 0, outType, inType, sat, round, threads };
-    DataInfoSpec<InType, OutType> init_info(info);
+    DataInfoSpec<InType, OutType, InFP, OutFP> init_info(info);
    WriteInputBufferInfo writeInputBufferInfo;
    int vectorSize;
    int error = 0;
@@ -564,7 +659,7 @@ int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat,
    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
    {
        writeInputBufferInfo.calcInfo[vectorSize].reset(
-            new CalcRefValsPat<InType, OutType>());
+            new CalcRefValsPat<InType, OutType, InFP, OutFP>());
        writeInputBufferInfo.calcInfo[vectorSize]->program =
            conv_test::MakeProgram(
                outType, inType, sat, round, vectorSize,
@@ -597,6 +692,11 @@ int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat,
        if (round == kDefaultRoundingMode && gIsRTZ)
            init_info.round = round = kRoundTowardZero;
    }
+    else if (std::is_same<OutType, cl_half>::value && OutFP)
+    {
+        if (round == kDefaultRoundingMode && gIsHalfRTZ)
+            init_info.round = round = kRoundTowardZero;
+    }

    // Figure out how many elements are in a work block
    // we handle 64-bit types a bit differently.
@@ -764,6 +864,10 @@ int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat,
                        vlog("Input value: 0x%8.8x ",
                             ((unsigned int *)gIn)[error - 1]);
                        break;
+                    case khalf:
+                        vlog("Input value: %a ",
+                             HTF(((cl_half *)gIn)[error - 1]));
+                        break;
                    case kfloat:
                        vlog("Input value: %a ", ((float *)gIn)[error - 1]);
                        break;
@@ -901,16 +1005,6 @@ double SubtractTime(uint64_t endTime, uint64_t startTime)
 }
 #endif

-////////////////////////////////////////////////////////////////////////////////
-
-static void setAllowZ(uint8_t *allow, uint32_t *x, cl_uint count)
-{
-    cl_uint i;
-    for (i = 0; i < count; ++i)
-        allow[i] |= (uint8_t)((x[i] & 0x7f800000U) == 0);
-}
-
-
 void MapResultValuesComplete(const std::unique_ptr<CalcRefValsBase> &ptr);

 void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
@@ -951,6 +1045,112 @@ void MapResultValuesComplete(const std::unique_ptr<CalcRefValsBase> &info)
    // destroyed automatically soon after we exit.
 }

+// Returns true when v is a NaN. A cl_half is tested on its raw bits (exponent
+// field all ones and a non-zero mantissa); every other type is handed to the
+// platform's isnan routine.
+template <typename T> static bool isnan_fp(const T &v)
+{
+    if (!std::is_same<T, cl_half>::value)
+    {
+#if defined(_WIN32)
+        return _isnan(v);
+#else
+        return std::isnan(v);
+#endif
+    }
+
+    const cl_half bits = (cl_half)v;
+    // The exponent field sits directly above the stored mantissa bits.
+    const uint16_t exponent = (bits >> (CL_HALF_MANT_DIG - 1)) & 0x1F;
+    const uint16_t mantissa = bits & 0x3FF;
+    return exponent == 0x1F && mantissa != 0;
+}
+
+// Zeroes every outType-sized element of `mapped` whose corresponding input is
+// a NaN, because a NaN may convert to any integer value. Inputs are read as
+// InType from the start of gIn; `count` is the number of elements to scan.
+template <typename InType>
+void ZeroNanToIntCases(cl_uint count, void *mapped, Type outType)
+{
+    const InType *inp = (const InType *)gIn;
+    const auto outSize = gTypeSizes[outType];
+    // Index with cl_uint so the bound check against `count` compares two
+    // unsigned values of the same type.
+    for (cl_uint j = 0; j < count; j++)
+    {
+        if (isnan_fp<InType>(inp[j]))
+            memset((char *)mapped + j * outSize, 0, outSize);
+    }
+}
+
+// Canonicalizes NaN results of a floating-point to floating-point conversion:
+// wherever both inp[j] and outp[j] are NaN, outp[j] is overwritten with one
+// fixed NaN encoding (0x7e00 for cl_half output, NAN otherwise). All other
+// elements are left untouched.
+template <typename InType, typename OutType>
+void FixNanToFltConversions(InType *inp, OutType *outp, cl_uint count)
+{
+    const bool halfOut = std::is_same<OutType, cl_half>::value;
+    // Index with cl_uint so the bound check against `count` compares two
+    // unsigned values of the same type.
+    for (cl_uint j = 0; j < count; j++)
+    {
+        if (!isnan_fp(inp[j]) || !isnan_fp(outp[j])) continue;
+        if (halfOut)
+            outp[j] = 0x7e00; // HALF_NAN
+        else
+            outp[j] = NAN;
+    }
+}
+
+// Patches NaN-related entries in `d`, which holds `count` elements of outType:
+//  - integer outType: outputs whose float/double/half input is NaN are
+//    zeroed, since such a conversion may yield any integer;
+//  - floating-point outType with a different floating-point inType: NaN
+//    outputs of NaN inputs are replaced by one canonical NaN, since the
+//    conversion may yield any NaN.
+// Every other type combination leaves `d` untouched.
+// NOTE(review): inputs are always read from the start of gIn, whatever
+// offset `d` carries -- callers that pass a slice of the output buffer should
+// confirm the element indices still line up.
+void FixNanConversions(Type outType, Type inType, void *d, cl_uint count)
+{
+    const bool floatOut =
+        outType == kfloat || outType == kdouble || outType == khalf;
+
+    if (!floatOut)
+    {
+        if (inType == kfloat)
+            ZeroNanToIntCases<float>(count, d, outType);
+        else if (inType == kdouble)
+            ZeroNanToIntCases<double>(count, d, outType);
+        else if (inType == khalf)
+            ZeroNanToIntCases<cl_half>(count, d, outType);
+        return;
+    }
+
+    // Floating-point output: dispatch on the input type, then on the output
+    // type. Same-type and integer-input pairs match none of the tests below.
+    if (inType == kfloat)
+    {
+        float *src = (float *)gIn;
+        if (outType == kdouble)
+            FixNanToFltConversions(src, (double *)d, count);
+        else if (outType == khalf)
+            FixNanToFltConversions(src, (cl_half *)d, count);
+    }
+    else if (inType == kdouble)
+    {
+        double *src = (double *)gIn;
+        if (outType == kfloat)
+            FixNanToFltConversions(src, (float *)d, count);
+        else if (outType == khalf)
+            FixNanToFltConversions(src, (cl_half *)d, count);
+    }
+    else if (inType == khalf)
+    {
+        cl_half *src = (cl_half *)gIn;
+        if (outType == kfloat)
+            FixNanToFltConversions(src, (float *)d, count);
+        else if (outType == kdouble)
+            FixNanToFltConversions(src, (double *)d, count);
+    }
+}
+

 void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
                                             void *data)
@@ -963,7 +1163,6 @@ void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
    Type outType =
        info->parent->outType; // the data type of the conversion result
    Type inType = info->parent->inType; // the data type of the conversion input
-    size_t j;
    cl_int error;
    cl_event doneBarrier = info->parent->doneBarrier;

@@ -985,51 +1184,7 @@ void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,

    // Patch up NaNs conversions to integer to zero -- these can be converted to
    // any integer
-    if (outType != kfloat && outType != kdouble)
-    {
-        if (inType == kfloat)
-        {
-            float *inp = (float *)gIn;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]))
-                    memset((char *)mapped + j * gTypeSizes[outType], 0,
-                           gTypeSizes[outType]);
-            }
-        }
-        if (inType == kdouble)
-        {
-            double *inp = (double *)gIn;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]))
-                    memset((char *)mapped + j * gTypeSizes[outType], 0,
-                           gTypeSizes[outType]);
-            }
-        }
-    }
-    else if (inType == kfloat || inType == kdouble)
-    { // outtype and intype is float or double.  NaN conversions for float <->
-      // double can be any NaN
-        if (inType == kfloat && outType == kdouble)
-        {
-            float *inp = (float *)gIn;
-            double *outp = (double *)mapped;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN;
-            }
-        }
-        if (inType == kdouble && outType == kfloat)
-        {
-            double *inp = (double *)gIn;
-            float *outp = (float *)mapped;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN;
-            }
-        }
-    }
+    FixNanConversions(outType, inType, mapped, count);

    if (memcmp(mapped, gRef, count * gTypeSizes[outType]))
        info->result =
@@ -1077,12 +1232,8 @@ void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
    // CalcReferenceValuesComplete exit.
 }

-//
-
 namespace conv_test {

-////////////////////////////////////////////////////////////////////////////////
-
 cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p)
 {
    DataInitBase *info = (DataInitBase *)p;
@@ -1092,8 +1243,6 @@ cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p)
    return CL_SUCCESS;
 }

-////////////////////////////////////////////////////////////////////////////////
-
 cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)
 {
    DataInitBase *info = (DataInitBase *)p;
@@ -1102,7 +1251,6 @@ cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)
    Type inType = info->inType;
    Type outType = info->outType;
    RoundingMode round = info->round;
-    size_t j;

    Force64BitFPUPrecision();

@@ -1110,7 +1258,6 @@ cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)
    void *a = (cl_uchar *)gAllowZ + job_id * count;
    void *d = (cl_uchar *)gRef + job_id * count * gTypeSizes[info->outType];

-
    if (outType != inType)
    {
        // create the reference while we wait
@@ -1144,7 +1291,33 @@ cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)
        qcom_sat = info->sat;
 #endif

-        RoundingMode oldRound = set_round(round, outType);
+        RoundingMode oldRound;
+        if (outType == khalf)
+        {
+            oldRound = set_round(kRoundToNearestEven, kfloat);
+            switch (round)
+            {
+                default:
+                case kDefaultRoundingMode:
+                    DataInitInfo::halfRoundingMode =
+                        ConversionsTest::defaultHalfRoundingMode;
+                    break;
+                case kRoundToNearestEven:
+                    DataInitInfo::halfRoundingMode = CL_HALF_RTE;
+                    break;
+                case kRoundUp:
+                    DataInitInfo::halfRoundingMode = CL_HALF_RTP;
+                    break;
+                case kRoundDown:
+                    DataInitInfo::halfRoundingMode = CL_HALF_RTN;
+                    break;
+                case kRoundTowardZero:
+                    DataInitInfo::halfRoundingMode = CL_HALF_RTZ;
+                    break;
+            }
+        }
+        else
+            oldRound = set_round(round, outType);

        if (info->sat)
            info->conv_array_sat(d, s, count);
@@ -1156,10 +1329,13 @@ cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)
        // Decide if we allow a zero result in addition to the correctly rounded
        // one
        memset(a, 0, count);
-        if (gForceFTZ)
+        if (gForceFTZ && (inType == kfloat || outType == kfloat))
        {
-            if (inType == kfloat || outType == kfloat)
-                setAllowZ((uint8_t *)a, (uint32_t *)s, count);
+            info->set_allow_zero_array((uint8_t *)a, d, s, count);
+        }
+        if (gForceHalfFTZ && (inType == khalf || outType == khalf))
+        {
+            info->set_allow_zero_array((uint8_t *)a, d, s, count);
        }
    }
    else
@@ -1170,55 +1346,11 @@ cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)

    // Patch up NaNs conversions to integer to zero -- these can be converted to
    // any integer
-    if (info->outType != kfloat && info->outType != kdouble)
-    {
-        if (inType == kfloat)
-        {
-            float *inp = (float *)s;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]))
-                    memset((char *)d + j * gTypeSizes[outType], 0,
-                           gTypeSizes[outType]);
-            }
-        }
-        if (inType == kdouble)
-        {
-            double *inp = (double *)s;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]))
-                    memset((char *)d + j * gTypeSizes[outType], 0,
-                           gTypeSizes[outType]);
-            }
-        }
-    }
-    else if (inType == kfloat || inType == kdouble)
-    { // outtype and intype is float or double.  NaN conversions for float <->
-      // double can be any NaN
-        if (inType == kfloat && outType == kdouble)
-        {
-            float *inp = (float *)s;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j])) ((double *)d)[j] = NAN;
-            }
-        }
-        if (inType == kdouble && outType == kfloat)
-        {
-            double *inp = (double *)s;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j])) ((float *)d)[j] = NAN;
-            }
-        }
-    }
+    FixNanConversions(outType, inType, d, count);

    return CL_SUCCESS;
 }

-////////////////////////////////////////////////////////////////////////////////
-
 uint64_t GetTime(void)
 {
 #if defined(__APPLE__)
@@ -1233,8 +1365,6 @@ uint64_t GetTime(void)
 #endif
 }

-////////////////////////////////////////////////////////////////////////////////
-
 // Note: not called reentrantly
 void WriteInputBufferComplete(void *data)
 {
@@ -1295,8 +1425,6 @@ void WriteInputBufferComplete(void *data)
    // automatically soon after we exit.
 }

-////////////////////////////////////////////////////////////////////////////////
-
 cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
                       RoundingMode round, int vectorSize, cl_kernel *outKernel)
 {
@@ -1308,6 +1436,9 @@ cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
    if (outType == kdouble || inType == kdouble)
        source << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";

+    if (outType == khalf || inType == khalf)
+        source << "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+
    // Create the program. This is a bit complicated because we are trying to
    // avoid byte and short stores.
    if (0 == vectorSize)
@@ -1408,7 +1539,7 @@ cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
    *outKernel = NULL;

    const char *flags = NULL;
-    if (gForceFTZ) flags = "-cl-denorms-are-zero";
+    if (gForceFTZ || gForceHalfFTZ) flags = "-cl-denorms-are-zero";

    // build it
    std::string sourceString = source.str();