Initial open source release of OpenCL 2.0 CTS.

This commit is contained in:
Kedar Patil
2017-05-16 18:50:35 +05:30
parent 6911ba5116
commit 3a440d17c8
883 changed files with 318212 additions and 0 deletions

View File

@@ -0,0 +1,76 @@
set(MODULE_NAME BRUTEFORCE)

set(${MODULE_NAME}_SOURCES
    FunctionList.c
    Sleep.c
    binary.c
    binaryOperator.c
    Utility.c
    binary_i.c
    binary_two_results_i.c
    i_unary.c
    macro_binary.c
    macro_unary.c
    mad.c
    main.c
    reference_math.c
    ternary.c
    unary.c
    unary_two_results.c
    unary_two_results_i.c
    unary_u.c
    ../../test_common/harness/rounding_mode.c
    ../../test_common/harness/ThreadPool.c
    ../../test_common/harness/mt19937.c
    ../../test_common/harness/msvc9.c
)

# The local test sources need SSE2 on x86 hosts (not on Android/ARM).
# BUG fix: the original call omitted the mandatory PROPERTIES keyword,
# which is a set_source_files_properties() syntax error.
if (NOT ANDROID)
    set_source_files_properties(
        FunctionList.c
        Sleep.c
        binary.c
        binaryOperator.c
        Utility.c
        binary_i.c
        binary_two_results_i.c
        i_unary.c
        macro_binary.c
        macro_unary.c
        mad.c
        main.c
        reference_math.c
        ternary.c
        unary.c
        unary_two_results.c
        unary_two_results_i.c
        unary_u.c
        PROPERTIES COMPILE_FLAGS -msse2)
endif(NOT ANDROID)

# Build these C sources as C++ (FunctionList.h uses '::' qualification,
# and the Jamfile/MSVC builds likewise force C++ with /TP).
set_source_files_properties(
    FunctionList.c
    Sleep.c
    binary.c
    binaryOperator.c
    Utility.c
    binary_i.c
    binary_two_results_i.c
    i_unary.c
    macro_binary.c
    macro_unary.c
    mad.c
    main.c
    reference_math.c
    ternary.c
    unary.c
    unary_two_results.c
    unary_two_results_i.c
    unary_u.c
    ../../test_common/harness/rounding_mode.c
    ../../test_common/harness/ThreadPool.c
    ../../test_common/harness/msvc9.c
    PROPERTIES LANGUAGE CXX)

# BUG: the original GNUCC branch called set_source_files_properties with no
# source files and no PROPERTIES keyword:
#
#   if(CMAKE_COMPILER_IS_GNUCC)
#     set_source_files_properties(
#       COMPILE_FLAGS -O0)
#   endif(CMAKE_COMPILER_IS_GNUCC)
#
# That call is malformed and has no effect (setting COMPILE_FLAGS here would
# also clobber the -msse2 flag applied above). It is removed; re-add it with
# an explicit file list and PROPERTIES keyword if -O0 is really required.

include(../CMakeCommon.txt)

View File

@@ -0,0 +1,180 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "FunctionList.h"
#include "reference_math.h"

// Per-entry flag values used by the table macros below.
#define FTZ_ON 1
#define FTZ_OFF 0
#define EXACT 0.0f
#define RELAXED_ON 1
#define RELAXED_OFF 0

#define STRINGIFY( _s) #_s

// ENTRY: standard table row. The reference implementations are
// reference_<name> (double) and reference_<name>l (long double); the relaxed
// slot reuses the plain double reference, with relaxed checking disabled.
#define ENTRY( _name, _ulp, _embedded_ulp, _rmode, _type ) { STRINGIFY(_name), STRINGIFY(_name), {(void*)reference_##_name}, {(void*)reference_##_name##l}, {(void*)reference_##_name}, _ulp, _ulp, _embedded_ulp, INFINITY, _rmode, RELAXED_OFF, _type }
// ENTRY_EXT: like ENTRY, but with a dedicated reference_relaxed_<name>
// implementation and an explicit -cl-fast-relaxed-math ulp bound.
#define ENTRY_EXT( _name, _ulp, _embedded_ulp, _relaxed_ulp, _rmode, _type ) { STRINGIFY(_name), STRINGIFY(_name), {(void*)reference_##_name}, {(void*)reference_##_name##l}, {(void*)reference_##relaxed_##_name}, _ulp, _ulp, _embedded_ulp, _relaxed_ulp, _rmode, RELAXED_ON, _type }
// HALF_ENTRY: half_<name> built-ins; single precision only, so the
// long-double and relaxed reference slots are NULL.
#define HALF_ENTRY( _name, _ulp, _embedded_ulp, _rmode, _type ) { "half_" STRINGIFY(_name), "half_" STRINGIFY(_name), {(void*)reference_##_name}, {NULL}, {NULL}, _ulp, _ulp, _embedded_ulp, INFINITY, _rmode, RELAXED_OFF, _type }
// OPERATOR_ENTRY: built-in operators (+, -, *, /); nameInCode holds the
// operator token emitted into the kernel source instead of a function name.
#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _rmode, _type) { STRINGIFY(_name), _operator, {(void*)reference_##_name}, {(void*)reference_##_name##l}, {NULL}, _ulp, _ulp, _embedded_ulp, INFINITY, _rmode, RELAXED_OFF, _type }

#if defined( __cplusplus )
extern "C" {
#endif

// Test-driver vtbls, one per reference-function signature. Each is defined
// in the correspondingly named source file of this module.
extern const vtbl _unary;             // float foo( float )
extern const vtbl _unary_u;           // float foo( uint ), double foo( ulong )
extern const vtbl _i_unary;           // int foo( float )
extern const vtbl _macro_unary;       // int foo( float ), returns {0,1} for scalar, { 0, -1 } for vector
extern const vtbl _binary;            // float foo( float, float )
extern const vtbl _binary_nextafter;  // float foo( float, float ), special handling for nextafter
extern const vtbl _binary_operator;   // float .op. float
extern const vtbl _macro_binary;      // int foo( float, float ), returns {0,1} for scalar, { 0, -1 } for vector
extern const vtbl _binary_i;          // float foo( float, int )
extern const vtbl _ternary;           // float foo( float, float, float )
extern const vtbl _unary_two_results; // float foo( float, float * )
extern const vtbl _unary_two_results_i;   // float foo( float, int * )
extern const vtbl _binary_two_results_i;  // float foo( float, float, int * )
extern const vtbl _mad_tbl;           // float mad( float, float, float )

#if defined( __cplusplus)
}
#endif

// Shorthand aliases used in the table rows below.
#define unaryF &_unary
#define i_unaryF &_i_unary
#define unaryF_u &_unary_u
#define macro_unaryF &_macro_unary
#define binaryF &_binary
#define binaryF_nextafter &_binary_nextafter
#define binaryOperatorF &_binary_operator
#define binaryF_i &_binary_i
#define macro_binaryF &_macro_binary
#define ternaryF &_ternary
#define unaryF_two_results &_unary_two_results
#define unaryF_two_results_i &_unary_two_results_i
#define binaryF_two_results_i &_binary_two_results_i
#define mad_function &_mad_tbl
// Master table of the math built-ins under test. Each row names the
// function, its reference implementations, the allowed ulp error for the
// full profile (float/double), the embedded profile, and fast-relaxed-math,
// plus FTZ/relaxed flags and the vtbl that drives its signature.
const Func functionList[] = {
    ENTRY( acos, 4.0f, 4.0f, FTZ_OFF, unaryF),
    ENTRY( acosh, 4.0f, 4.0f, FTZ_OFF, unaryF),
    ENTRY( acospi, 5.0f, 5.0f, FTZ_OFF, unaryF),
    ENTRY( asin, 4.0f, 4.0f, FTZ_OFF, unaryF),
    ENTRY( asinh, 4.0f, 4.0f, FTZ_OFF, unaryF),
    ENTRY( asinpi, 5.0f, 5.0f, FTZ_OFF, unaryF),
    ENTRY( atan, 5.0f, 5.0f, FTZ_OFF, unaryF),
    ENTRY( atanh, 5.0f, 5.0f, FTZ_OFF, unaryF),
    ENTRY( atanpi, 5.0f, 5.0f, FTZ_OFF, unaryF),
    ENTRY( atan2, 6.0f, 6.0f, FTZ_OFF, binaryF),
    ENTRY( atan2pi, 6.0f, 6.0f, FTZ_OFF, binaryF),
    ENTRY( cbrt, 2.0f, 4.0f, FTZ_OFF, unaryF),
    ENTRY( ceil, 0.0f, 0.0f, FTZ_OFF, unaryF),
    ENTRY( copysign, 0.0f, 0.0f, FTZ_OFF, binaryF),
    ENTRY_EXT( cos, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF), //relaxed ulp 2^-11
    ENTRY( cosh, 4.0f, 4.0f, FTZ_OFF, unaryF),
    ENTRY( cospi, 4.0f, 4.0f, FTZ_OFF, unaryF),
//  ENTRY( erfc, 16.0f, 16.0f, FTZ_OFF, unaryF), //disabled for 1.0 due to lack of reference implementation
//  ENTRY( erf, 16.0f, 16.0f, FTZ_OFF, unaryF), //disabled for 1.0 due to lack of reference implementation
    ENTRY_EXT( exp, 3.0f, 4.0f, 3.0f, FTZ_OFF, unaryF), //relaxed error is actually overwritten in unary.c as it is 3+floor(fabs(2*x))
    ENTRY_EXT( exp2, 3.0f, 4.0f, 3.0f, FTZ_OFF, unaryF), //relaxed error is actually overwritten in unary.c as it is 3+floor(fabs(2*x))
    ENTRY_EXT( exp10, 3.0f, 4.0f, 8192.0f, FTZ_OFF, unaryF), //relaxed error is actually overwritten in unary.c as it is 3+floor(fabs(2*x)) in derived mode,
                                                             // in non-derived mode it uses the ulp error for half_exp10.
    ENTRY( expm1, 3.0f, 4.0f, FTZ_OFF, unaryF),
    ENTRY( fabs, 0.0f, 0.0f, FTZ_OFF, unaryF),
    ENTRY( fdim, 0.0f, 0.0f, FTZ_OFF, binaryF),
    ENTRY( floor, 0.0f, 0.0f, FTZ_OFF, unaryF),
    ENTRY( fma, 0.0f, 0.0f, FTZ_OFF, ternaryF),
    ENTRY( fmax, 0.0f, 0.0f, FTZ_OFF, binaryF),
    ENTRY( fmin, 0.0f, 0.0f, FTZ_OFF, binaryF),
    ENTRY( fmod, 0.0f, 0.0f, FTZ_OFF, binaryF ),
    ENTRY( fract, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results),
    ENTRY( frexp, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results_i),
    ENTRY( hypot, 4.0f, 4.0f, FTZ_OFF, binaryF),
    ENTRY( ilogb, 0.0f, 0.0f, FTZ_OFF, i_unaryF),
    ENTRY( isequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
    ENTRY( isfinite, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
    ENTRY( isgreater, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
    ENTRY( isgreaterequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
    ENTRY( isinf, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
    ENTRY( isless, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
    ENTRY( islessequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
    ENTRY( islessgreater, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
    ENTRY( isnan, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
    ENTRY( isnormal, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
    ENTRY( isnotequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
    ENTRY( isordered, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
    ENTRY( isunordered, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
    ENTRY( ldexp, 0.0f, 0.0f, FTZ_OFF, binaryF_i),
    ENTRY( lgamma, INFINITY, INFINITY, FTZ_OFF, unaryF),
    ENTRY( lgamma_r, INFINITY, INFINITY, FTZ_OFF, unaryF_two_results_i),
    ENTRY_EXT( log, 3.0f, 4.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF), //relaxed ulp 2^-21
    ENTRY_EXT( log2, 3.0f, 4.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF), //relaxed ulp 2^-21
    ENTRY( log10, 3.0f, 4.0f, FTZ_OFF, unaryF),
    ENTRY( log1p, 2.0f, 4.0f, FTZ_OFF, unaryF),
    ENTRY( logb, 0.0f, 0.0f, FTZ_OFF, unaryF),
    ENTRY_EXT( mad, INFINITY, INFINITY, INFINITY, FTZ_OFF, mad_function), //in fast-relaxed-math mode it has to be either exactly rounded fma or exactly rounded a*b+c
    ENTRY( maxmag, 0.0f, 0.0f, FTZ_OFF, binaryF ),
    ENTRY( minmag, 0.0f, 0.0f, FTZ_OFF, binaryF ),
    ENTRY( modf, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results ),
    ENTRY( nan, 0.0f, 0.0f, FTZ_OFF, unaryF_u),
    ENTRY( nextafter, 0.0f, 0.0f, FTZ_OFF, binaryF_nextafter),
    ENTRY_EXT( pow, 16.0f, 16.0f, 8192.0f, FTZ_OFF, binaryF), //in derived mode the ulp error is calculated as exp2(y*log2(x)) and in non-derived it is the same as half_pow
    ENTRY( pown, 16.0f, 16.0f, FTZ_OFF, binaryF_i),
    ENTRY( powr, 16.0f, 16.0f, FTZ_OFF, binaryF),
//  ENTRY( reciprocal, 1.0f, 1.0f, FTZ_OFF, unaryF),
    ENTRY( remainder, 0.0f, 0.0f, FTZ_OFF, binaryF),
    ENTRY( remquo, 0.0f, 0.0f, FTZ_OFF, binaryF_two_results_i),
    ENTRY( rint, 0.0f, 0.0f, FTZ_OFF, unaryF),
    ENTRY( rootn, 16.0f, 16.0f, FTZ_OFF, binaryF_i),
    ENTRY( round, 0.0f, 0.0f, FTZ_OFF, unaryF),
    ENTRY( rsqrt, 2.0f, 4.0f, FTZ_OFF, unaryF),
    ENTRY( signbit, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
    ENTRY_EXT( sin, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF), //relaxed ulp 2^-11
    ENTRY_EXT( sincos, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF_two_results), //relaxed ulp 2^-11
    ENTRY( sinh, 4.0f, 4.0f, FTZ_OFF, unaryF),
    ENTRY( sinpi, 4.0f, 4.0f, FTZ_OFF, unaryF),
    // sqrt gets hand-written rows: the regular test (3 ulp float / correctly
    // rounded double) and a separate correctly-rounded-everywhere variant.
    { "sqrt", "sqrt", {(void*)reference_sqrt}, {(void*)reference_sqrtl}, {NULL}, 3.0f, 0.0f, 4.0f, INFINITY, FTZ_OFF, RELAXED_OFF, unaryF },
    { "sqrt_cr", "sqrt", {(void*)reference_sqrt}, {(void*)reference_sqrtl}, {NULL}, 0.0f, 0.0f, 0.0f, INFINITY, FTZ_OFF, RELAXED_OFF, unaryF },
    ENTRY_EXT( tan, 5.0f, 5.0f, 8192.0f, FTZ_OFF, unaryF), //in derived mode the ulp error is calculated as sin/cos and in non-derived mode it is the same as half_tan.
    ENTRY( tanh, 5.0f, 5.0f, FTZ_OFF, unaryF),
    ENTRY( tanpi, 6.0f, 6.0f, FTZ_OFF, unaryF),
//  ENTRY( tgamma, 16.0f, 16.0f, FTZ_OFF, unaryF), // Commented this out until we can be sure this requirement is realistic
    ENTRY( trunc, 0.0f, 0.0f, FTZ_OFF, unaryF),

    // half_<func> variants (OpenCL spec section 6): 8192 ulp, FTZ allowed.
    HALF_ENTRY( cos, 8192.0f, 8192.0f, FTZ_ON, unaryF),
    HALF_ENTRY( divide, 8192.0f, 8192.0f, FTZ_ON, binaryF),
    HALF_ENTRY( exp, 8192.0f, 8192.0f, FTZ_ON, unaryF),
    HALF_ENTRY( exp2, 8192.0f, 8192.0f, FTZ_ON, unaryF),
    HALF_ENTRY( exp10, 8192.0f, 8192.0f, FTZ_ON, unaryF),
    HALF_ENTRY( log, 8192.0f, 8192.0f, FTZ_ON, unaryF),
    HALF_ENTRY( log2, 8192.0f, 8192.0f, FTZ_ON, unaryF),
    HALF_ENTRY( log10, 8192.0f, 8192.0f, FTZ_ON, unaryF),
    HALF_ENTRY( powr, 8192.0f, 8192.0f, FTZ_ON, binaryF),
    HALF_ENTRY( recip, 8192.0f, 8192.0f, FTZ_ON, unaryF),
    HALF_ENTRY( rsqrt, 8192.0f, 8192.0f, FTZ_ON, unaryF),
    HALF_ENTRY( sin, 8192.0f, 8192.0f, FTZ_ON, unaryF),
    HALF_ENTRY( sqrt, 8192.0f, 8192.0f, FTZ_ON, unaryF),
    HALF_ENTRY( tan, 8192.0f, 8192.0f, FTZ_ON, unaryF),

    // basic operations
    OPERATOR_ENTRY( add, "+", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
    OPERATOR_ENTRY( subtract, "-", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
    { "divide", "/", {(void*)reference_divide}, {(void*)reference_dividel}, {(void*)reference_relaxed_divide}, 2.5f, 0.0f, 3.0f, 2.5f, FTZ_OFF, RELAXED_ON, binaryOperatorF },
    { "divide_cr", "/", {(void*)reference_divide}, {(void*)reference_dividel}, {(void*)reference_relaxed_divide}, 0.0f, 0.0f, 0.0f, 0.f, FTZ_OFF, RELAXED_OFF, binaryOperatorF },
    OPERATOR_ENTRY( multiply, "*", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
    OPERATOR_ENTRY( assignment, "", 0.0f, 0.0f, FTZ_OFF, unaryF), // A simple copy operation
    OPERATOR_ENTRY( not, "!", 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
};

// Number of rows in functionList, used by main.c to iterate the table.
const size_t functionListCount = sizeof( functionList ) / sizeof( functionList[0] );

View File

@@ -0,0 +1,99 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef FUNCTIONLIST_H
#define FUNCTIONLIST_H

#include "../../test_common/harness/compat.h"

#ifndef WIN32
#include <unistd.h>
#endif

#if defined( __APPLE__ )
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

#include "../../test_common/harness/mt19937.h"

// Union of every single/double-precision reference-function signature used
// by the test. Stored as void* in the table and cast by the driver that
// knows the real signature.
typedef union fptr
{
    void *p;
    double (*f_f)(double);
    double (*f_u)(cl_uint);
    int (*i_f)(double);
    int (*i_f_f)(float);
    float (*f_ff_f)(float, float);
    double (*f_ff)(double, double);
    int (*i_ff)(double, double);
    double (*f_fi)(double, int);
    double (*f_fpf)(double, double*);
    double (*f_fpI)(double, int*);
    double (*f_ffpI)(double, double, int*);
    double (*f_fff)(double, double, double );
    float (*f_fma)(float, float, float, int);
}fptr;

// Long-double counterparts of fptr, used as the higher-precision reference
// when testing the double-precision built-ins.
typedef union dptr
{
    void *p;
    long double (*f_f)(long double);
    long double (*f_u)(cl_ulong);
    int (*i_f)(long double);
    long double (*f_ff)(long double, long double);
    int (*i_ff)(long double, long double);
    long double (*f_fi)(long double, int);
    long double (*f_fpf)(long double, long double*);
    long double (*f_fpI)(long double, int*);
    long double (*f_ffpI)(long double, long double, int*);
    long double (*f_fff)(long double, long double, long double);
}dptr;

struct Func;

// Per-signature test driver: one TestFunc pair per function shape
// (unary, binary, ternary, ...), defined in the matching .c file.
typedef struct vtbl
{
    const char *type_name;
    int (*TestFunc)( const struct Func *, MTdata );
    int (*DoubleTestFunc)( const struct Func *, MTdata);         // may be NULL if function is single precision only
}vtbl;

// One row of the function table (see FunctionList.c for field semantics).
typedef struct Func
{
    const char *name;               // common name, to be used as an argument in the shell
    const char *nameInCode;         // name as it appears in the __kernel, usually the same as name, but different for multiplication
    fptr func;                      // double-precision reference implementation
    dptr dfunc;                     // long-double reference implementation
    fptr rfunc;                     // fast-relaxed-math reference implementation
    float float_ulps;               // allowed error, full profile, float
    float double_ulps;              // allowed error, full profile, double
    float float_embedded_ulps;      // allowed error, embedded profile, float
    float relaxed_error;            // allowed error under -cl-fast-relaxed-math
    int ftz;                        // nonzero: function may flush denormals to zero
    int relaxed;                    // nonzero: has a distinct relaxed-math test
    // Driver for this function's signature. The '::' scope qualifier means
    // this header only compiles as C++ (the build forces LANGUAGE CXX /
    // /TP); it disambiguates the global type from this member's name.
    const ::vtbl *vtbl;
}Func;

extern const Func functionList[];
extern const size_t functionListCount;

#endif

View File

@@ -0,0 +1,36 @@
# Boost.Jam build description for the bruteforce math test executable.
# Links against the shared test harness and the OpenCL runtime; under MSVC
# all sources are forced to compile as C++ (/TP), matching the CMake build.
project
    : requirements
      -<library>/harness//harness <use>/harness//harness
      <library>/Runtime//OpenCL.lib
#     <toolset>gcc:<cflags>-xc++
      <toolset>msvc:<cflags>"/TP"
    ;

# The test executable, built from the module sources plus the harness RNG;
# msvc9.c is only needed (and only added) on Windows.
exe bruteforce
    : binary.c
      binary_i.c
      binaryOperator.c
      binary_two_results_i.c
      FunctionList.c
      i_unary.c
      macro_binary.c
      macro_unary.c
      mad.c
      main.c
      reference_math.c
      Sleep.c
      ternary.c
      unary.c
      unary_two_results.c
      unary_two_results_i.c
      unary_u.c
      Utility.c
      /harness//mt19937.c
    : <target-os>windows:<source>/harness//msvc9.c
    ;

# Install the binary into the per-variant distribution tree.
install dist
    : bruteforce
    : <variant>debug:<location>$(DIST)/debug/tests/test_conformance/math_brute_force
      <variant>release:<location>$(DIST)/release/tests/test_conformance/math_brute_force
    ;

View File

@@ -0,0 +1,33 @@
# Standalone (Apple) makefile for the bruteforce math test.
# Default goal is `release` (first target). `test`/`test64` launch long
# background runs; see cpu*.log / gpu*.log for results.
ifdef BUILD_WITH_ATF
ATF = -framework ATF
USE_ATF = -DUSE_ATF
endif

CC = cc
CFLAGS = -g -Wall -Wshorten-64-to-32 $(COMPILERFLAGS) ${RC_CFLAGS} ${USE_ATF}
LIBRARIES = -framework OpenCL -framework ApplicationServices -framework IOKit -I/System/Library/Frameworks/OpenCL.framework/Headers ${RC_CFLAGS} ${ATF}

# Shared harness sources, factored out so release/debug stay in sync.
HARNESS_SRCS = ../../test_common/harness/mt19937.c ../../test_common/harness/rounding_mode.c ../../test_common/harness/ThreadPool.c

# BUG fix: these targets produce no file of their own name; without .PHONY a
# stray file named "test" or "clean" would silently disable them.
.PHONY: release debug test test64 clean all

release:
	echo "Build Release"
	$(CC) *.c $(HARNESS_SRCS) -Os $(CFLAGS) -o bruteforce $(LIBRARIES)

debug:
	echo "Build Debug"
	$(CC) *.c $(HARNESS_SRCS) -O0 $(CFLAGS) -D_DEBUG=1 -o bruteforce_debug $(LIBRARIES)

test: release
	arch -i386 ./bruteforce -c > cpu.log &
	arch -i386 ./bruteforce -g > gpu.log &
	echo "Testing 32-bit mode in progress. This may take up to 1 day to complete. See cpu.log and gpu.log for results."

test64: release
	arch -x86_64 ./bruteforce -c > cpu64.log &
	arch -x86_64 ./bruteforce -g > gpu64.log &
	echo "Testing 64-bit mode in progress. This may take up to 1 day to complete. See cpu64.log and gpu64.log for results."

clean:
	rm -f ./bruteforce_debug
	rm -f ./bruteforce

all: release

View File

@@ -0,0 +1,150 @@
Copyright: (c) 2009-2013 by Apple Inc. All Rights Reserved.
math_brute_force test Feb 24, 2009
=====================
Usage:
Please run the executable with --help for usage information.
System Requirements:
This test requires support for correctly rounded single and double precision arithmetic.
The current version also requires a reasonably accurate operating system math library to
be present. The OpenCL implementation must be able to compile kernels online. The test assumes
that the host system stores its floating point data according to the IEEE-754 binary single and
double precision floating point formats.
Test Completion Time:
This test takes a while. Modern desktop systems can usually finish it in 1-3
days. Engineers doing OpenCL math library software development may find wimpy mode (-w)
a useful screen to quickly look for problems in a new implementation, before committing
to a lengthy test run. Likewise, it is possible to run just a range of tests, or specific
tests. See Usage above.
Test Design:
This test is designed to do a somewhat exhaustive examination of the single
and double precision math library functions in OpenCL, for all vector lengths. Math
library functions are compared against results from a higher precision reference
function to determine correctness. All possible inputs are examined for unary
single precision functions. Other functions are tested against a table of difficult
values, followed by a few billion random values. If an error is found in a function,
the test for that function terminates early, reports an error, and moves on to the
next test, if any.
The test currently doesn't support half precision math functions covered in section
9 of the OpenCL 1.0 specification, but does cover the half_func functions covered in
section six. It also doesn't test the native_<funcname> functions, for which any result
is conformant.
For the OpenCL 1.0 time frame, the reference library shall be the operating system
math library, as modified by the test itself to conform to the OpenCL specification.
That will help ensure that all devices on a particular operating system are returning
similar results. Going forward to future OpenCL releases, it is planned to gradually
introduce a reference math library directly into the test, so as to reduce inter-
platform variance between OpenCL implementations.
Generally speaking, this test will consider a result correct if it is one of the following:
1) bitwise identical to the output of the reference function,
rounded to the appropriate precision
2) within the allowed ulp error tolerance of the infinitely precise
result (as estimated by the reference function)
3) If the reference result is a NaN, then any NaN is deemed correct.
4) if the device is running in FTZ mode, then the result is also correct
if the infinitely precise result (as estimated by the reference
function) is subnormal, and the returned result is a zero
5) if the device is running in FTZ mode, then we also calculate the
estimate of the infinitely precise result with the reference function
with subnormal inputs flushed to +- zero. If any of those results
are within the error tolerance of the returned result, then it is
deemed correct
6) half_func functions may flush per 4&5 above, even if the device is not
in FTZ mode.
7) Functions are allowed to prematurely overflow to infinity, so long as
the estimated infinitely precise result is within the stated ulp
error limit of the maximum finite representable value of appropriate
sign
8) Functions are allowed to prematurely underflow (and if in FTZ mode,
have behavior covered by 4&5 above), so long as the estimated
infinitely precise result is within the stated ulp error limit
of the minimum normal representable value of appropriate sign
9) Some functions have limited range. Results of inputs outside that range
are considered correct, so long as a result is returned.
10) Some functions have infinite error bounds. Results of these function
are considered correct, so long as a result is returned.
11) The test currently does not discriminate based on the sign of zero
We anticipate a later test will.
12) The test currently does not check to make sure that edge cases called
out in the standard (e.g. pow(1.0, any) = 1.0) are exactly correct.
We anticipate a later test will.
13) The test doesn't check IEEE flags or exceptions. See section 7.3 of the
OpenCL standard.
Performance Measurement:
There is also some optional timing code available, currently turned off by default.
These may be useful for tracking internal performance regressions, but is not required to
be part of the conformance submission.
If the test is believed to be in error:
The above correctness heuristics shall not be construed to be an alternative to the correctness
criteria established by the OpenCL standard. An implementation shall be judged correct
or not on appeal based on whether it is within prescribed error bounds of the infinitely
precise result. (The ulp is defined in section 7.4 of the spec.) If the input value corresponds
to an edge case listed in OpenCL specification sections covering edge case behavior, or
similar sections in the C99 TC2 standard (section F.9 and G.6), then the function shall return
exactly that result, and the sign of a zero result shall be correct. In the event that the test
is found to be faulty, resulting in a spurious failure result, the committee shall make a reasonable
attempt to fix the test. If no practical and timely remedy can be found, then the implementation
shall be granted a waiver.
Guidelines for reference function error tolerances:
Errors are measured in ulps, and stored in a single precision representation. So as
to avoid introducing error into the error measurement due to error in the reference function
itself, the reference function should attempt to deliver 24 bits more precision than the test
function return type. (All functions are currently either required to be correctly rounded or
may have >= 1 ulp of error. This places the 1's bit at the LSB of the result, with 23 bits of
sub-ulp accuracy. One more bit is required to avoid accrual of extra error due to round-to-
nearest behavior. If we start to require sub-ulp precision, then the accuracy requirements
for reference functions increase.) Therefore reference functions for single precision should
have 24+24=48 bits of accuracy, and reference functions for double precision should ideally
have 53+24 = 77 bits of accuracy.
A double precision system math library function should be sufficient to safely verify a single
precision OpenCL math library function. A long double precision math library function may or
may not be sufficient to verify a double precision OpenCL math library function, depending on
the precision of the long double type. A later version of these tests is expected to replace
long double with a head+tail double double representation that can represent sufficient precision,
on all platforms that support double.
Revision history:
Feb 24, 2009 IRO Created README
Added some reference functions so the test will run on Windows.

View File

@@ -0,0 +1,118 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "Sleep.h"
#include "Utility.h"
#if defined( __APPLE__ )
#include <IOKit/pwr_mgt/IOPMLib.h>
#include <IOKit/IOMessage.h>
// Global state for the macOS power-management hookup. Filled in by
// PreventSleep() and torn down by ResumeSleep().
struct
{
    io_connect_t connection;
    IONotificationPortRef port;
    io_object_t iterator;
}sleepInfo;

void sleepCallback( void * refcon,
                    io_service_t service,
                    natural_t messageType,
                    void * messageArgument );

// IOKit power-event callback: vetoes idle sleep while the test is running,
// but acknowledges (and logs) forced/demand sleep, which cannot be refused.
void sleepCallback( void * refcon UNUSED,
                    io_service_t service UNUSED,
                    natural_t messageType,
                    void * messageArgument )
{
    IOReturn result;

    /*
        service -- The IOService whose state has changed.
        messageType -- A messageType enum, defined by IOKit/IOMessage.h or by the IOService's family.
        messageArgument -- An argument for the message, dependent on the messageType.
    */
    switch ( messageType )
    {
        case kIOMessageSystemWillSleep:
            // Handle demand sleep (such as sleep caused by running out of
            // batteries, closing the lid of a laptop, or selecting
            // sleep from the Apple menu. This cannot be vetoed; acknowledge it.
            IOAllowPowerChange(sleepInfo.connection,(long)messageArgument);
            vlog( "Hard sleep occurred.\n" );
            break;

        case kIOMessageCanSystemSleep:
            // In this case, the computer has been idle for several minutes
            // and will sleep soon so you must either allow or cancel
            // this notification. Important: if you don't respond, there will
            // be a 30-second timeout before the computer sleeps.
            // IOCancelPowerChange(root_port,(long)messageArgument);
            result = IOCancelPowerChange(sleepInfo.connection,(long)messageArgument);
            if( kIOReturnSuccess != result )
                vlog( "sleep prevention failed. (%d)\n", result);
            break;

        case kIOMessageSystemHasPoweredOn:
            // Handle wakeup.
            break;
    }
}
#endif
// Register with IOKit power management so sleepCallback() can veto idle
// sleep for the duration of the (possibly multi-day) run. On non-Apple
// platforms this only logs that it is unimplemented.
void PreventSleep( void )
{
#if defined( __APPLE__ )
    vlog( "Disabling sleep... " );
    sleepInfo.iterator = (io_object_t) 0;
    sleepInfo.port = NULL;
    sleepInfo.connection = IORegisterForSystemPower
                           (
                               &sleepInfo,           //void * refcon,
                               &sleepInfo.port,      //IONotificationPortRef * thePortRef,
                               sleepCallback,        //IOServiceInterestCallback callback,
                               &sleepInfo.iterator   //io_object_t * notifier
                           );

    if( (io_connect_t) 0 == sleepInfo.connection )
        vlog( "failed.\n" );
    else
        vlog( "done.\n" );

    // NOTE(review): this runs even when registration failed above, in which
    // case sleepInfo.port may still be NULL -- confirm whether this call
    // should be guarded by the success check.
    CFRunLoopAddSource(CFRunLoopGetCurrent(),
                       IONotificationPortGetRunLoopSource(sleepInfo.port),
                       kCFRunLoopDefaultMode);
#else
    vlog( "*** PreventSleep() is not implemented on this platform.\n" );
#endif
}
// Undo PreventSleep(): deregister from system power notifications so the
// machine may idle-sleep normally again. Logs the outcome either way.
void ResumeSleep( void )
{
#if defined( __APPLE__ )
    IOReturn result = IODeregisterForSystemPower ( &sleepInfo.iterator );
    if( 0 != result )
        vlog( "Got error %d restoring sleep \n", result );
    else
        vlog( "Sleep restored.\n" );
#else
    vlog( "*** ResumeSleep() is not implemented on this platform.\n" );
#endif
}

View File

@@ -0,0 +1,24 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef SLEEP_H
#define SLEEP_H

// Stop the host OS from idle-sleeping during a long test run (macOS only;
// a logged no-op elsewhere).
void PreventSleep( void );

// Re-enable normal system sleep after testing completes.
void ResumeSleep( void );

#endif /* SLEEP_H */

View File

@@ -0,0 +1,169 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "Utility.h"
#if defined(__PPC__)
// Global variable used to hold the FPU control register state. The FPSCR
// register can not be used because not all Power implementations retain or
// observe the NI (non-IEEE mode) bit.
__thread fpu_control_t fpu_control = 0;
#endif
// Exact double-double product: on return *rhi + *rlo == u * v exactly,
// with *rhi = fl(u*v) and *rlo the rounding error (Dekker's TwoProduct,
// using Veltkamp splitting with the constant 2^27 + 1).
void MulD(double *rhi, double *rlo, double u, double v)
{
    const double splitter = 134217729.0;        // 1 + 2^27

    // Split u into two non-overlapping ~26-bit halves.
    double t = u * splitter;
    double uhi = (u - t) + t;
    double ulo = u - uhi;

    // Split v the same way.
    t = v * splitter;
    double vhi = (v - t) + t;
    double vlo = v - vhi;

    // Rounded product plus its exact error, accumulated from the partials.
    double prod = u * v;
    double err = (((uhi * vhi - prod) + (uhi * vlo)) + (ulo * vhi)) + (ulo * vlo);

    *rhi = prod;
    *rlo = err;
}
// Exact double-double sum (two-sum with a magnitude test): on return
// *rhi = fl(a+b) and *rlo is the exact rounding error, so
// *rhi + *rlo == a + b exactly.
void AddD(double *rhi, double *rlo, double a, double b)
{
    double s = a + b;
    double err;

    // Recover the error from the operand of larger magnitude.
    if (fabs(a) > fabs(b))
        err = b - (s - a);
    else
        err = a - (s - b);

    *rhi = s;
    *rlo = err;
}
// Double-double multiply: (xh + xl) * (yh + yl) -> normalized (*rhi, *rlo).
// Dekker product of the high parts, then the first-order cross terms
// xh*yl + xl*yh are folded in (the xl*yl term is below working precision).
void MulDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl)
{
    const double splitter = 134217729.0;        // 1 + 2^27

    // Veltkamp-split the two high parts.
    double t = xh * splitter;
    double a1 = (xh - t) + t;
    double a2 = xh - a1;
    t = yh * splitter;
    double b1 = (yh - t) + t;
    double b2 = yh - b1;

    // Exact product of the high parts.
    double ph = xh * yh;
    double pl = (((a1 * b1 - ph) + (a1 * b2)) + (a2 * b1)) + (a2 * b2);

    // Add the cross terms, then renormalize into a proper double-double.
    pl += xh * yl + xl * yh;
    double s = ph + pl;
    *rlo = (ph - s) + pl;
    *rhi = s;
}
// Double-double add: (xh + xl) + (yh + yl) -> normalized (*rhi, *rlo).
// The correction term is accumulated from the smaller-magnitude operand,
// then the pair is renormalized with a fast two-sum.
void AddDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl)
{
    double sum = xh + yh;
    double corr;

    if (fabs(xh) > fabs(yh))
        corr = xh - sum + yh + yl + xl;
    else
        corr = yh - sum + xh + xl + yl;

    double top = sum + corr;
    *rlo = (sum - top) + corr;
    *rhi = top;
}
// Double-double quotient a / b -> (*chi, *clo). Computes the rounded
// quotient q, then one Newton-style correction: the residual a - q*b is
// formed in double-double precision and divided by b to get the low word.
void DivideDD(double *chi, double *clo, double a, double b)
{
    double q = a / b;

    // residual = a - q*b, computed exactly via MulD / AddDD.
    double res_hi, res_lo;
    MulD(&res_hi, &res_lo, q, b);
    AddDD(&res_hi, &res_lo, -res_hi, -res_lo, a, 0.0);

    *chi = q;
    *clo = res_hi / b;
}
// Compare two floats through their bit patterns rather than with native FP
// comparison. Some platforms flush denormal operands to zero before a float
// compare, which would give wrong answers exactly when denormal ordering
// matters; mapping the sign-magnitude encoding into a monotonically ordered
// signed-integer space avoids that. NaNs are NOT handled here -- callers
// must screen them out as a special edge case first. When flushing to zero
// is acceptable, plain float comparison should be used instead.
// Returns 0 if x == y, 1 if x > y and -1 if x < y.
inline
int compareFloats(float x, float y)
{
    int32f_t u, v;
    u.f = x;
    v.f = y;

    // Fold negative values: sign-magnitude -> ordered integer space.
    if( u.i & 0x80000000 )
        u.i = 0x80000000 - u.i;
    if( v.i & 0x80000000 )
        v.i = 0x80000000 - v.i;

    if( u.i < v.i )
        return -1;
    return ( u.i > v.i ) ? 1 : 0;
}
// Double-precision counterpart of compareFloats: totally orders two doubles
// via a 64-bit sign-magnitude-to-ordered-integer mapping so that denormals
// compare correctly even where the FPU would flush them. NaNs must be
// screened out by the caller.
// Returns 0 if x == y, 1 if x > y and -1 if x < y.
inline
int compareDoubles(double x, double y)
{
    int64d_t p, q;
    p.d = x;
    q.d = y;

    if( p.l & 0x8000000000000000LL )
        p.l = 0x8000000000000000LL - p.l;
    if( q.l & 0x8000000000000000LL )
        q.l = 0x8000000000000000LL - q.l;

    if( p.l < q.l )
        return -1;
    return ( p.l > q.l ) ? 1 : 0;
}
// Log the "<name> <precision> <relaxed?>" prefix of a test-result line via
// vlog. float_size selects the precision label by the byte size of the CL
// scalar type (8 -> fp64, 4 -> fp32, 2 -> fp16); isFastRelaxed adds "rlx".
void logFunctionInfo(const char *fname, unsigned int float_size, unsigned int isFastRelaxed)
{
    // BUG fix: fpSizeStr previously stayed NULL for an unexpected
    // float_size and was then passed to a %s conversion (undefined
    // behavior). Default to a visible placeholder instead.
    char const *fpSizeStr = "fp??";
    char const *fpFastRelaxedStr = "";

    switch (float_size)
    {
        case sizeof(cl_double):
            fpSizeStr = "fp64";
            break;
        case sizeof(cl_float):
            fpSizeStr = "fp32";
            break;
        case sizeof(cl_half):
            fpSizeStr = "fp16";
            break;
    }

    if (isFastRelaxed)
    {
        fpFastRelaxedStr = "rlx";
    }

    vlog("%15s %4s %4s", fname, fpSizeStr, fpFastRelaxedStr);
}

View File

@@ -0,0 +1,280 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef UTILITY_H
#define UTILITY_H
#include "../../test_common/harness/compat.h"
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
#include <stdio.h>
#include "../../test_common/harness/rounding_mode.h"
#include "../../test_common/harness/fpcontrol.h"
#if defined( _WIN32) && defined (_MSC_VER)
#include "../../test_common/harness/testHarness.h"
#endif
#include "../../test_common/harness/ThreadPool.h"
#define BUFFER_SIZE (1024*1024*2)
#if defined( __GNUC__ )
#define UNUSED __attribute__ ((unused))
#else
#define UNUSED
#endif
extern int gWimpyBufferSize;
extern int gWimpyReductionFactor;
#define VECTOR_SIZE_COUNT 6
extern const char *sizeNames[VECTOR_SIZE_COUNT];
extern const int sizeValues[VECTOR_SIZE_COUNT];
extern cl_device_type gDeviceType;
extern cl_device_id gDevice;
extern cl_context gContext;
extern cl_command_queue gQueue;
extern void *gIn;
extern void *gIn2;
extern void *gIn3;
extern void *gOut_Ref;
extern void *gOut_Ref2;
extern void *gOut[VECTOR_SIZE_COUNT];
extern void *gOut2[VECTOR_SIZE_COUNT];
extern cl_mem gInBuffer;
extern cl_mem gInBuffer2;
extern cl_mem gInBuffer3;
extern cl_mem gOutBuffer[VECTOR_SIZE_COUNT];
extern cl_mem gOutBuffer2[VECTOR_SIZE_COUNT];
extern uint32_t gComputeDevices;
extern uint32_t gSimdSize;
extern int gSkipCorrectnessTesting;
extern int gMeasureTimes;
extern int gReportAverageTimes;
extern int gForceFTZ;
extern volatile int gTestFastRelaxed;
extern int gFastRelaxedDerived;
extern int gWimpyMode;
extern int gHasDouble;
extern int gIsInRTZMode;
extern int gInfNanSupport;
extern int gIsEmbedded;
extern int gVerboseBruteForce;
extern uint32_t gMaxVectorSizeIndex;
extern uint32_t gMinVectorSizeIndex;
extern uint32_t gDeviceFrequency;
extern cl_device_fp_config gFloatCapabilities;
extern cl_device_fp_config gDoubleCapabilities;
#define LOWER_IS_BETTER 0
#define HIGHER_IS_BETTER 1
#if USE_ATF
#include <ATF/ATF.h>
#define test_start() ATFTestStart()
#define test_finish() ATFTestFinish()
#define vlog( ... ) ATFLogInfo(__VA_ARGS__)
#define vlog_error( ... ) ATFLogError(__VA_ARGS__)
#define vlog_perf( _number, _higherIsBetter, _units, _nameFmt, ... ) ATFLogPerformanceNumber(_number, _higherIsBetter, _units, _nameFmt, __VA_ARGS__ )
#else
#include "../../test_common/harness/errorHelpers.h"
#endif
#if defined (_MSC_VER )
//Deal with missing scalbn on windows
#define scalbnf( _a, _i ) ldexpf( _a, _i )
#define scalbn( _a, _i ) ldexp( _a, _i )
#define scalbnl( _a, _i ) ldexpl( _a, _i )
#endif
#ifdef __cplusplus
extern "C" {
#endif
float Abs_Error( float test, double reference );
float Ulp_Error( float test, double reference );
//float Ulp_Error_Half( float test, double reference );
float Ulp_Error_Double( double test, long double reference );
#ifdef __cplusplus
} //extern "C"
#endif
uint64_t GetTime( void );
double SubtractTime( uint64_t endTime, uint64_t startTime );
int MakeKernel( const char **c, cl_uint count, const char *name, cl_kernel *k, cl_program *p );
int MakeKernels( const char **c, cl_uint count, const char *name, cl_uint kernel_count, cl_kernel *k, cl_program *p );
// used to convert a bucket of bits into a search pattern through double:
// the low 12 bits of the seed land in the low mantissa, the high 20 bits in
// the sign/exponent/high-mantissa field, and the middle 32 bits become all
// ones or all zeros by sign-extending bit 11 of the seed.
// e.g. 0x89abcdef -> 0x89abbFFFFFFFFdef
static inline double DoubleFromUInt32( uint32_t bits );
static inline double DoubleFromUInt32( uint32_t bits )
{
    union { uint64_t bits64; double value; } result;
    uint64_t lowPart  = bits & 0xfffU;
    uint64_t highPart = (uint64_t) (bits & ~0xfffU) << 32;
    result.bits64 = highPart | lowPart;
    // Sign-extend bit 11 of the seed through the middle region.
    result.bits64 -= (bits & 0x800U) << 1;
    return result.value;
}
void _LogBuildError( cl_program p, int line, const char *file );
#define LogBuildError( program ) _LogBuildError( program, __LINE__, __FILE__ )
#define PERF_LOOP_COUNT 100
// Note: though this takes a double, this is for use with single precision tests
static inline int IsFloatSubnormal( double x )
{
#if 2 == FLT_RADIX
    // Test in the integer domain so the answer stays right even when the
    // FPU is flushing subnormals to zero.
    union { float f; uint32_t bits; } v;
    v.f = fabsf( (float) x );
    // Subnormal bit patterns (after clearing the sign) are
    // 0x00000001 .. 0x007fffff; the u-1 trick also rejects +0.
    return ( v.bits - 1 ) < 0x007fffffU;
#else
    // rely on floating point hardware for non-radix2 non-IEEE-754 hardware -- will fail if you flush subnormals to zero
    return fabs(x) < (double) FLT_MIN && x != 0.0;
#endif
}
// Note: takes a long double but classifies the value after conversion to double.
static inline int IsDoubleSubnormal( long double x )
{
#if 2 == FLT_RADIX
    // Integer-domain test, immune to FTZ behavior of the FPU.
    union { double d; uint64_t bits; } v;
    v.d = fabs( (double) x );
    // Subnormal bit patterns (sign cleared) are
    // 0x0000000000000001 .. 0x000fffffffffffff; u-1 also rejects +0.
    return ( v.bits - 1 ) < 0x000fffffffffffffULL;
#else
    // rely on floating point hardware for non-radix2 non-IEEE-754 hardware -- will fail if you flush subnormals to zero
    return fabs(x) < (double) DBL_MIN && x != 0.0;
#endif
}
//The spec is fairly clear that we may enforce a hard cutoff to prevent premature flushing to zero.
// However, to avoid conflict for 1.0, we are letting results at TYPE_MIN + ulp_limit to be flushed to zero.
static inline int IsFloatResultSubnormal( double x, float ulps )
{
    // Widen the subnormal range by ulps * (smallest float denormal), then
    // test against FLT_MIN (0x1.0p-126).
    double slack = MAKE_HEX_DOUBLE( 0x1.0p-149, 0x1, -149) * (double) ulps;
    return fabs(x) - slack < MAKE_HEX_DOUBLE( 0x1.0p-126, 0x1, -126 );
}
// Same cutoff as IsFloatResultSubnormal, but widened by an absolute error
// bound rather than a ulp count.  (No fabs on x here, matching the original.)
static inline int IsFloatResultSubnormalAbsError( double x , float abs_err)
{
    double adjusted = x - abs_err;
    return adjusted < MAKE_HEX_DOUBLE( 0x1.0p-126, 0x1, -126 );
}
// Double-precision analog of IsFloatResultSubnormal: widen the subnormal
// range by ulps * (smallest double denormal), test against DBL_MIN.
static inline int IsDoubleResultSubnormal( long double x, float ulps )
{
    long double slack = MAKE_HEX_LONG( 0x1.0p-1074, 0x1, -1074) * (long double) ulps;
    return fabsl(x) - slack < MAKE_HEX_LONG( 0x1.0p-1022, 0x1, -1022 );
}
// True iff x, rounded to single precision, is +inf or -inf.
static inline int IsFloatInfinity(double x)
{
    union { cl_float f; cl_uint bits; } v;
    v.f = (cl_float) x;
    // Infinity: exponent field all ones, mantissa zero; sign bit masked off.
    return ( ( v.bits & 0x7fffffffU ) == 0x7F800000U );
}
// True iff x, rounded to single precision, is +FLT_MAX or -FLT_MAX.
static inline int IsFloatMaxFloat(double x)
{
    union { cl_float f; cl_uint bits; } v;
    v.f = (cl_float) x;
    // Largest finite encoding, sign ignored.
    return ( ( v.bits & 0x7fffffffU ) == 0x7F7FFFFFU );
}
// True iff x, rounded to single precision, is a NaN.
static inline int IsFloatNaN(double x)
{
    union { cl_float f; cl_uint bits; } v;
    v.f = (cl_float) x;
    // NaN: exponent all ones and non-zero mantissa, i.e. magnitude bits
    // strictly above the infinity encoding.
    return ( ( v.bits & 0x7fffffffU ) > 0x7F800000U );
}
extern cl_uint RoundUpToNextPowerOfTwo( cl_uint x );
// Windows (since long double got deprecated) sets the x87 to 53-bit precision
// (that's x87 default state). This causes problems with the tests that
// convert long and ulong to float and double or otherwise deal with values
// that need more precision than 53-bit. So, set the x87 to 64-bit precision.
static inline void Force64BitFPUPrecision(void)
{
#if __MINGW32__
    // The usual method is to use _controlfp as follows:
    //     #include <float.h>
    //     _controlfp(_PC_64, _MCW_PC);
    //
    // _controlfp is available on MinGW32 but not on MinGW64. Instead of having
    // divergent code just use inline assembly which works for both.
    unsigned short int orig_cw = 0;
    unsigned short int new_cw = 0;
    __asm__ __volatile__ ("fstcw %0":"=m" (orig_cw));
    new_cw = orig_cw | 0x0300;    // set precision to 64-bit (PC field bits 8-9 = 11b)
    __asm__ __volatile__ ("fldcw %0"::"m" (new_cw));
#elif defined( _WIN32 ) && defined( __INTEL_COMPILER )
    // Unfortunately, usual method (`_controlfp( _PC_64, _MCW_PC );') does *not* work on win.x64:
    // > On the x64 architecture, changing the floating point precision is not supported.
    // (Taken from http://msdn.microsoft.com/en-us/library/e9b52ceh%28v=vs.100%29.aspx)
    int cw;
    __asm { fnstcw cw };                  // Get current value of FPU control word.
    cw = cw & 0xfffffcff | ( 3 << 8 );    // Set Precision Control to Double Extended Precision.
    __asm { fldcw cw };                   // Set new value of FPU control word.
#else
    /* Implement for other platforms if needed */
#endif
}
#ifdef __cplusplus
extern "C"
#else
extern
#endif
void memset_pattern4(void *dest, const void *src_pattern, size_t bytes );
typedef union
{
int32_t i;
float f;
}int32f_t;
typedef union
{
int64_t l;
double d;
}int64d_t;
void MulD(double *rhi, double *rlo, double u, double v);
void AddD(double *rhi, double *rlo, double a, double b);
void MulDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl);
void AddDD(double *rhi, double *rlo, double xh, double xl, double yh, double yl);
void DivideDD(double *chi, double *clo, double a, double b);
int compareFloats(float x, float y);
int compareDoubles(double x, double y);
void logFunctionInfo(const char *fname, unsigned int float_size, unsigned int isFastRelaxed);
#endif /* UTILITY_H */

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,629 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "Utility.h"
#include <string.h>
#include "FunctionList.h"
int TestFunc_Int_Float(const Func *f, MTdata);
int TestFunc_Int_Double(const Func *f, MTdata);
#if defined( __cplusplus)
extern "C"
#endif
const vtbl _i_unary = { "i_unary", TestFunc_Int_Float, TestFunc_Int_Double };
static int BuildKernel( const char *name, int vectorSize, cl_kernel *k, cl_program *p );
static int BuildKernelDouble( const char *name, int vectorSize, cl_kernel *k, cl_program *p );
// Assemble the OpenCL C source computing "out[i] = name( in[i] )" with an
// int<n> result from a float<n> input, build it, and create the kernel for
// the given vector size index.  Returns 0 on success, else a MakeKernel error.
static int BuildKernel( const char *name, int vectorSize, cl_kernel *k, cl_program *p )
{
    // Elementwise kernel used for vector sizes 1, 2, 4, 8 and 16.
    const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in)\n"
        "{\n"
        " int i = get_global_id(0);\n"
        " out[i] = ", name, "( in[i] );\n"
        "}\n"
    };

    // The 3-element vector needs vload3/vstore3 plus a scalar tail: the
    // buffer is a power of two bytes, never a multiple of 3 floats, so the
    // last work-item handles 1 or 2 leftover elements (padded with NAN).
    const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global float* in)\n"
        "{\n"
        " size_t i = get_global_id(0);\n"
        " if( i + 1 < get_global_size(0) )\n"
        " {\n"
        " float3 f0 = vload3( 0, in + 3 * i );\n"
        " int3 i0 = ", name, "( f0 );\n"
        " vstore3( i0, 0, out + 3*i );\n"
        " }\n"
        " else\n"
        " {\n"
        " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
        " float3 f0;\n"
        " switch( parity )\n"
        " {\n"
        " case 1:\n"
        " f0 = (float3)( in[3*i], NAN, NAN ); \n"
        " break;\n"
        " case 0:\n"
        " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
        " break;\n"
        " }\n"
        " int3 i0 = ", name, "( f0 );\n"
        " switch( parity )\n"
        " {\n"
        " case 0:\n"
        " out[3*i+1] = i0.y; \n"
        " // fall through\n"
        " case 1:\n"
        " out[3*i] = i0.x; \n"
        " break;\n"
        " }\n"
        " }\n"
        "}\n"
    };

    const char **kern = c;
    size_t kernSize = sizeof(c)/sizeof(c[0]);
    if( sizeValues[vectorSize] == 3 )
    {
        kern = c3;
        kernSize = sizeof(c3)/sizeof(c3[0]);
    }
    char testName[32];
    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
    return MakeKernel(kern, (cl_uint) kernSize, testName, k, p);
}
// Double-precision variant of BuildKernel: int<n> result from double<n>
// input, with the cl_khr_fp64 pragma prepended to the kernel source.
static int BuildKernelDouble( const char *name, int vectorSize, cl_kernel *k, cl_program *p )
{
    // Elementwise kernel used for vector sizes 1, 2, 4, 8 and 16.
    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
        "__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in)\n"
        "{\n"
        " int i = get_global_id(0);\n"
        " out[i] = ", name, "( in[i] );\n"
        "}\n"
    };

    // vec3 variant: vload3/vstore3 with a NAN-padded scalar tail, as in the
    // float version.
    const char *c3[] = {"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
        "__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global double* in)\n"
        "{\n"
        " size_t i = get_global_id(0);\n"
        " if( i + 1 < get_global_size(0) )\n"
        " {\n"
        " double3 f0 = vload3( 0, in + 3 * i );\n"
        " int3 i0 = ", name, "( f0 );\n"
        " vstore3( i0, 0, out + 3*i );\n"
        " }\n"
        " else\n"
        " {\n"
        " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
        " double3 f0;\n"
        " switch( parity )\n"
        " {\n"
        " case 1:\n"
        " f0 = (double3)( in[3*i], NAN, NAN ); \n"
        " break;\n"
        " case 0:\n"
        " f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
        " break;\n"
        " }\n"
        " int3 i0 = ", name, "( f0 );\n"
        " switch( parity )\n"
        " {\n"
        " case 0:\n"
        " out[3*i+1] = i0.y; \n"
        " // fall through\n"
        " case 1:\n"
        " out[3*i] = i0.x; \n"
        " break;\n"
        " }\n"
        " }\n"
        "}\n"
    };

    const char **kern = c;
    size_t kernSize = sizeof(c)/sizeof(c[0]);
    if( sizeValues[vectorSize] == 3 )
    {
        kern = c3;
        kernSize = sizeof(c3)/sizeof(c3[0]);
    }
    char testName[32];
    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
    return MakeKernel(kern, (cl_uint) kernSize, testName, k, p);
}
// Parameter block handed to the ThreadPool kernel-build callbacks.
typedef struct BuildKernelInfo
{
    cl_uint offset;             // the first vector size to build
    cl_kernel *kernels;         // out: one kernel per vector size, indexed from offset
    cl_program *programs;       // out: one program per vector size, indexed from offset
    const char *nameInCode;     // function name as it appears in the OpenCL C source
}BuildKernelInfo;
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
// ThreadPool trampoline: build the float kernel for vector size (offset + job_id).
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
{
    BuildKernelInfo *bki = (BuildKernelInfo *) p;
    cl_uint vecIndex = bki->offset + job_id;
    return BuildKernel( bki->nameInCode, vecIndex, bki->kernels + vecIndex, bki->programs + vecIndex );
}
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
// ThreadPool trampoline: build the double kernel for vector size (offset + job_id).
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
{
    BuildKernelInfo *bki = (BuildKernelInfo *) p;
    cl_uint vecIndex = bki->offset + job_id;
    return BuildKernelDouble( bki->nameInCode, vecIndex, bki->kernels + vecIndex, bki->programs + vecIndex );
}
// Exhaustive conformance test for an int-result, float-argument function:
// sweeps all 2^32 single-precision bit patterns (a strided subset in wimpy
// mode), runs the device kernel for every enabled vector size, and compares
// each result against the host reference f->func.i_f.
// Returns 0 on success, a CL error code on API failure, or -1 on mismatch.
int TestFunc_Int_Float(const Func *f, MTdata d)
{
    uint64_t i;
    uint32_t j, k;
    int error;
    cl_program programs[ VECTOR_SIZE_COUNT ];
    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
    // Zero results may be accepted for subnormal inputs if the function is
    // FTZ by spec, the device lacks denormal support, or FTZ was forced.
    int ftz = f->ftz || 0 == (gFloatCapabilities & CL_FP_DENORM) || gForceFTZ;
    size_t bufferSize = (gWimpyMode)?gWimpyBufferSize:BUFFER_SIZE;
    uint64_t step = bufferSize / sizeof( float );
    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( float )) + 1);

    logFunctionInfo(f->name,sizeof(cl_float),gTestFastRelaxed);

    if( gWimpyMode )
    {
        step = (1ULL<<32) * gWimpyReductionFactor / (512);
    }

    // This test is not using ThreadPool so we need to disable FTZ here
    // for reference computations
    FPU_mode_type oldMode;
    DisableFTZ(&oldMode);
    Force64BitFPUPrecision();

    // Init the kernels: one program/kernel per enabled vector size, compiled
    // concurrently on the thread pool.
    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode };
    if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
        return error;

    for( i = 0; i < (1ULL<<32); i += step )
    {
        // Init input array with consecutive (or, in wimpy mode, strided) bit patterns
        uint32_t *p = (uint32_t *)gIn;
        if( gWimpyMode )
        {
            for( j = 0; j < bufferSize / sizeof( float ); j++ )
                p[j] = (uint32_t) i + j * scale;
        }
        else
        {
            for( j = 0; j < bufferSize / sizeof( float ); j++ )
                p[j] = (uint32_t) i + j;
        }
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            goto exit;  // was "return error": leaked kernels/programs and skipped RestoreFPState
        }

        // write garbage into output arrays
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            uint32_t pattern = 0xffffdead;
            memset_pattern4(gOut[j], &pattern, bufferSize);
            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
            {
                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
                goto exit;
            }
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;  // round up for the vec3 tail
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
            {
                vlog_error( "FAILED -- could not execute kernel\n" );
                goto exit;
            }
        }

        // Get that moving
        if( (error = clFlush(gQueue) ))
            vlog( "clFlush failed\n" );

        //Calculate the correctly rounded reference result
        int *r = (int *)gOut_Ref;
        float *s = (float *)gIn;
        for( j = 0; j < bufferSize / sizeof( float ); j++ )
            r[j] = f->func.i_f( s[j] );

        // Read the data back (blocking read doubles as a sync point)
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
            {
                vlog_error( "ReadArray failed %d\n", error );
                goto exit;
            }
        }

        if( gSkipCorrectnessTesting )
            break;

        //Verify data
        uint32_t *t = (uint32_t *)gOut_Ref;
        for( j = 0; j < bufferSize / sizeof( float ); j++ )
        {
            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
            {
                uint32_t *q = (uint32_t *)(gOut[k]);

                // If we aren't getting the correctly rounded result
                if( t[j] != q[j] )
                {
                    // FTZ devices may legally have treated a subnormal input as +/-0.
                    if( ftz && IsFloatSubnormal(s[j]))
                    {
                        unsigned int correct0 = f->func.i_f( 0.0 );
                        unsigned int correct1 = f->func.i_f( -0.0 );
                        if( q[j] == correct0 || q[j] == correct1 )
                            continue;
                    }

                    uint32_t err = t[j] - q[j];
                    if( q[j] > t[j] )
                        err = q[j] - t[j];
                    vlog_error( "\nERROR: %s%s: %d ulp error at %a (0x%8.8x): *%d vs. %d\n", f->name, sizeNames[k], err, ((float*) gIn)[j], ((cl_uint*) gIn)[j], t[j], q[j] );
                    error = -1;
                    goto exit;
                }
            }
        }

        if( 0 == (i & 0x0fffffff) )
        {
            if (gVerboseBruteForce)
            {
                // i and step are uint64_t and bufferSize is size_t; the
                // conversion specifiers must match or the call is undefined
                // (previously i was printed with %u).
                vlog("base:%14llu step:%10llu bufferSize:%10zu \n", (unsigned long long) i, (unsigned long long) step, bufferSize);
            } else
            {
                vlog("." );
            }
            fflush(stdout);
        }
    }

    if( ! gSkipCorrectnessTesting )
    {
        if( gWimpyMode )
            vlog( "Wimp pass" );
        else
            vlog( "passed" );
    }

    if( gMeasureTimes )
    {
        //Init input array with random bit patterns for timing
        uint32_t *p = (uint32_t *)gIn;
        for( j = 0; j < bufferSize / sizeof( float ); j++ )
            p[j] = genrand_int32(d);
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            goto exit;  // was "return error": leaked kernels/programs and skipped RestoreFPState
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            double sum = 0.0;
            double bestTime = INFINITY;
            for( k = 0; k < PERF_LOOP_COUNT; k++ )
            {
                uint64_t startTime = GetTime();
                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
                {
                    vlog_error( "FAILED -- could not execute kernel\n" );
                    goto exit;
                }

                // Make sure OpenCL is done
                if( (error = clFinish(gQueue) ) )
                {
                    vlog_error( "Error %d at clFinish\n", error );
                    goto exit;
                }

                uint64_t endTime = GetTime();
                double time = SubtractTime( endTime, startTime );
                sum += time;
                if( time < bestTime )
                    bestTime = time;
            }

            if( gReportAverageTimes )
                bestTime = sum / PERF_LOOP_COUNT;
            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
        }
    }

    vlog( "\n" );

exit:
    RestoreFPState(&oldMode);
    // Release
    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
    {
        clReleaseKernel(kernels[k]);
        clReleaseProgram(programs[k]);
    }
    return error;
}
// Double-precision counterpart of TestFunc_Int_Float: sweeps all 2^32
// 32-bit seeds, expands each through DoubleFromUInt32 into a search pattern
// over the double encoding, and compares device results for every enabled
// vector size against the host reference f->dfunc.i_f.
// Returns 0 on success, a CL error code on API failure, or -1 on mismatch.
int TestFunc_Int_Double(const Func *f, MTdata d)
{
    uint64_t i;
    uint32_t j, k;
    int error;
    cl_program programs[ VECTOR_SIZE_COUNT ];
    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
    // NOTE(review): unlike the float path, device denorm capability
    // (CL_FP_DENORM of gDoubleCapabilities) is not consulted here.
    int ftz = f->ftz || gForceFTZ;
    size_t bufferSize = (gWimpyMode)?gWimpyBufferSize:BUFFER_SIZE;
    uint64_t step = bufferSize / sizeof( cl_double );
    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( cl_double )) + 1);

    logFunctionInfo(f->name,sizeof(cl_double),gTestFastRelaxed);

    if( gWimpyMode )
    {
        step = (1ULL<<32) * gWimpyReductionFactor / (512);
    }

    // This test is not using ThreadPool so we need to disable FTZ here
    // for reference computations
    FPU_mode_type oldMode;
    DisableFTZ(&oldMode);
    Force64BitFPUPrecision();

    // Init the kernels: one program/kernel per enabled vector size.
    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode };
    if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                &build_info ) ))
    {
        return error;
    }

    for( i = 0; i < (1ULL<<32); i += step )
    {
        //Init input array
        double *p = (double *)gIn;
        if( gWimpyMode )
        {
            for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
                p[j] = DoubleFromUInt32( (uint32_t) i + j * scale );
        }
        else
        {
            for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
                p[j] = DoubleFromUInt32( (uint32_t) i + j );
        }
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            goto exit;  // was "return error": leaked kernels/programs and skipped RestoreFPState
        }

        // write garbage into output arrays
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            uint32_t pattern = 0xffffdead;
            memset_pattern4(gOut[j], &pattern, bufferSize);
            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
            {
                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
                goto exit;
            }
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;  // round up for the vec3 tail
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
            {
                vlog_error( "FAILED -- could not execute kernel\n" );
                goto exit;
            }
        }

        // Get that moving
        if( (error = clFlush(gQueue) ))
            vlog( "clFlush failed\n" );

        //Calculate the correctly rounded reference result
        int *r = (int *)gOut_Ref;
        double *s = (double *)gIn;
        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
            r[j] = f->dfunc.i_f( s[j] );

        // Read the data back (blocking read doubles as a sync point)
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
            {
                vlog_error( "ReadArray failed %d\n", error );
                goto exit;
            }
        }

        if( gSkipCorrectnessTesting )
            break;

        //Verify data: results are 32-bit ints, one per double input element.
        uint32_t *t = (uint32_t *)gOut_Ref;
        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
        {
            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
            {
                uint32_t *q = (uint32_t *)(gOut[k]);

                // If we aren't getting the correctly rounded result
                if( t[j] != q[j] )
                {
                    // FTZ devices may legally have treated a subnormal input as +/-0.
                    if( ftz && IsDoubleSubnormal(s[j]))
                    {
                        unsigned int correct0 = f->dfunc.i_f( 0.0 );
                        unsigned int correct1 = f->dfunc.i_f( -0.0 );
                        if( q[j] == correct0 || q[j] == correct1 )
                            continue;
                    }

                    uint32_t err = t[j] - q[j];
                    if( q[j] > t[j] )
                        err = q[j] - t[j];
                    vlog_error( "\nERROR: %sD%s: %d ulp error at %.13la: *%d vs. %d\n", f->name, sizeNames[k], err, ((double*) gIn)[j], t[j], q[j] );
                    error = -1;
                    goto exit;
                }
            }
        }

        if( 0 == (i & 0x0fffffff) )
        {
            if (gVerboseBruteForce)
            {
                // i and step are uint64_t and bufferSize is size_t; the
                // conversion specifiers must match or the call is undefined
                // (previously i was printed with %u).
                vlog("base:%14llu step:%10llu bufferSize:%10zu \n", (unsigned long long) i, (unsigned long long) step, bufferSize);
            } else
            {
                vlog("." );
            }
            fflush(stdout);
        }
    }

    if( ! gSkipCorrectnessTesting )
    {
        if( gWimpyMode )
            vlog( "Wimp pass" );
        else
            vlog( "passed" );
    }

    if( gMeasureTimes )
    {
        //Init input array with random bit patterns for timing
        double *p = (double *)gIn;
        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
            p[j] = DoubleFromUInt32( genrand_int32(d) );
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            goto exit;  // was "return error": leaked kernels/programs and skipped RestoreFPState
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            double sum = 0.0;
            double bestTime = INFINITY;
            for( k = 0; k < PERF_LOOP_COUNT; k++ )
            {
                uint64_t startTime = GetTime();
                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
                {
                    vlog_error( "FAILED -- could not execute kernel\n" );
                    goto exit;
                }

                // Make sure OpenCL is done
                if( (error = clFinish(gQueue) ) )
                {
                    vlog_error( "Error %d at clFinish\n", error );
                    goto exit;
                }

                uint64_t endTime = GetTime();
                double time = SubtractTime( endTime, startTime );
                sum += time;
                if( time < bestTime )
                    bestTime = time;
            }

            if( gReportAverageTimes )
                bestTime = sum / PERF_LOOP_COUNT;
            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
        }
        // Dead code (j == gMaxVectorSizeIndex after the loop above); kept for
        // symmetry with other tests' column-alignment output.
        for( ; j < gMaxVectorSizeIndex; j++ )
            vlog( "\t -- " );
    }

    vlog( "\n" );

exit:
    RestoreFPState(&oldMode);
    // Release
    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
    {
        clReleaseKernel(kernels[k]);
        clReleaseProgram(programs[k]);
    }
    return error;
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,965 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "Utility.h"
#include <string.h>
#include "FunctionList.h"
int TestMacro_Int_Float(const Func *f, MTdata);
int TestMacro_Int_Double(const Func *f, MTdata);
#if defined( __cplusplus)
extern "C"
#endif
const vtbl _macro_unary = { "macro_unary", TestMacro_Int_Float, TestMacro_Int_Double };
static int BuildKernel( const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p );
static int BuildKernelDouble( const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p );
// Build kernel_count copies of the "int<n> = name( float<n> )" kernel for one
// vector size (one kernel object per worker thread; cl_kernel is not
// thread-safe).  Returns 0 on success, else a MakeKernels error.
static int BuildKernel( const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p )
{
    // Elementwise kernel used for vector sizes 1, 2, 4, 8 and 16.
    const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in)\n"
        "{\n"
        " int i = get_global_id(0);\n"
        " out[i] = ", name, "( in[i] );\n"
        "}\n"
    };

    // vec3 variant with a scalar tail for the last partial vector.
    // NOTE(review): pads with 0xdead rather than NAN (unlike i_unary and the
    // double variant below) -- presumably deliberate for macro tests; confirm.
    const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global int* out, __global float* in)\n"
        "{\n"
        " size_t i = get_global_id(0);\n"
        " if( i + 1 < get_global_size(0) )\n"
        " {\n"
        " float3 f0 = vload3( 0, in + 3 * i );\n"
        " int3 i0 = ", name, "( f0 );\n"
        " vstore3( i0, 0, out + 3*i );\n"
        " }\n"
        " else\n"
        " {\n"
        " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
        " int3 i0;\n"
        " float3 f0;\n"
        " switch( parity )\n"
        " {\n"
        " case 1:\n"
        " f0 = (float3)( in[3*i], 0xdead, 0xdead ); \n"
        " break;\n"
        " case 0:\n"
        " f0 = (float3)( in[3*i], in[3*i+1], 0xdead ); \n"
        " break;\n"
        " }\n"
        " i0 = ", name, "( f0 );\n"
        " switch( parity )\n"
        " {\n"
        " case 0:\n"
        " out[3*i+1] = i0.y; \n"
        " // fall through\n"
        " case 1:\n"
        " out[3*i] = i0.x; \n"
        " break;\n"
        " }\n"
        " }\n"
        "}\n"
    };

    const char **kern = c;
    size_t kernSize = sizeof(c)/sizeof(c[0]);
    if( sizeValues[vectorSize] == 3 )
    {
        kern = c3;
        kernSize = sizeof(c3)/sizeof(c3[0]);
    }
    char testName[32];
    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
    return MakeKernels(kern, (cl_uint) kernSize, testName, kernel_count, k, p);
}
// Build kernel_count copies of the "long<n> = name( double<n> )" kernel for
// one vector size, with the cl_khr_fp64 pragma prepended.
// Returns 0 on success, else a MakeKernels error.
static int BuildKernelDouble( const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p )
{
    // Elementwise kernel used for vector sizes 1, 2, 4, 8 and 16.
    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
        "__kernel void math_kernel", sizeNames[vectorSize], "( __global long", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in)\n"
        "{\n"
        " int i = get_global_id(0);\n"
        " out[i] = ", name, "( in[i] );\n"
        "}\n"
    };

    // vec3 variant: vload3/vstore3 with a NAN-padded scalar tail.
    const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
        "__kernel void math_kernel", sizeNames[vectorSize], "( __global long* out, __global double* in)\n"
        "{\n"
        " size_t i = get_global_id(0);\n"
        " if( i + 1 < get_global_size(0) )\n"
        " {\n"
        " double3 d0 = vload3( 0, in + 3 * i );\n"
        " long3 l0 = ", name, "( d0 );\n"
        " vstore3( l0, 0, out + 3*i );\n"
        " }\n"
        " else\n"
        " {\n"
        " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
        " double3 d0;\n"
        " switch( parity )\n"
        " {\n"
        " case 1:\n"
        " d0 = (double3)( in[3*i], NAN, NAN ); \n"
        " break;\n"
        " case 0:\n"
        " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
        " break;\n"
        " }\n"
        " long3 l0 = ", name, "( d0 );\n"
        " switch( parity )\n"
        " {\n"
        " case 0:\n"
        " out[3*i+1] = l0.y; \n"
        " // fall through\n"
        " case 1:\n"
        " out[3*i] = l0.x; \n"
        " break;\n"
        " }\n"
        " }\n"
        "}\n"
    };

    const char **kern = c;
    size_t kernSize = sizeof(c)/sizeof(c[0]);
    if( sizeValues[vectorSize] == 3 )
    {
        kern = c3;
        kernSize = sizeof(c3)/sizeof(c3[0]);
    }
    char testName[32];
    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
    return MakeKernels(kern, (cl_uint) kernSize, testName, kernel_count, k, p);
}
// Parameter block handed to the ThreadPool kernel-build callbacks.
typedef struct BuildKernelInfo
{
    cl_uint offset;             // the first vector size to build
    cl_uint kernel_count;       // number of kernel copies to create per vector size (one per worker thread)
    cl_kernel **kernels;        // out: kernels[vector_size] is an array of kernel_count kernel objects
    cl_program *programs;       // out: one program per vector size, indexed from offset
    const char *nameInCode;     // function name as it appears in the OpenCL C source
}BuildKernelInfo;
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
// ThreadPool trampoline: build the per-thread float kernels for vector size
// (offset + job_id).
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
{
    BuildKernelInfo *bki = (BuildKernelInfo *) p;
    cl_uint vecIndex = bki->offset + job_id;
    return BuildKernel( bki->nameInCode, vecIndex, bki->kernel_count, bki->kernels[vecIndex], bki->programs + vecIndex );
}
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
// ThreadPool trampoline: build the per-thread double kernels for vector size
// (offset + job_id).
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
{
    BuildKernelInfo *bki = (BuildKernelInfo *) p;
    cl_uint vecIndex = bki->offset + job_id;
    return BuildKernelDouble( bki->nameInCode, vecIndex, bki->kernel_count, bki->kernels[vecIndex], bki->programs + vecIndex );
}
//Thread specific data for a worker thread: each worker gets its own
//sub-buffers and command queue so threads never share CL objects.
typedef struct ThreadInfo
{
    cl_mem inBuf;                          // input buffer for the thread (sub-buffer of gInBuffer)
    cl_mem outBuf[ VECTOR_SIZE_COUNT ];    // output buffers for the thread (sub-buffers of gOutBuffer[i])
    cl_command_queue tQueue;               // per thread command queue to improve performance
}ThreadInfo;
// Shared state for one function's multi-threaded test run; filled in by the
// TestMacro_* entry points and read by the per-chunk worker callbacks.
typedef struct TestInfo
{
    size_t subBufferSize;                       // Size of the sub-buffer in elements
    const Func *f;                              // A pointer to the function info
    cl_program programs[ VECTOR_SIZE_COUNT ];   // programs for various vector sizes
    cl_kernel *k[VECTOR_SIZE_COUNT ];           // arrays of thread-specific kernels for each worker thread: k[vector_size][thread_id]
    ThreadInfo *tinfo;                          // An array of thread specific information for each worker thread
    cl_uint threadCount;                        // Number of worker threads
    cl_uint step;                               // step between each chunk and the next.
    cl_uint scale;                              // stride between individual test values
    int ftz;                                    // non-zero if running in flush to zero mode
}TestInfo;
static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p );
// Test a macro-style relational function (float input, int result, e.g.
// isnan/signbit) over the full 2**32 space of float bit patterns (a reduced,
// strided subset in wimpy mode).  Builds per-thread kernels for each vector
// size, carves gInBuffer/gOutBuffer into per-thread sub-buffers, then drives
// the correctness pass (TestFloat) via the thread pool; optionally times the
// kernels afterwards.  Returns 0 on success, else a CL error code or -1.
int TestMacro_Int_Float(const Func *f, MTdata d)
{
    TestInfo test_info;
    cl_int error;
    size_t i, j;
    logFunctionInfo(f->name,sizeof(cl_float),gTestFastRelaxed);
    // Init test_info
    memset( &test_info, 0, sizeof( test_info ) );
    test_info.threadCount = GetThreadCount();
    // Each worker thread owns an equal slice of the global buffers
    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
    test_info.scale = 1;
    if (gWimpyMode )
    {
        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
        test_info.scale = (cl_uint) sizeof(cl_float) * 2 * gWimpyReductionFactor;
    }
    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
    test_info.f = f;
    // FTZ if the function, the command line, or missing denormal support demands it
    test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
    {
        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
        test_info.k[i] = (cl_kernel*)malloc( array_size );
        if( NULL == test_info.k[i] )
        {
            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
            error = CL_OUT_OF_HOST_MEMORY;
            goto exit;
        }
        memset( test_info.k[i], 0, array_size );
    }
    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
    if( NULL == test_info.tinfo )
    {
        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
        error = CL_OUT_OF_HOST_MEMORY;
        goto exit;
    }
    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
    // Carve out per-thread sub-buffers and a per-thread command queue
    for( i = 0; i < test_info.threadCount; i++ )
    {
        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) };
        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
        if( error || NULL == test_info.tinfo[i].inBuf)
        {
            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
            goto exit;
        }
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
            if( error || NULL == test_info.tinfo[i].outBuf[j] )
            {
                vlog_error( "Error: Unable to create sub-buffer of gOutBuffer for region {%zd, %zd}\n", region.origin, region.size );
                goto exit;
            }
        }
        test_info.tinfo[i].tQueue = clCreateCommandQueueWithProperties(gContext, gDevice, 0, &error);
        if( NULL == test_info.tinfo[i].tQueue || error )
        {
            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
            goto exit;
        }
    }
    // Init the kernels
    {
        BuildKernelInfo build_info = { gMinVectorSizeIndex, test_info.threadCount, test_info.k, test_info.programs, f->nameInCode };
        if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
            goto exit;
    }
    if( !gSkipCorrectnessTesting )
    {
        // One job per 'step'-sized chunk of the 2**32 input space
        error = ThreadPool_Do( TestFloat, (cl_uint) ((1ULL<<32) / test_info.step), &test_info );
        if( error )
            goto exit;
        if( gWimpyMode )
            vlog( "Wimp pass" );
        else
            vlog( "passed" );
    }
    if( gMeasureTimes )
    {
        //Init input array with random bit patterns
        cl_uint *p = (cl_uint *)gIn;
        for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ )
            p[j] = genrand_int32(d);
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            return error;
        }
        // Run the kernels; only the main thread's kernel copy (index 0) is used here
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeof( cl_float ) * sizeValues[j];
            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; // BUFFER_SIZE / vectorSize rounded up
            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
            double sum = 0.0;
            double bestTime = INFINITY;
            for( i = 0; i < PERF_LOOP_COUNT; i++ )
            {
                uint64_t startTime = GetTime();
                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
                {
                    vlog_error( "FAILED -- could not execute kernel\n" );
                    goto exit;
                }
                // Make sure OpenCL is done
                if( (error = clFinish(gQueue) ) )
                {
                    vlog_error( "Error %d at clFinish\n", error );
                    goto exit;
                }
                uint64_t endTime = GetTime();
                double time = SubtractTime( endTime, startTime );
                sum += time;
                if( time < bestTime )
                    bestTime = time;
            }
            if( gReportAverageTimes )
                bestTime = sum / PERF_LOOP_COUNT;
            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) );
            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
        }
    }
    vlog( "\n" );
exit:
    // Release everything created above (safe on the partial-init error paths
    // because test_info was zeroed first)
    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
    {
        clReleaseProgram(test_info.programs[i]);
        if( test_info.k[i] )
        {
            for( j = 0; j < test_info.threadCount; j++ )
                clReleaseKernel(test_info.k[i][j]);
            free( test_info.k[i] );
        }
    }
    if( test_info.tinfo )
    {
        for( i = 0; i < test_info.threadCount; i++ )
        {
            clReleaseMemObject(test_info.tinfo[i].inBuf);
            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
        }
        free( test_info.tinfo );
    }
    return error;
}
// ThreadPool job: test one chunk of float bit patterns, [base, base + step)
// strided by 'scale', on this worker thread's sub-buffers.  Maps the output
// buffers, uploads inputs, runs the kernel for each vector size, computes the
// host reference, and verifies.
// Returns CL_SUCCESS on pass, a CL error code on API failure, or -1 on a
// verification mismatch.
// Fixes vs. original: (1) the exit path used "if( (error = ...) )" for the
// unmap/flush calls, so a successful cleanup clobbered a verification failure
// back to CL_SUCCESS; (2) removed a redundant outer loop over k in the verify
// section whose index was immediately clobbered by the inner vector-size
// loop (it effectively ran once per element, as now written); (3) corrected
// the unmap error message that said "clEnqueueMapBuffer".
static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
{
    const TestInfo *job = (const TestInfo *) data;
    size_t buffer_elements = job->subBufferSize;
    size_t buffer_size = buffer_elements * sizeof( cl_float );
    cl_uint scale = job->scale;
    cl_uint base = job_id * (cl_uint) job->step;
    ThreadInfo *tinfo = job->tinfo + thread_id;
    fptr func = job->f->func;
    int ftz = job->ftz;
    cl_uint j, k;
    cl_int error = CL_SUCCESS;
    const char *name = job->f->name;
    // signbit uses a different reference function signature than the other macros
    int signbit_test = 0;
    if(!strcmp(name, "signbit"))
        signbit_test = 1;
#define ref_func(s) ( signbit_test ? func.i_f_f( s ) : func.i_f( s ) )
    // start the map of the output arrays
    cl_event e[ VECTOR_SIZE_COUNT ];
    cl_int *out[ VECTOR_SIZE_COUNT ];
    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
    {
        out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
        if( error || NULL == out[j])
        {
            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
            return error;
        }
    }
    // Get that moving
    if( (error = clFlush(tinfo->tQueue) ))
        vlog( "clFlush failed\n" );
    // Write the new values to the input array
    cl_uint *p = (cl_uint*) gIn + thread_id * buffer_elements;
    for( j = 0; j < buffer_elements; j++ )
        p[j] = base + j * scale;
    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
    {
        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
        return error;
    }
    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
    {
        //Wait for the map to finish
        if( (error = clWaitForEvents(1, e + j) ))
        {
            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
            return error;
        }
        if( (error = clReleaseEvent( e[j] ) ))
        {
            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
            return error;
        }
        // Fill the result buffer with garbage, so that old results don't carry over
        uint32_t pattern = 0xffffdead;
        memset_pattern4(out[j], &pattern, buffer_size);
        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
        {
            vlog_error( "Error: clEnqueueUnmapMemObject failed! err: %d\n", error );
            return error;
        }
        // run the kernel
        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; //each worker thread has its own copy of the cl_kernel
        cl_program program = job->programs[j];
        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
        {
            vlog_error( "FAILED -- could not execute kernel\n" );
            return error;
        }
    }
    // Get that moving
    if( (error = clFlush(tinfo->tQueue) ))
        vlog( "clFlush 2 failed\n" );
    if( gSkipCorrectnessTesting )
        return CL_SUCCESS;
    //Calculate the correctly rounded reference result
    cl_int *r = (cl_int *)gOut_Ref + thread_id * buffer_elements;
    float *s = (float *)p;
    for( j = 0; j < buffer_elements; j++ )
        r[j] = ref_func( s[j] );
    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
    {
        out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
        if( error || NULL == out[j] )
        {
            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
            return error;
        }
    }
    // Wait for the last buffer
    out[j] = (cl_int*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
    if( error || NULL == out[j] )
    {
        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
        return error;
    }
    //Verify data
    cl_int *t = (cl_int *)r;
    for( j = 0; j < buffer_elements; j++ )
    {
        cl_int *q = out[0];
        // Check the scalar kernel result, if scalar is being tested
        if( gMinVectorSizeIndex == 0 && t[j] != q[j])
        {
            // If we aren't getting the correctly rounded result
            if( ftz )
            {
                // In FTZ mode, a subnormal input may legally be flushed to +/-0
                if( IsFloatSubnormal( s[j]) )
                {
                    int correct = ref_func( +0.0f );
                    int correct2 = ref_func( -0.0f );
                    // NOTE(review): as in the original code, an FTZ-excused
                    // scalar mismatch skips the vector checks for this element
                    if( correct == q[j] || correct2 == q[j] )
                        continue;
                }
            }
            uint32_t err = t[j] - q[j];
            if( q[j] > t[j] )
                err = q[j] - t[j];
            vlog_error( "\nERROR: %s: %d ulp error at %a: *%d vs. %d\n", name, err, ((float*) s)[j], t[j], q[j] );
            error = -1;
            goto exit;
        }
        // Vector kernels encode 'true' as -1 (all bits set), so compare against -t[j]
        for( k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++ )
        {
            q = out[k];
            // If we aren't getting the correctly rounded result
            if( -t[j] != q[j] )
            {
                if( ftz )
                {
                    if( IsFloatSubnormal( s[j]))
                    {
                        int correct = -ref_func( +0.0f );
                        int correct2 = -ref_func( -0.0f );
                        if( correct == q[j] || correct2 == q[j] )
                            continue;
                    }
                }
                uint32_t err = -t[j] - q[j];
                if( q[j] > -t[j] )
                    err = q[j] + t[j];
                vlog_error( "\nERROR: %s%s: %d ulp error at %a: *%d vs. %d\n", name, sizeNames[k], err, ((float*) s)[j], -t[j], q[j] );
                error = -1;
                goto exit;
            }
        }
    }
exit:
    // Clean up with separate status variables so a verification failure stored
    // in 'error' is not clobbered by a successful unmap/flush
    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
    {
        cl_int unmap_error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL);
        if( unmap_error )
        {
            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, unmap_error );
            return unmap_error;
        }
    }
    {
        cl_int flush_error = clFlush(tinfo->tQueue);
        if( flush_error )
        {
            vlog( "clFlush 3 failed\n" );
            if( CL_SUCCESS == error )
                error = flush_error;
        }
    }
    // Progress indicator, printed once per 2**28 inputs
    if( 0 == ( base & 0x0fffffff) )
    {
        if (gVerboseBruteForce)
        {
            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->threadCount);
        } else
        {
            vlog("." );
        }
        fflush(stdout);
    }
    return error;
}
static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data );
// Double-precision variant of TestMacro_Int_Float: test a macro-style
// relational function (double input, long result) over 2**32 sample inputs
// (reduced in wimpy mode).  Builds per-thread kernels for each vector size,
// carves the global buffers into per-thread sub-buffers, then drives the
// correctness pass (TestDouble) via the thread pool; optionally times the
// kernels.  Returns 0 on success, else a CL error code or -1.
// Fix vs. original: the sub-buffer error message for gOutBuffer incorrectly
// said "gInBuffer" (copy-paste from the input-buffer path).
int TestMacro_Int_Double(const Func *f, MTdata d)
{
    TestInfo test_info;
    cl_int error;
    size_t i, j;
    logFunctionInfo(f->name,sizeof(cl_double),gTestFastRelaxed);
    // Init test_info
    memset( &test_info, 0, sizeof( test_info ) );
    test_info.threadCount = GetThreadCount();
    // Each worker thread owns an equal slice of the global buffers
    test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
    test_info.scale = 1;
    if (gWimpyMode )
    {
        test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
        test_info.scale = (cl_uint) sizeof(cl_double) * 2 * gWimpyReductionFactor;
    }
    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
    test_info.f = f;
    test_info.ftz = f->ftz || gForceFTZ;
    // cl_kernels aren't thread safe, so we make one for each vector size for every thread
    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
    {
        size_t array_size = test_info.threadCount * sizeof( cl_kernel );
        test_info.k[i] = (cl_kernel*)malloc( array_size );
        if( NULL == test_info.k[i] )
        {
            vlog_error( "Error: Unable to allocate storage for kernels!\n" );
            error = CL_OUT_OF_HOST_MEMORY;
            goto exit;
        }
        memset( test_info.k[i], 0, array_size );
    }
    test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
    if( NULL == test_info.tinfo )
    {
        vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
        error = CL_OUT_OF_HOST_MEMORY;
        goto exit;
    }
    memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
    // Carve out per-thread sub-buffers and a per-thread command queue
    for( i = 0; i < test_info.threadCount; i++ )
    {
        cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) };
        test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
        if( error || NULL == test_info.tinfo[i].inBuf)
        {
            vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
            goto exit;
        }
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            /* Qualcomm fix: 9461 read-write flags must be compatible with parent buffer */
            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
            /* Qualcomm fix: end */
            if( error || NULL == test_info.tinfo[i].outBuf[j] )
            {
                vlog_error( "Error: Unable to create sub-buffer of gOutBuffer for region {%zd, %zd}\n", region.origin, region.size );
                goto exit;
            }
        }
        test_info.tinfo[i].tQueue = clCreateCommandQueueWithProperties(gContext, gDevice, 0, &error);
        if( NULL == test_info.tinfo[i].tQueue || error )
        {
            vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
            goto exit;
        }
    }
    // Init the kernels
    {
        BuildKernelInfo build_info = { gMinVectorSizeIndex, test_info.threadCount, test_info.k, test_info.programs, f->nameInCode };
        if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
            goto exit;
    }
    if( !gSkipCorrectnessTesting )
    {
        // One job per 'step'-sized chunk of the 2**32 sample space
        error = ThreadPool_Do( TestDouble, (cl_uint) ((1ULL<<32) / test_info.step), &test_info );
        if( error )
            goto exit;
        if( gWimpyMode )
            vlog( "Wimp pass" );
        else
            vlog( "passed" );
    }
    if( gMeasureTimes )
    {
        //Init input array with random double values
        cl_ulong *p = (cl_ulong *)gIn;
        for( j = 0; j < BUFFER_SIZE / sizeof( cl_double ); j++ )
            p[j] = DoubleFromUInt32(genrand_int32(d));
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            return error;
        }
        // Run the kernels; only the main thread's kernel copy (index 0) is used here
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; // BUFFER_SIZE / vectorSize rounded up
            if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
            if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
            double sum = 0.0;
            double bestTime = INFINITY;
            for( i = 0; i < PERF_LOOP_COUNT; i++ )
            {
                uint64_t startTime = GetTime();
                if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
                {
                    vlog_error( "FAILED -- could not execute kernel\n" );
                    goto exit;
                }
                // Make sure OpenCL is done
                if( (error = clFinish(gQueue) ) )
                {
                    vlog_error( "Error %d at clFinish\n", error );
                    goto exit;
                }
                uint64_t endTime = GetTime();
                double time = SubtractTime( endTime, startTime );
                sum += time;
                if( time < bestTime )
                    bestTime = time;
            }
            if( gReportAverageTimes )
                bestTime = sum / PERF_LOOP_COUNT;
            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) );
            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
        }
        for( ; j < gMaxVectorSizeIndex; j++ )
            vlog( "\t -- " );
    }
    vlog( "\n" );
exit:
    // Release everything created above (safe on the partial-init error paths
    // because test_info was zeroed first)
    for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
    {
        clReleaseProgram(test_info.programs[i]);
        if( test_info.k[i] )
        {
            for( j = 0; j < test_info.threadCount; j++ )
                clReleaseKernel(test_info.k[i][j]);
            free( test_info.k[i] );
        }
    }
    if( test_info.tinfo )
    {
        for( i = 0; i < test_info.threadCount; i++ )
        {
            clReleaseMemObject(test_info.tinfo[i].inBuf);
            for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
        }
        free( test_info.tinfo );
    }
    return error;
}
// ThreadPool job: test one chunk of double sample inputs, derived from
// [base, base + step) strided by 'scale', on this worker thread's sub-buffers.
// Maps the output buffers, uploads inputs, runs the kernel for each vector
// size, computes the host reference, and verifies.
// Returns CL_SUCCESS on pass, a CL error code on API failure, or -1 on a
// verification mismatch.
// Fixes vs. original: (1) mismatch diagnostics printed ((double*) gIn)[j],
// which is the wrong element for any thread_id > 0 -- this thread's input
// slice is s = gIn + thread_id * buffer_elements, so print s[j] (the float
// variant already did this); (2) corrected the unmap error message that said
// "clEnqueueMapBuffer"; (3) initialize 'error'.
static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
{
    const TestInfo *job = (const TestInfo *) data;
    size_t buffer_elements = job->subBufferSize;
    size_t buffer_size = buffer_elements * sizeof( cl_double );
    cl_uint scale = job->scale;
    cl_uint base = job_id * (cl_uint) job->step;
    ThreadInfo *tinfo = job->tinfo + thread_id;
    dptr dfunc = job->f->dfunc;
    cl_uint j, k;
    cl_int error = CL_SUCCESS;
    int ftz = job->ftz;
    const char *name = job->f->name;
    Force64BitFPUPrecision();
    // start the map of the output arrays
    cl_event e[ VECTOR_SIZE_COUNT ];
    cl_long *out[ VECTOR_SIZE_COUNT ];
    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
    {
        out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
        if( error || NULL == out[j])
        {
            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
            return error;
        }
    }
    // Get that moving
    if( (error = clFlush(tinfo->tQueue) ))
        vlog( "clFlush failed\n" );
    // Write the new values to the input array
    cl_double *p = (cl_double*) gIn + thread_id * buffer_elements;
    for( j = 0; j < buffer_elements; j++ )
        p[j] = DoubleFromUInt32( base + j * scale);
    if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
    {
        vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
        return error;
    }
    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
    {
        //Wait for the map to finish
        if( (error = clWaitForEvents(1, e + j) ))
        {
            vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
            return error;
        }
        if( (error = clReleaseEvent( e[j] ) ))
        {
            vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
            return error;
        }
        // Fill the result buffer with garbage, so that old results don't carry over
        uint32_t pattern = 0xffffdead;
        memset_pattern4(out[j], &pattern, buffer_size);
        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
        {
            vlog_error( "Error: clEnqueueUnmapMemObject failed! err: %d\n", error );
            return error;
        }
        // run the kernel
        size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; //each worker thread has its own copy of the cl_kernel
        cl_program program = job->programs[j];
        if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
        if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
        if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
        {
            vlog_error( "FAILED -- could not execute kernel\n" );
            return error;
        }
    }
    // Get that moving
    if( (error = clFlush(tinfo->tQueue) ))
        vlog( "clFlush 2 failed\n" );
    if( gSkipCorrectnessTesting )
        return CL_SUCCESS;
    //Calculate the correctly rounded reference result
    cl_long *r = (cl_long *)gOut_Ref + thread_id * buffer_elements;
    cl_double *s = (cl_double *)p;
    for( j = 0; j < buffer_elements; j++ )
        r[j] = dfunc.i_f( s[j] );
    // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
    for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
    {
        out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
        if( error || NULL == out[j] )
        {
            vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
            return error;
        }
    }
    // Wait for the last buffer
    out[j] = (cl_long*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
    if( error || NULL == out[j] )
    {
        vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
        return error;
    }
    //Verify data
    cl_long *t = (cl_long *)r;
    for( j = 0; j < buffer_elements; j++ )
    {
        cl_long *q = out[0];
        // Check the scalar kernel result, if scalar is being tested
        if( gMinVectorSizeIndex == 0 && t[j] != q[j])
        {
            // If we aren't getting the correctly rounded result
            if( ftz )
            {
                // In FTZ mode, a subnormal input may legally be flushed to +/-0
                if( IsDoubleSubnormal( s[j]) )
                {
                    cl_long correct = dfunc.i_f( +0.0f );
                    cl_long correct2 = dfunc.i_f( -0.0f );
                    if( correct == q[j] || correct2 == q[j] )
                        continue;
                }
            }
            cl_ulong err = t[j] - q[j];
            if( q[j] > t[j] )
                err = q[j] - t[j];
            vlog_error( "\nERROR: %sD: %zd ulp error at %.13la: *%zd vs. %zd\n", name, err, s[j], t[j], q[j] );
            return -1;
        }
        // Vector kernels encode 'true' as -1 (all bits set), so compare against -t[j]
        for( k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++ )
        {
            q = out[k];
            // If we aren't getting the correctly rounded result
            if( -t[j] != q[j] )
            {
                if( ftz )
                {
                    if( IsDoubleSubnormal( s[j]))
                    {
                        int64_t correct = -dfunc.i_f( +0.0f );
                        int64_t correct2 = -dfunc.i_f( -0.0f );
                        if( correct == q[j] || correct2 == q[j] )
                            continue;
                    }
                }
                cl_ulong err = -t[j] - q[j];
                if( q[j] > -t[j] )
                    err = q[j] + t[j];
                vlog_error( "\nERROR: %sD%s: %zd ulp error at %.13la: *%zd vs. %zd\n", name, sizeNames[k], err, s[j], -t[j], q[j] );
                return -1;
            }
        }
    }
    for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
    {
        if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
        {
            vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
            return error;
        }
    }
    if( (error = clFlush(tinfo->tQueue) ))
        vlog( "clFlush 3 failed\n" );
    // Progress indicator, printed once per 2**28 inputs
    if( 0 == ( base & 0x0fffffff) )
    {
        if (gVerboseBruteForce)
        {
            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->threadCount);
        } else
        {
            vlog("." );
        }
        fflush(stdout);
    }
    return CL_SUCCESS;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,232 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef REFERENCE_MATH_H
#define REFERENCE_MATH_H

#if defined( __APPLE__ )
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

// Host reference implementations for the brute force math tests.  The
// double-precision functions serve as references for single-precision device
// results; the long double functions serve as references for double-precision
// device results.  (Fix vs. original: duplicate prototypes of
// reference_divide and reference_dividel were removed.)

// -- for testing float --
double reference_sinh( double x );
double reference_sqrt( double x );
double reference_tanh( double x );
double reference_acos( double );
double reference_asin( double );
double reference_atan( double );
double reference_atan2( double, double );
double reference_ceil( double );
double reference_cosh( double );
double reference_exp( double );
double reference_fabs( double );
double reference_acospi( double );
double reference_asinpi( double );
double reference_atanpi( double );
double reference_atan2pi( double, double );
double reference_cospi( double );
double reference_divide( double, double );
double reference_fract( double, double * );
float reference_fma( float, float, float, int );
double reference_mad( double, double, double );
double reference_nextafter(double, double );
double reference_recip( double );
double reference_rootn( double, int );
double reference_rsqrt( double );
double reference_sincos( double, double * );
double reference_sinpi( double );
double reference_tanpi( double );
double reference_pow(double x, double y);
double reference_pown( double, int );
double reference_powr( double, double );
double reference_cos( double );
double reference_sin( double );
double reference_tan( double );
double reference_log( double );
double reference_log10( double );
double reference_modf( double, double *n );
double reference_fdim( double, double );
double reference_add( double, double );
double reference_subtract( double, double );
double reference_multiply( double, double );
double reference_remquo( double, double, int* );
double reference_lgamma_r( double, int* );
int reference_isequal( double, double );
int reference_isfinite( double );
int reference_isgreater( double, double );
int reference_isgreaterequal( double, double );
int reference_isinf( double );
int reference_isless( double, double );
int reference_islessequal( double, double );
int reference_islessgreater( double, double );
int reference_isnan( double );
int reference_isnormal( double );
int reference_isnotequal( double, double );
int reference_isordered( double, double );
int reference_isunordered( double, double );
int reference_signbit( float );
double reference_acosh( double x );
double reference_asinh( double x );
double reference_atanh( double x );
double reference_cbrt(double x);
float reference_copysign( float x, float y);
double reference_copysignd( double x, double y);
double reference_exp10( double );
double reference_exp2( double x );
double reference_expm1( double x );
double reference_fmax( double x, double y );
double reference_fmin( double x, double y );
double reference_hypot( double x, double y );
double reference_lgamma( double x);
int reference_ilogb( double );
double reference_log2( double x );
double reference_log1p( double x );
double reference_logb( double x );
double reference_maxmag( double x, double y );
double reference_minmag( double x, double y );
double reference_nan( cl_uint x );
double reference_reciprocal( double x );
double reference_remainder( double x, double y );
double reference_rint( double x );
double reference_round( double x );
double reference_trunc( double x );
double reference_floor( double x );
double reference_fmod( double x, double y );
double reference_frexp( double x, int *n );
double reference_ldexp( double x, int n );
double reference_assignment( double x );
int reference_not( double x );

// -- for testing fast-relaxed
double reference_relaxed_mad( double, double, double );
double reference_relaxed_divide( double x, double y );
double reference_relaxed_sin( double x );
double reference_relaxed_cos( double x );
double reference_relaxed_sincos( double x, double * y);
double reference_relaxed_tan( double x );
double reference_relaxed_exp( double x );
double reference_relaxed_exp2( double x );
double reference_relaxed_exp10( double x );
double reference_relaxed_log( double x );
double reference_relaxed_log2( double x );
double reference_relaxed_pow( double x, double y);
double reference_relaxed_reciprocal( double x );

// -- for testing double --
long double reference_sinhl( long double x );
long double reference_sqrtl( long double x );
long double reference_tanhl( long double x );
long double reference_acosl( long double );
long double reference_asinl( long double );
long double reference_atanl( long double );
long double reference_atan2l( long double, long double );
long double reference_ceill( long double );
long double reference_coshl( long double );
long double reference_expl( long double );
long double reference_fabsl( long double );
long double reference_acospil( long double );
long double reference_asinpil( long double );
long double reference_atanpil( long double );
long double reference_atan2pil( long double, long double );
long double reference_cospil( long double );
long double reference_dividel( long double, long double );
long double reference_fractl( long double, long double * );
long double reference_fmal( long double, long double, long double );
long double reference_madl( long double, long double, long double );
long double reference_nextafterl(long double, long double );
long double reference_recipl( long double );
long double reference_rootnl( long double, int );
long double reference_rsqrtl( long double );
long double reference_sincosl( long double, long double * );
long double reference_sinpil( long double );
long double reference_tanpil( long double );
long double reference_powl(long double x, long double y);
long double reference_pownl( long double, int );
long double reference_powrl( long double, long double );
long double reference_cosl( long double );
long double reference_sinl(long double );
long double reference_tanl( long double );
long double reference_logl( long double );
long double reference_log10l( long double );
long double reference_modfl( long double, long double *n );
long double reference_fdiml( long double, long double );
long double reference_addl( long double, long double );
long double reference_subtractl( long double, long double );
long double reference_multiplyl( long double, long double );
long double reference_remquol( long double, long double, int* );
long double reference_lgamma_rl( long double, int* );
int reference_isequall( long double, long double );
int reference_isfinitel( long double );
int reference_isgreaterl( long double, long double );
int reference_isgreaterequall( long double, long double );
int reference_isinfl( long double );
int reference_islessl( long double, long double );
int reference_islessequall( long double, long double );
int reference_islessgreaterl( long double, long double );
int reference_isnanl( long double );
int reference_isnormall( long double );
int reference_isnotequall( long double, long double );
int reference_isorderedl( long double, long double );
int reference_isunorderedl( long double, long double );
int reference_signbitl( long double );
long double reference_acoshl( long double x );
long double reference_asinhl( long double x );
long double reference_atanhl( long double x );
long double reference_cbrtl(long double x);
long double reference_copysignl( long double x, long double y);
long double reference_exp10l( long double );
long double reference_exp2l( long double x );
long double reference_expm1l( long double x );
long double reference_fmaxl( long double x, long double y );
long double reference_fminl( long double x, long double y );
long double reference_hypotl( long double x, long double y );
long double reference_lgammal( long double x);
int reference_ilogbl( long double );
long double reference_log2l( long double x );
long double reference_log1pl( long double x );
long double reference_logbl( long double x );
long double reference_maxmagl( long double x, long double y );
long double reference_minmagl( long double x, long double y );
long double reference_nanl( cl_ulong x );
long double reference_reciprocall( long double x );
long double reference_remainderl( long double x, long double y );
long double reference_rintl( long double x );
long double reference_roundl( long double x );
long double reference_truncl( long double x );
long double reference_floorl( long double x );
long double reference_fmodl( long double x, long double y );
long double reference_frexpl( long double x, int *n );
long double reference_ldexpl( long double x, int n );
long double reference_assignmentl( long double x );
int reference_notl( long double x );
#endif

View File

@@ -0,0 +1,110 @@
#! /usr/bin/python
# // OpenCL Conformance Tests
# //
# // Copyright: (c) 2009-2013 by Apple Inc. All Rights Reserved.
# //
import os, re, sys, subprocess, time
# A script to run the entirety of math_brute_force, running each separate job in parallel.
def DEBUG(text, level=1):
    """Emit a debug trace line when the module-level DEBUG_LEVEL is >= level."""
    if DEBUG_LEVEL >= level:
        print(text)
def write_info(text):
    """Print an informational message to stdout (no newline, Python 2 style)
    and, when ATF logging is enabled, append it to the ATF XML log."""
    print text,
    if (ATF):
        ATF_log.write("<Info>"+text+"</Info>\n")
        ATF_log.flush()
def write_error(text):
    """Print an error message to stdout (prefixed with ERROR:) and, when ATF
    logging is enabled, append it to the ATF XML log."""
    print "ERROR:" + text,
    if (ATF):
        ATF_log.write("<Error>"+text+"</Error>\n")
        ATF_log.flush()
def start_atf():
    """Enable ATF XML logging if ATF_RESULTSDIRECTORY is set in the
    environment.  Opens TestLog.xml in that directory, writes the XML
    prologue, and sets the globals ATF (enable flag) and ATF_log (file).
    On any failure, leaves ATF False so logging is silently skipped."""
    global ATF, ATF_log
    DEBUG("start_atf()")
    if (os.environ.get("ATF_RESULTSDIRECTORY") == None):
        ATF = False
        DEBUG("\tATF not defined",0)
        return
    ATF = True
    ATF_output_file_name = "TestLog.xml"
    output_path = os.environ.get("ATF_RESULTSDIRECTORY")
    try:
        # NOTE(review): assumes output_path ends with a path separator -- confirm
        ATF_log = open(output_path+ATF_output_file_name, "w")
    except IOError:
        DEBUG("Could not open ATF file " + ATF_output_file_name, 0)
        ATF = False
        return
    DEBUG("ATF Enabled")
    # Generate the XML header
    ATF_log.write("<Log>\n")
    ATF_log.write("<TestStart/>\n")
    DEBUG("Done start_atf()")
def stop_atf():
    # Close out the ATF XML log.  Bug fix: the original wrote/closed through
    # the boolean flag ATF instead of the ATF_log file object, raising
    # AttributeError whenever ATF logging was actually enabled.
    DEBUG("stop_atf()")
    if (ATF):
        ATF_log.write("<TestFinish/>\n")
        ATF_log.write("</Log>\n")
        ATF_log.close()
def get_time():
    # Human-readable local timestamp, e.g. "Monday 14:03:59".
    now = time.localtime()
    return time.strftime("%A %H:%M:%S", now)
def start_test(id):
    # Launch one instance of the bruteforce binary constrained to run only
    # test `id`, and remember its Popen handle in the running_tests table.
    # On launch failure the id is simply not registered.
    DEBUG("start_test("+str(id) + ")")
    command = test + " " + str(id) + " " + str(id)
    try:
        write_info(get_time() + " Executing " + command + "...")
        proc = subprocess.Popen(command, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
    except OSError:
        write_error("Failed to execute " + command)
        return
    running_tests[id] = proc
    DEBUG("start_test("+str(id) + ") added: " + str(running_tests[id]) + \
        ", now " + str(len(running_tests.keys())) + " tests running")
DEBUG_LEVEL = 2
test = "./bruteforce -w"
instances = 4
max_test_ID = 12
running_tests = {}
ATF_log = None
ATF = False
# Start the ATF log
start_atf()
next_test = 0
next_test_to_finish = 0
# Run until every test has been started AND every started test has been
# reaped.  Fix: use logical or/and rather than the original bitwise |/&;
# the result is the same for booleans but this short-circuits and reads
# as intended.
while ( (next_test <= max_test_ID) or (next_test_to_finish <= max_test_ID)):
    # If we want to run more tests, start them
    while ((len(running_tests) < instances) and (next_test <= max_test_ID)):
        start_test(next_test)
        next_test = next_test + 1
        time.sleep(1)
    # Check if the oldest test has finished
    if (next_test_to_finish in running_tests):
        p = running_tests[next_test_to_finish]
        if (p.poll() != None):
            write_info(get_time() + " Test " + str(next_test_to_finish) +" finished.")
            del running_tests[next_test_to_finish]
            next_test_to_finish = next_test_to_finish + 1
            # Write the results from the test out
            for line in p.stdout.readlines():
                write_info(line)
            for line in p.stderr.readlines():
                write_error(line)
    else:
        # Fix: if start_test failed to launch this id it was never added to
        # running_tests; the original indexed the dict unconditionally and
        # would crash with KeyError.  Skip the missing id instead.
        next_test_to_finish = next_test_to_finish + 1
    time.sleep(1)
# Stop the ATF log
stop_atf()

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,993 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "Utility.h"
#include <string.h>
#include "FunctionList.h"
/* Entry points for testing built-ins of the form T foo(T, T*) — one value
   returned, a second written through a pointer (e.g. fract, sincos, modf). */
int TestFunc_Float2_Float(const Func *f, MTdata);
int TestFunc_Double2_Double(const Func *f, MTdata);
/* Dispatch table consumed by the harness; C linkage so the C++ harness can
   find it by name. */
#if defined(__cplusplus)
extern "C"
#endif
const vtbl _unary_two_results = { "unary_two_results", TestFunc_Float2_Float, TestFunc_Double2_Double };
/* Build the test kernel (one program/kernel per vector width). */
static int BuildKernel( const char *name, int vectorSize, cl_kernel *k, cl_program *p );
static int BuildKernelDouble( const char *name, int vectorSize, cl_kernel *k, cl_program *p );
/* Build the float test kernel for one vector width.  The CL C source is
   assembled by concatenating the string fragments below with the function
   name under test spliced in.  vectorSize indexes sizeNames/sizeValues. */
static int BuildKernel( const char *name, int vectorSize, cl_kernel *k, cl_program *p )
{
/* Generic kernel for widths 1,2,4,8,16: out[i] = name(in[i], out2+i). */
const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* out2, __global float", sizeNames[vectorSize], "* in)\n"
"{\n"
"   int i = get_global_id(0);\n"
"   out[i] = ", name, "( in[i], out2 + i );\n"
"}\n"
};
/* Special-case kernel for float3: vload3/vstore3 with tail handling, since
   3-element vectors don't tile the buffer exactly. */
const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* out2, __global float* in)\n"
"{\n"
"   size_t i = get_global_id(0);\n"
"   if( i + 1 < get_global_size(0) )\n"
"   {\n"
"       float3 f0 = vload3( 0, in + 3 * i );\n"
"       float3 iout = NAN;\n"
"       f0 = ", name, "( f0, &iout );\n"
"       vstore3( f0, 0, out + 3*i );\n"
"       vstore3( iout, 0, out2 + 3*i );\n"
"   }\n"
"   else\n"
"   {\n"
"       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
"       float3 iout = NAN;\n"
"       float3 f0;\n"
"       switch( parity )\n"
"       {\n"
"           case 1:\n"
"               f0 = (float3)( in[3*i], NAN, NAN ); \n"
"               break;\n"
"           case 0:\n"
"               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
"               break;\n"
"       }\n"
"       f0 = ", name, "( f0, &iout );\n"
"       switch( parity )\n"
"       {\n"
"           case 0:\n"
"               out[3*i+1] = f0.y; \n"
"               out2[3*i+1] = iout.y; \n"
"               // fall through\n"
"           case 1:\n"
"               out[3*i] = f0.x; \n"
"               out2[3*i] = iout.x; \n"
"               break;\n"
"       }\n"
"   }\n"
"}\n"
};
/* Pick the size-3 variant when needed. */
const char **kern = c;
size_t kernSize = sizeof(c)/sizeof(c[0]);
if( sizeValues[vectorSize] == 3 )
{
kern = c3;
kernSize = sizeof(c3)/sizeof(c3[0]);
}
char testName[32];
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
return MakeKernel(kern, (cl_uint) kernSize, testName, k, p);
}
/* Double-precision twin of BuildKernel: identical structure, but the CL C
   source uses double types and enables the cl_khr_fp64 extension. */
static int BuildKernelDouble( const char *name, int vectorSize, cl_kernel *k, cl_program *p )
{
/* Generic kernel for widths 1,2,4,8,16. */
const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
"__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* out2, __global double", sizeNames[vectorSize], "* in)\n"
"{\n"
"   int i = get_global_id(0);\n"
"   out[i] = ", name, "( in[i], out2 + i );\n"
"}\n"
};
/* Special-case kernel for double3 with tail handling. */
const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
"__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global double* out2, __global double* in)\n"
"{\n"
"   size_t i = get_global_id(0);\n"
"   if( i + 1 < get_global_size(0) )\n"
"   {\n"
"       double3 f0 = vload3( 0, in + 3 * i );\n"
"       double3 iout = NAN;\n"
"       f0 = ", name, "( f0, &iout );\n"
"       vstore3( f0, 0, out + 3*i );\n"
"       vstore3( iout, 0, out2 + 3*i );\n"
"   }\n"
"   else\n"
"   {\n"
"       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
"       double3 iout = NAN;\n"
"       double3 f0;\n"
"       switch( parity )\n"
"       {\n"
"           case 1:\n"
"               f0 = (double3)( in[3*i], NAN, NAN ); \n"
"               break;\n"
"           case 0:\n"
"               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
"               break;\n"
"       }\n"
"       f0 = ", name, "( f0, &iout );\n"
"       switch( parity )\n"
"       {\n"
"           case 0:\n"
"               out[3*i+1] = f0.y; \n"
"               out2[3*i+1] = iout.y; \n"
"               // fall through\n"
"           case 1:\n"
"               out[3*i] = f0.x; \n"
"               out2[3*i] = iout.x; \n"
"               break;\n"
"       }\n"
"   }\n"
"}\n"
};
const char **kern = c;
size_t kernSize = sizeof(c)/sizeof(c[0]);
if( sizeValues[vectorSize] == 3 )
{
kern = c3;
kernSize = sizeof(c3)/sizeof(c3[0]);
}
char testName[32];
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
return MakeKernel(kern, (cl_uint) kernSize, testName, k, p);
}
/* Work descriptor handed to ThreadPool_Do when compiling the per-vector-size
   kernels in parallel: each job compiles vector size (offset + job_id). */
typedef struct BuildKernelInfo
{
cl_uint offset; // the first vector size to build
cl_kernel *kernels; // output array, indexed by vector size
cl_program *programs; // output array, indexed by vector size
const char *nameInCode; // name of the function under test as spelled in CL C
}BuildKernelInfo;
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
/* ThreadPool callback: compile the float kernel for one vector size. */
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
{
    BuildKernelInfo *arg = (BuildKernelInfo *) p;
    cl_uint vecIdx = arg->offset + job_id;
    return BuildKernel( arg->nameInCode, vecIdx, &arg->kernels[ vecIdx ], &arg->programs[ vecIdx ] );
}
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
/* ThreadPool callback: compile the double kernel for one vector size. */
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
{
    BuildKernelInfo *arg = (BuildKernelInfo *) p;
    cl_uint vecIdx = arg->offset + job_id;
    return BuildKernelDouble( arg->nameInCode, vecIdx, &arg->kernels[ vecIdx ], &arg->programs[ vecIdx ] );
}
/* Exhaustively test a float function of the form  float foo(float, float*)
   (e.g. fract, sincos, modf) against the reference implementation, for every
   enabled vector width.  Walks the whole 2^32 float input space (reduced in
   wimpy mode), checks both results against the ULP bound, applies the FTZ
   and subnormal retry rules of spec sections 6.5.3.2-6.5.3.4, and optionally
   measures performance.  Returns 0 on success, non-zero on failure.
   Changes vs. original: removed unused locals `l` and `testing_mode`; fixed
   the progress vlog() which passed uint64_t/size_t to %u/%zd (UB). */
int TestFunc_Float2_Float(const Func *f, MTdata d)
{
uint64_t i;
uint32_t j, k;
int error;
cl_program programs[ VECTOR_SIZE_COUNT ];
cl_kernel kernels[ VECTOR_SIZE_COUNT ];
float maxError0 = 0.0f;
float maxError1 = 0.0f;
int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
float maxErrorVal0 = 0.0f;
float maxErrorVal1 = 0.0f;
size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
uint64_t step = bufferSize / sizeof( float );
int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( float )) + 1);
cl_uchar overflow[BUFFER_SIZE / sizeof( float )];
int isFract = 0 == strcmp( "fract", f->nameInCode );
int skipNanInf = isFract && ! gInfNanSupport;
float float_ulps;
logFunctionInfo(f->name,sizeof(cl_float),gTestFastRelaxed);
if( gWimpyMode )
{
step = (1ULL<<32) * gWimpyReductionFactor / (512);
}
/* Select the applicable ULP tolerance. */
if( gIsEmbedded )
float_ulps = f->float_embedded_ulps;
else
float_ulps = f->float_ulps;
if (gTestFastRelaxed)
float_ulps = f->relaxed_error;
// Init the kernels
BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode };
if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
return error;
for( i = 0; i < (1ULL<<32); i += step )
{
//Init input array: sequential float bit patterns starting at i
uint32_t *p = (uint32_t *)gIn;
if( gWimpyMode )
{
for( j = 0; j < bufferSize / sizeof( float ); j++ )
{
p[j] = (uint32_t) i + j * scale;
/* Relaxed sincos is only specified for |x| <= pi; feed NaN outside. */
if ( gTestFastRelaxed && strcmp(f->name,"sincos") == 0 )
{
float pj = *(float *)&p[j];
if(fabs(pj) > M_PI)
p[j] = NAN;
}
}
}
else
{
for( j = 0; j < bufferSize / sizeof( float ); j++ )
{
p[j] = (uint32_t) i + j;
if ( gTestFastRelaxed && strcmp(f->name,"sincos") == 0 )
{
float pj = *(float *)&p[j];
if(fabs(pj) > M_PI)
p[j] = NAN;
}
}
}
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
{
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
return error;
}
// write garbage into output arrays so stale results can't pass
for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
{
uint32_t pattern = 0xffffdead;
memset_pattern4(gOut[j], &pattern, bufferSize);
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
{
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
goto exit;
}
memset_pattern4(gOut2[j], &pattern, bufferSize);
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL)))
{
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
goto exit;
}
}
// Run the kernels
for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
{
size_t vectorSize = sizeValues[j] * sizeof(cl_float);
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
if( ( error = clSetKernelArg(kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
{
vlog_error( "FAILED -- could not execute kernel\n" );
goto exit;
}
}
// Get that moving
if( (error = clFlush(gQueue) ))
vlog( "clFlush failed\n" );
/* For fract, match the device's FTZ/rounding state while computing the
   reference so the host math library agrees with the device. */
FPU_mode_type oldMode;
RoundingMode oldRoundMode = kRoundToNearestEven;
if( isFract )
{
//Calculate the correctly rounded reference result
memset( &oldMode, 0, sizeof( oldMode ) );
if( ftz )
ForceFTZ( &oldMode );
// Set the rounding mode to match the device
if (gIsInRTZMode)
oldRoundMode = set_round(kRoundTowardZero, kfloat);
}
//Calculate the correctly rounded reference result
float *r = (float *)gOut_Ref;
float *r2 = (float *)gOut_Ref2;
float *s = (float *)gIn;
if( skipNanInf )
{
/* Track overflow per element so out-of-range cases can be excused later. */
for( j = 0; j < bufferSize / sizeof( float ); j++ )
{
double dd;
feclearexcept(FE_OVERFLOW);
if( gTestFastRelaxed )
r[j] = (float) f->rfunc.f_fpf( s[j], &dd );
else
r[j] = (float) f->func.f_fpf( s[j], &dd );
r2[j] = (float) dd;
overflow[j] = FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
}
}
else
{
for( j = 0; j < bufferSize / sizeof( float ); j++ )
{
double dd;
if( gTestFastRelaxed )
r[j] = (float) f->rfunc.f_fpf( s[j], &dd );
else
r[j] = (float) f->func.f_fpf( s[j], &dd );
r2[j] = (float) dd;
}
}
if( isFract && ftz )
RestoreFPState( &oldMode );
// Read the data back
for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
{
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
{
vlog_error( "ReadArray failed %d\n", error );
goto exit;
}
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
{
vlog_error( "ReadArray2 failed %d\n", error );
goto exit;
}
}
if( gSkipCorrectnessTesting )
{
if (isFract && gIsInRTZMode)
(void)set_round(oldRoundMode, kfloat);
break;
}
//Verify data: fast path compares bit patterns; mismatches get the full
//ULP/FTZ/subnormal analysis below.
uint32_t *t = (uint32_t *)gOut_Ref;
uint32_t *t2 = (uint32_t *)gOut_Ref2;
for( j = 0; j < bufferSize / sizeof( float ); j++ )
{
for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
{
uint32_t *q = (uint32_t *)gOut[k];
uint32_t *q2 = (uint32_t *)gOut2[k];
// If we aren't getting the correctly rounded result
if( t[j] != q[j] || t2[j] != q2[j] )
{
double correct, correct2;
float err, err2;
float test = ((float*) q)[j];
float test2 = ((float*) q2)[j];
if( gTestFastRelaxed )
correct = f->rfunc.f_fpf( s[j], &correct2 );
else
correct = f->func.f_fpf( s[j], &correct2 );
// Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
if (gTestFastRelaxed || skipNanInf)
{
if (skipNanInf && overflow[j])
continue;
// Note: no double rounding here.  Reference functions calculate in single precision.
if( IsFloatInfinity(correct) || IsFloatNaN(correct) ||
IsFloatInfinity(correct2)|| IsFloatNaN(correct2) ||
IsFloatInfinity(s[j]) || IsFloatNaN(s[j]) )
continue;
}
typedef int (*CheckForSubnormal) (double,float); // If we are in fast relaxed math, we have a different calculation for the subnormal threshold.
CheckForSubnormal isFloatResultSubnormalPtr;
if( gTestFastRelaxed )
{
err = Abs_Error( test, correct);
err2 = Abs_Error( test2, correct2);
isFloatResultSubnormalPtr = &IsFloatResultSubnormalAbsError;
}
else
{
err = Ulp_Error( test, correct );
err2 = Ulp_Error( test2, correct2 );
isFloatResultSubnormalPtr = &IsFloatResultSubnormal;
}
int fail = ! (fabsf(err) <= float_ulps && fabsf(err2) <= float_ulps);
if( ftz )
{
// retry per section 6.5.3.2: a subnormal reference result may flush to zero
if( (*isFloatResultSubnormalPtr)(correct, float_ulps) )
{
if( (*isFloatResultSubnormalPtr) (correct2, float_ulps ))
{
fail = fail && ! ( test == 0.0f && test2 == 0.0f );
if( ! fail )
{
err = 0.0f;
err2 = 0.0f;
}
}
else
{
fail = fail && ! ( test == 0.0f && fabsf(err2) <= float_ulps);
if( ! fail )
err = 0.0f;
}
}
else if( (*isFloatResultSubnormalPtr)(correct2, float_ulps ) )
{
fail = fail && ! ( test2 == 0.0f && fabsf(err) <= float_ulps);
if( ! fail )
err2 = 0.0f;
}
// retry per section 6.5.3.3: a subnormal input may flush to +/-0
if( IsFloatSubnormal( s[j] ) )
{
double correctp, correctn;
double correct2p, correct2n;
float errp, err2p, errn, err2n;
if( skipNanInf )
feclearexcept(FE_OVERFLOW);
if ( gTestFastRelaxed )
{
correctp = f->rfunc.f_fpf( 0.0, &correct2p );
correctn = f->rfunc.f_fpf( -0.0, &correct2n );
}
else
{
correctp = f->func.f_fpf( 0.0, &correct2p );
correctn = f->func.f_fpf( -0.0, &correct2n );
}
// Per section 10 paragraph 6, accept any result if an input or output is a infinity or NaN or overflow
if( skipNanInf )
{
if( fetestexcept(FE_OVERFLOW) )
continue;
// Note: no double rounding here.  Reference functions calculate in single precision.
if( IsFloatInfinity(correctp) || IsFloatNaN(correctp) ||
IsFloatInfinity(correctn) || IsFloatNaN(correctn) ||
IsFloatInfinity(correct2p) || IsFloatNaN(correct2p) ||
IsFloatInfinity(correct2n) || IsFloatNaN(correct2n) )
continue;
}
if ( gTestFastRelaxed )
{
errp = Abs_Error( test, correctp );
err2p = Abs_Error( test, correct2p );
errn = Abs_Error( test, correctn );
err2n = Abs_Error( test, correct2n );
}
else
{
errp = Ulp_Error( test, correctp );
err2p = Ulp_Error( test, correct2p );
errn = Ulp_Error( test, correctn );
err2n = Ulp_Error( test, correct2n );
}
fail = fail && ((!(fabsf(errp) <= float_ulps)) && (!(fabsf(err2p) <= float_ulps)) &&
((!(fabsf(errn) <= float_ulps)) && (!(fabsf(err2n) <= float_ulps))) );
if( fabsf( errp ) < fabsf(err ) )
err = errp;
if( fabsf( errn ) < fabsf(err ) )
err = errn;
if( fabsf( err2p ) < fabsf(err2 ) )
err2 = err2p;
if( fabsf( err2n ) < fabsf(err2 ) )
err2 = err2n;
// retry per section 6.5.3.4: subnormal results of the flushed inputs may also flush
if( (*isFloatResultSubnormalPtr)( correctp, float_ulps ) || (*isFloatResultSubnormalPtr)( correctn, float_ulps ) )
{
if( (*isFloatResultSubnormalPtr)( correct2p, float_ulps ) || (*isFloatResultSubnormalPtr)( correct2n, float_ulps ) )
{
fail = fail && !( test == 0.0f && test2 == 0.0f);
if( ! fail )
err = err2 = 0.0f;
}
else
{
fail = fail && ! (test == 0.0f && fabsf(err2) <= float_ulps);
if( ! fail )
err = 0.0f;
}
}
else if( (*isFloatResultSubnormalPtr)( correct2p, float_ulps ) || (*isFloatResultSubnormalPtr)( correct2n, float_ulps ) )
{
fail = fail && ! (test2 == 0.0f && (fabsf(err) <= float_ulps));
if( ! fail )
err2 = 0.0f;
}
}
}
if( fabsf(err ) > maxError0 )
{
maxError0 = fabsf(err);
maxErrorVal0 = s[j];
}
if( fabsf(err2 ) > maxError1 )
{
maxError1 = fabsf(err2);
maxErrorVal1 = s[j];
}
if( fail )
{
vlog_error( "\nERROR: %s%s: {%f, %f} ulp error at %a: *{%a, %a} vs. {%a, %a}\n", f->name, sizeNames[k], err, err2, ((float*) gIn)[j], ((float*) gOut_Ref)[j], ((float*) gOut_Ref2)[j], test, test2 );
error = -1;
goto exit;
}
}
}
}
if (isFract && gIsInRTZMode)
(void)set_round(oldRoundMode, kfloat);
if( 0 == (i & 0x0fffffff) )
{
if (gVerboseBruteForce)
{
/* Fixed format string: i and step are uint64_t, bufferSize is size_t;
   the original passed them to %u/%zu/%zd (undefined behavior). */
vlog("base:%14llu step:%10llu bufferSize:%10zu \n", (unsigned long long) i, (unsigned long long) step, bufferSize);
} else
{
vlog(".");
}
fflush(stdout);
}
}
if( ! gSkipCorrectnessTesting )
{
if( gWimpyMode )
vlog( "Wimp pass" );
else
vlog( "passed" );
}
if( gMeasureTimes )
{
//Init input array with random bit patterns for timing
uint32_t *p = (uint32_t *)gIn;
for( j = 0; j < bufferSize / sizeof( float ); j++ )
p[j] = genrand_int32(d);
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
{
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
return error;
}
// Run the kernels
for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
{
size_t vectorSize = sizeValues[j] * sizeof(cl_float);
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j]) )) { LogBuildError(programs[j]); goto exit; }
if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
double sum = 0.0;
double bestTime = INFINITY;
for( k = 0; k < PERF_LOOP_COUNT; k++ )
{
uint64_t startTime = GetTime();
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
{
vlog_error( "FAILED -- could not execute kernel\n" );
goto exit;
}
// Make sure OpenCL is done
if( (error = clFinish(gQueue) ) )
{
vlog_error( "Error %d at clFinish\n", error );
goto exit;
}
uint64_t endTime = GetTime();
double time = SubtractTime( endTime, startTime );
sum += time;
if( time < bestTime )
bestTime = time;
}
if( gReportAverageTimes )
bestTime = sum / PERF_LOOP_COUNT;
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
}
}
if( ! gSkipCorrectnessTesting )
vlog( "\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0, maxErrorVal1 );
vlog( "\n" );
exit:
// Release
for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
{
clReleaseKernel(kernels[k]);
clReleaseProgram(programs[k]);
}
return error;
}
/* Double-precision twin of TestFunc_Float2_Float: test  double foo(double,
   double*)  against a long-double reference over 2^32 sampled inputs (the
   double space cannot be covered exhaustively), for every vector width.
   Returns 0 on success, non-zero on failure.
   Change vs. original: fixed the progress vlog() which passed
   uint64_t/size_t to %u/%zd (undefined behavior). */
int TestFunc_Double2_Double(const Func *f, MTdata d)
{
uint64_t i;
uint32_t j, k;
int error;
cl_program programs[ VECTOR_SIZE_COUNT ];
cl_kernel kernels[ VECTOR_SIZE_COUNT ];
float maxError0 = 0.0f;
float maxError1 = 0.0f;
int ftz = f->ftz || gForceFTZ;
double maxErrorVal0 = 0.0f;
double maxErrorVal1 = 0.0f;
size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
uint64_t step = bufferSize / sizeof( cl_double );
int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( cl_double )) + 1);
logFunctionInfo(f->name,sizeof(cl_double),gTestFastRelaxed);
if( gWimpyMode )
{
step = (1ULL<<32) * gWimpyReductionFactor / (512);
}
/* The long-double reference needs the x87 FPU at 64-bit precision. */
Force64BitFPUPrecision();
// Init the kernels
BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode };
if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
gMaxVectorSizeIndex - gMinVectorSizeIndex,
&build_info ) ))
{
return error;
}
for( i = 0; i < (1ULL<<32); i += step )
{
//Init input array: doubles synthesized from 32-bit patterns
double *p = (double *)gIn;
if( gWimpyMode )
{
for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
p[j] = DoubleFromUInt32((uint32_t) i + j * scale);
}
else
{
for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
p[j] = DoubleFromUInt32((uint32_t) i + j);
}
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
{
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
return error;
}
// write garbage into output arrays so stale results can't pass
for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
{
uint32_t pattern = 0xffffdead;
memset_pattern4(gOut[j], &pattern, bufferSize);
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
{
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
goto exit;
}
memset_pattern4(gOut2[j], &pattern, bufferSize);
if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL)))
{
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
goto exit;
}
}
// Run the kernels
for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
{
size_t vectorSize = sizeValues[j] * sizeof(cl_double);
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
if( ( error = clSetKernelArg(kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
{
vlog_error( "FAILED -- could not execute kernel\n" );
goto exit;
}
}
// Get that moving
if( (error = clFlush(gQueue) ))
vlog( "clFlush failed\n" );
//Calculate the correctly rounded reference result in long double
double *r = (double *)gOut_Ref;
double *r2 = (double *)gOut_Ref2;
double *s = (double *)gIn;
for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
{
long double dd;
r[j] = (double) f->dfunc.f_fpf( s[j], &dd );
r2[j] = (double) dd;
}
// Read the data back
for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
{
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
{
vlog_error( "ReadArray failed %d\n", error );
goto exit;
}
if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
{
vlog_error( "ReadArray2 failed %d\n", error );
goto exit;
}
}
if( gSkipCorrectnessTesting )
break;
//Verify data: fast path compares bit patterns; mismatches get the full
//ULP/FTZ/subnormal analysis below.
uint64_t *t = (uint64_t *)gOut_Ref;
uint64_t *t2 = (uint64_t *)gOut_Ref2;
for( j = 0; j < bufferSize / sizeof( double ); j++ )
{
for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
{
uint64_t *q = (uint64_t *)(gOut[k]);
uint64_t *q2 = (uint64_t *)(gOut2[k]);
// If we aren't getting the correctly rounded result
if( t[j] != q[j] || t2[j] != q2[j] )
{
double test = ((double*) q)[j];
double test2 = ((double*) q2)[j];
long double correct2;
long double correct = f->dfunc.f_fpf( s[j], &correct2 );
float err = Ulp_Error_Double( test, correct );
float err2 = Ulp_Error_Double( test2, correct2 );
int fail = ! (fabsf(err) <= f->double_ulps && fabsf(err2) <= f->double_ulps);
if( ftz )
{
// retry per section 6.5.3.2: a subnormal reference result may flush to zero
if( IsDoubleResultSubnormal(correct, f->double_ulps ) )
{
if( IsDoubleResultSubnormal( correct2, f->double_ulps ) )
{
fail = fail && ! ( test == 0.0f && test2 == 0.0f );
if( ! fail )
{
err = 0.0f;
err2 = 0.0f;
}
}
else
{
fail = fail && ! ( test == 0.0f && fabsf(err2) <= f->double_ulps);
if( ! fail )
err = 0.0f;
}
}
else if( IsDoubleResultSubnormal( correct2, f->double_ulps ) )
{
fail = fail && ! ( test2 == 0.0f && fabsf(err) <= f->double_ulps);
if( ! fail )
err2 = 0.0f;
}
// retry per section 6.5.3.3: a subnormal input may flush to +/-0
if( IsDoubleSubnormal( s[j] ) )
{
long double correct2p, correct2n;
long double correctp = f->dfunc.f_fpf( 0.0, &correct2p );
long double correctn = f->dfunc.f_fpf( -0.0, &correct2n );
float errp = Ulp_Error_Double( test, correctp );
float err2p = Ulp_Error_Double( test, correct2p );
float errn = Ulp_Error_Double( test, correctn );
float err2n = Ulp_Error_Double( test, correct2n );
fail = fail && ((!(fabsf(errp) <= f->double_ulps)) && (!(fabsf(err2p) <= f->double_ulps)) &&
((!(fabsf(errn) <= f->double_ulps)) && (!(fabsf(err2n) <= f->double_ulps))) );
if( fabsf( errp ) < fabsf(err ) )
err = errp;
if( fabsf( errn ) < fabsf(err ) )
err = errn;
if( fabsf( err2p ) < fabsf(err2 ) )
err2 = err2p;
if( fabsf( err2n ) < fabsf(err2 ) )
err2 = err2n;
// retry per section 6.5.3.4: subnormal results of the flushed inputs may also flush
if( IsDoubleResultSubnormal( correctp, f->double_ulps ) || IsDoubleResultSubnormal( correctn, f->double_ulps ) )
{
if( IsDoubleResultSubnormal( correct2p, f->double_ulps ) || IsDoubleResultSubnormal( correct2n, f->double_ulps ) )
{
fail = fail && !( test == 0.0f && test2 == 0.0f);
if( ! fail )
err = err2 = 0.0f;
}
else
{
fail = fail && ! (test == 0.0f && fabsf(err2) <= f->double_ulps);
if( ! fail )
err = 0.0f;
}
}
else if( IsDoubleResultSubnormal( correct2p, f->double_ulps ) || IsDoubleResultSubnormal( correct2n, f->double_ulps ) )
{
fail = fail && ! (test2 == 0.0f && (fabsf(err) <= f->double_ulps));
if( ! fail )
err2 = 0.0f;
}
}
}
if( fabsf(err ) > maxError0 )
{
maxError0 = fabsf(err);
maxErrorVal0 = s[j];
}
if( fabsf(err2 ) > maxError1 )
{
maxError1 = fabsf(err2);
maxErrorVal1 = s[j];
}
if( fail )
{
vlog_error( "\nERROR: %sD%s: {%f, %f} ulp error at %.13la: *{%.13la, %.13la} vs. {%.13la, %.13la}\n", f->name, sizeNames[k], err, err2, ((double*) gIn)[j], ((double*) gOut_Ref)[j], ((double*) gOut_Ref2)[j], test, test2 );
error = -1;
goto exit;
}
}
}
}
if( 0 == (i & 0x0fffffff) )
{
if (gVerboseBruteForce)
{
/* Fixed format string: i and step are uint64_t, bufferSize is size_t;
   the original passed them to %u/%zu/%zd (undefined behavior). */
vlog("base:%14llu step:%10llu bufferSize:%10zu \n", (unsigned long long) i, (unsigned long long) step, bufferSize);
} else
{
vlog("." );
}
fflush(stdout);
}
}
if( ! gSkipCorrectnessTesting )
{
if( gWimpyMode )
vlog( "Wimp pass" );
else
vlog( "passed" );
}
if( gMeasureTimes )
{
//Init input array with random values for timing
double *p = (double*) gIn;
for( j = 0; j < bufferSize / sizeof( double ); j++ )
p[j] = DoubleFromUInt32(genrand_int32(d) );
if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
{
vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
return error;
}
// Run the kernels
for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
{
size_t vectorSize = sizeValues[j] * sizeof(cl_double);
size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
if( ( error = clSetKernelArg(kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j]) )) { LogBuildError(programs[j]); goto exit; }
if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
double sum = 0.0;
double bestTime = INFINITY;
for( k = 0; k < PERF_LOOP_COUNT; k++ )
{
uint64_t startTime = GetTime();
if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
{
vlog_error( "FAILED -- could not execute kernel\n" );
goto exit;
}
// Make sure OpenCL is done
if( (error = clFinish(gQueue) ) )
{
vlog_error( "Error %d at clFinish\n", error );
goto exit;
}
uint64_t endTime = GetTime();
double time = SubtractTime( endTime, startTime );
sum += time;
if( time < bestTime )
bestTime = time;
}
if( gReportAverageTimes )
bestTime = sum / PERF_LOOP_COUNT;
double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
}
for( ; j < gMaxVectorSizeIndex; j++ )
vlog( "\t -- " );
}
if( ! gSkipCorrectnessTesting )
vlog( "\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0, maxErrorVal1 );
vlog( "\n" );
exit:
// Release
for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
{
clReleaseKernel(kernels[k]);
clReleaseProgram(programs[k]);
}
return error;
}

View File

@@ -0,0 +1,802 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "Utility.h"
#include <limits.h>
#include <string.h>
#include "FunctionList.h"
/* Entry points for testing built-ins of the form T foo(T, int*) — a float/
   double result plus an integer written through a pointer (e.g. frexp). */
int TestFunc_FloatI_Float(const Func *f, MTdata);
int TestFunc_DoubleI_Double(const Func *f, MTdata);
/* Dispatch table consumed by the harness; C linkage so the C++ harness can
   find it by name. */
#if defined(__cplusplus)
extern "C"
#endif
const vtbl _unary_two_results_i = { "unary_two_results_i", TestFunc_FloatI_Float, TestFunc_DoubleI_Double };
/* Build the test kernel (one program/kernel per vector width). */
static int BuildKernel( const char *name, int vectorSize, cl_kernel *k, cl_program *p );
static int BuildKernelDouble( const char *name, int vectorSize, cl_kernel *k, cl_program *p );
/* Build the float test kernel for one vector width; the second output is an
   int buffer (e.g. frexp's exponent).  vectorSize indexes sizeNames/sizeValues. */
static int BuildKernel( const char *name, int vectorSize, cl_kernel *k, cl_program *p )
{
/* Generic kernel for widths 1,2,4,8,16: out[i] = name(in[i], out2+i). */
const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global int", sizeNames[vectorSize], "* out2, __global float", sizeNames[vectorSize], "* in)\n"
"{\n"
"   int i = get_global_id(0);\n"
"   out[i] = ", name, "( in[i], out2 + i );\n"
"}\n"
};
/* Special-case kernel for float3/int3: vload3/vstore3 with tail handling,
   since 3-element vectors don't tile the buffer exactly. */
const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global int* out2, __global float* in)\n"
"{\n"
"   size_t i = get_global_id(0);\n"
"   if( i + 1 < get_global_size(0) )\n"
"   {\n"
"       float3 f0 = vload3( 0, in + 3 * i );\n"
"       int3 iout = INT_MIN;\n"
"       f0 = ", name, "( f0, &iout );\n"
"       vstore3( f0, 0, out + 3*i );\n"
"       vstore3( iout, 0, out2 + 3*i );\n"
"   }\n"
"   else\n"
"   {\n"
"       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
"       int3 iout = INT_MIN;\n"
"       float3 f0;\n"
"       switch( parity )\n"
"       {\n"
"           case 1:\n"
"               f0 = (float3)( in[3*i], NAN, NAN ); \n"
"               break;\n"
"           case 0:\n"
"               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
"               break;\n"
"       }\n"
"       f0 = ", name, "( f0, &iout );\n"
"       switch( parity )\n"
"       {\n"
"           case 0:\n"
"               out[3*i+1] = f0.y; \n"
"               out2[3*i+1] = iout.y; \n"
"               // fall through\n"
"           case 1:\n"
"               out[3*i] = f0.x; \n"
"               out2[3*i] = iout.x; \n"
"               break;\n"
"       }\n"
"   }\n"
"}\n"
};
/* Pick the size-3 variant when needed. */
const char **kern = c;
size_t kernSize = sizeof(c)/sizeof(c[0]);
if( sizeValues[vectorSize] == 3 )
{
kern = c3;
kernSize = sizeof(c3)/sizeof(c3[0]);
}
char testName[32];
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
return MakeKernel(kern, (cl_uint) kernSize, testName, k, p);
}
/* Double-precision twin of BuildKernel: double inputs/outputs with an int
   secondary output, cl_khr_fp64 enabled in the CL C source. */
static int BuildKernelDouble( const char *name, int vectorSize, cl_kernel *k, cl_program *p )
{
/* Generic kernel for widths 1,2,4,8,16. */
const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
"__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global int", sizeNames[vectorSize], "* out2, __global double", sizeNames[vectorSize], "* in)\n"
"{\n"
"   int i = get_global_id(0);\n"
"   out[i] = ", name, "( in[i], out2 + i );\n"
"}\n"
};
/* Special-case kernel for double3/int3 with tail handling. */
const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
"__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global int* out2, __global double* in)\n"
"{\n"
"   size_t i = get_global_id(0);\n"
"   if( i + 1 < get_global_size(0) )\n"
"   {\n"
"       double3 f0 = vload3( 0, in + 3 * i );\n"
"       int3 iout = INT_MIN;\n"
"       f0 = ", name, "( f0, &iout );\n"
"       vstore3( f0, 0, out + 3*i );\n"
"       vstore3( iout, 0, out2 + 3*i );\n"
"   }\n"
"   else\n"
"   {\n"
"       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
"       int3 iout = INT_MIN;\n"
"       double3 f0;\n"
"       switch( parity )\n"
"       {\n"
"           case 1:\n"
"               f0 = (double3)( in[3*i], NAN, NAN ); \n"
"               break;\n"
"           case 0:\n"
"               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
"               break;\n"
"       }\n"
"       f0 = ", name, "( f0, &iout );\n"
"       switch( parity )\n"
"       {\n"
"           case 0:\n"
"               out[3*i+1] = f0.y; \n"
"               out2[3*i+1] = iout.y; \n"
"               // fall through\n"
"           case 1:\n"
"               out[3*i] = f0.x; \n"
"               out2[3*i] = iout.x; \n"
"               break;\n"
"       }\n"
"   }\n"
"}\n"
};
const char **kern = c;
size_t kernSize = sizeof(c)/sizeof(c[0]);
if( sizeValues[vectorSize] == 3 )
{
kern = c3;
kernSize = sizeof(c3)/sizeof(c3[0]);
}
char testName[32];
snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
return MakeKernel(kern, (cl_uint) kernSize, testName, k, p);
}
// Work descriptor handed to the ThreadPool_Do() build callbacks; the pool
// builds one program/kernel per vector size in parallel. Field order matters:
// call sites initialize this struct positionally.
typedef struct BuildKernelInfo
{
    cl_uint offset; // the first vector size to build
    cl_kernel *kernels;       // output: kernels[vector size index]
    cl_program *programs;     // output: programs[vector size index]
    const char *nameInCode;   // function name as spelled in OpenCL C source
}BuildKernelInfo;
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
// Thread-pool callback: build the single-precision kernel for one vector
// size. job_id selects the width relative to the first width requested.
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
{
    BuildKernelInfo *arg = (BuildKernelInfo *) p;
    cl_uint vectorSize = arg->offset + job_id;
    return BuildKernel( arg->nameInCode, vectorSize,
                        arg->kernels + vectorSize, arg->programs + vectorSize );
}
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
// Thread-pool callback: build the double-precision kernel for one vector
// size. job_id selects the width relative to the first width requested.
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
{
    BuildKernelInfo *arg = (BuildKernelInfo *) p;
    cl_uint vectorSize = arg->offset + job_id;
    return BuildKernelDouble( arg->nameInCode, vectorSize,
                              arg->kernels + vectorSize, arg->programs + vectorSize );
}
cl_ulong abs_cl_long( cl_long i );
// Return |i| as an unsigned 64-bit value.
// The negation is done in unsigned arithmetic: the previous mask trick
// ((i ^ mask) - mask) evaluated in signed cl_long, which overflows --
// undefined behavior -- for the most negative value. Unsigned subtraction
// wraps by definition, so CL_LONG_MIN correctly yields 0x8000000000000000.
cl_ulong abs_cl_long( cl_long i )
{
    if( i < 0 )
        return (cl_ulong) 0 - (cl_ulong) i;
    return (cl_ulong) i;
}
// Test an OpenCL function of the form  float f( float, int* )  (e.g.
// frexp, lgamma_r) against the host reference f->func.f_fpI, sweeping the
// full 2**32 float input space (a reduced stride in wimpy mode), for each
// configured vector width. Returns 0 on success, nonzero on failure.
int TestFunc_FloatI_Float(const Func *f, MTdata d)
{
    uint64_t i;
    uint32_t j, k;
    int error;
    cl_program programs[ VECTOR_SIZE_COUNT ];
    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
    float maxError = 0.0f;              // worst ulp error seen in the float result
    int64_t maxError2 = 0;              // worst absolute error seen in the int result
    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
    float maxErrorVal = 0.0f;           // input that produced maxError
    float maxErrorVal2 = 0.0f;          // input that produced maxError2
    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
    float float_ulps;
    uint64_t step = bufferSize / sizeof( float );
    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( float )) + 1);
    cl_ulong maxiError;

    logFunctionInfo(f->name,sizeof(cl_float),gTestFastRelaxed);

    if( gWimpyMode )
    {
        step = (1ULL<<32) * gWimpyReductionFactor / (512);
    }

    if( gIsEmbedded )
        float_ulps = f->float_embedded_ulps;
    else
        float_ulps = f->float_ulps;

    // The integer result must match exactly unless the float tolerance is unbounded.
    maxiError = float_ulps == INFINITY ? CL_ULONG_MAX : 0;

    // Init the kernels -- one program/kernel per vector width, built in parallel.
    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode };
    if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
        return error;

    for( i = 0; i < (1ULL<<32); i += step )
    {
        // Init input array: consecutive bit patterns, or strided ones in wimpy mode.
        uint32_t *p = (uint32_t *)gIn;
        if( gWimpyMode )
        {
            for( j = 0; j < bufferSize / sizeof( float ); j++ )
                p[j] = (uint32_t) i + j * scale;
        }
        else
        {
            for( j = 0; j < bufferSize / sizeof( float ); j++ )
                p[j] = (uint32_t) i + j;
        }
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            return error;
        }

        // Write garbage into output arrays so stale data can't masquerade as results.
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            uint32_t pattern = 0xffffdead;
            memset_pattern4(gOut[j], &pattern, bufferSize);
            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
            {
                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
                goto exit;
            }
            memset_pattern4(gOut2[j], &pattern, bufferSize);
            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL) ))
            {
                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
                goto exit;
            }
        }

        // Run the kernels.
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;     // round up
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
            {
                vlog_error( "FAILED -- could not execute kernel\n" );
                goto exit;
            }
        }

        // Get that moving.
        if( (error = clFlush(gQueue) ))
            vlog( "clFlush failed\n" );

        // Calculate the correctly rounded reference result on the host.
        float *r = (float *)gOut_Ref;
        int *r2 = (int *)gOut_Ref2;
        float *s = (float *)gIn;
        for( j = 0; j < bufferSize / sizeof( float ); j++ )
            r[j] = (float) f->func.f_fpI( s[j], r2+j );

        // Read the data back (blocking, so this also waits for the kernels).
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
            {
                vlog_error( "ReadArray failed %d\n", error );
                goto exit;
            }
            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
            {
                vlog_error( "ReadArray2 failed %d\n", error );
                goto exit;
            }
        }

        if( gSkipCorrectnessTesting )
            break;

        // Verify data.
        uint32_t *t = (uint32_t *)gOut_Ref;
        int32_t *t2 = (int32_t *)gOut_Ref2;
        for( j = 0; j < bufferSize / sizeof( float ); j++ )
        {
            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
            {
                uint32_t *q = (uint32_t *)(gOut[k]);
                int32_t *q2 = (int32_t *)(gOut2[k]);

                // If we aren't getting the bit-exact correctly rounded result
                if( t[j] != q[j] || t2[j] != q2[j] )
                {
                    float test = ((float*) q)[j];
                    int correct2 = INT_MIN;
                    double correct = f->func.f_fpI( s[j], &correct2 );
                    float err = Ulp_Error( test, correct );
                    cl_long iErr = (int64_t) q2[j] - (int64_t) correct2;
                    int fail = ! (fabsf(err) <= float_ulps && abs_cl_long( iErr ) <= maxiError );
                    if( ftz )
                    {
                        // retry per section 6.5.3.2: subnormal results may flush to zero
                        if( IsFloatResultSubnormal(correct, float_ulps ) )
                        {
                            fail = fail && ! ( test == 0.0f && iErr == 0 );
                            if( ! fail )
                                err = 0.0f;
                        }

                        // retry per section 6.5.3.3: subnormal inputs may be flushed,
                        // so also accept results computed from +0.0 or -0.0
                        if( IsFloatSubnormal( s[j] ) )
                        {
                            int correct5, correct6;
                            double correct3 = f->func.f_fpI( 0.0, &correct5 );
                            double correct4 = f->func.f_fpI( -0.0, &correct6 );
                            float err2 = Ulp_Error( test, correct3 );
                            float err3 = Ulp_Error( test, correct4 );
                            cl_long iErr2 = (long long) q2[j] - (long long) correct5;
                            cl_long iErr3 = (long long) q2[j] - (long long) correct6;

                            // Did +0 work?
                            if( fabsf(err2) <= float_ulps && abs_cl_long( iErr2 ) <= maxiError )
                            {
                                err = err2;
                                iErr = iErr2;
                                fail = 0;
                            }
                            // Did -0 work?
                            else if(fabsf(err3) <= float_ulps && abs_cl_long( iErr3 ) <= maxiError)
                            {
                                err = err3;
                                iErr = iErr3;
                                fail = 0;
                            }

                            // retry per section 6.5.3.4
                            // BUGFIX: examine the float results of the +/-0
                            // substitutions (correct3/correct4); the old code passed
                            // correct2, which is the *int* result, not a float value.
                            if( fail && (IsFloatResultSubnormal(correct3, float_ulps ) || IsFloatResultSubnormal(correct4, float_ulps )) )
                            {
                                fail = fail && ! ( test == 0.0f && (abs_cl_long( iErr2 ) <= maxiError || abs_cl_long( iErr3 ) <= maxiError) );
                                if( ! fail )
                                {
                                    err = 0.0f;
                                    iErr = 0;
                                }
                            }
                        }
                    }
                    if( fabsf(err ) > maxError )
                    {
                        maxError = fabsf(err);
                        maxErrorVal = s[j];
                    }
                    if( llabs(iErr) > maxError2 )
                    {
                        maxError2 = llabs(iErr );
                        maxErrorVal2 = s[j];
                    }

                    if( fail )
                    {
                        vlog_error( "\nERROR: %s%s: {%f, %d} ulp error at %a: *{%a, %d} vs. {%a, %d}\n", f->name, sizeNames[k], err, (int) iErr, ((float*) gIn)[j], ((float*) gOut_Ref)[j], ((int*) gOut_Ref2)[j], test, q2[j] );
                        error = -1;
                        goto exit;
                    }
                }
            }
        }

        // Progress indicator.
        if( 0 == (i & 0x0fffffff) )
        {
            if (gVerboseBruteForce)
            {
                // BUGFIX: i and step are uint64_t -- %u/%zu mismatched the
                // argument types (undefined behavior); use %llu, and %zu for size_t.
                vlog("base:%14llu step:%10llu bufferSize:%10zu \n", (unsigned long long) i, (unsigned long long) step, bufferSize);
            } else
            {
                vlog("." );
            }
            fflush(stdout);
        }
    }

    if( ! gSkipCorrectnessTesting )
    {
        if( gWimpyMode )
            vlog( "Wimp pass" );
        else
            vlog( "passed" );
    }

    // Optional performance measurement over random inputs.
    if( gMeasureTimes )
    {
        // Init input array
        uint32_t *p = (uint32_t *)gIn;
        for( j = 0; j < bufferSize / sizeof( float ); j++ )
            p[j] = genrand_int32(d);
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            return error;
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            double sum = 0.0;
            double bestTime = INFINITY;
            for( k = 0; k < PERF_LOOP_COUNT; k++ )
            {
                uint64_t startTime = GetTime();
                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
                {
                    vlog_error( "FAILED -- could not execute kernel\n" );
                    goto exit;
                }

                // Make sure OpenCL is done
                if( (error = clFinish(gQueue) ) )
                {
                    vlog_error( "Error %d at clFinish\n", error );
                    goto exit;
                }

                uint64_t endTime = GetTime();
                double time = SubtractTime( endTime, startTime );
                sum += time;
                if( time < bestTime )
                    bestTime = time;
            }
            if( gReportAverageTimes )
                bestTime = sum / PERF_LOOP_COUNT;
            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
        }
    }

    if( ! gSkipCorrectnessTesting )
        vlog( "\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal );
    vlog( "\n" );

exit:
    // Release
    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
    {
        clReleaseKernel(kernels[k]);
        clReleaseProgram(programs[k]);
    }

    return error;
}
// Test an OpenCL function of the form  double f( double, int* )  against
// the long double host reference f->dfunc.f_fpI. Inputs are doubles
// synthesized from a 32-bit pattern (DoubleFromUInt32) so the sweep still
// covers 2**32 cases. Returns 0 on success, nonzero on failure.
int TestFunc_DoubleI_Double(const Func *f, MTdata d)
{
    uint64_t i;
    uint32_t j, k;
    int error;
    cl_program programs[ VECTOR_SIZE_COUNT ];
    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
    float maxError = 0.0f;              // worst ulp error seen in the double result
    int64_t maxError2 = 0;              // worst absolute error seen in the int result
    int ftz = f->ftz || gForceFTZ;
    double maxErrorVal = 0.0f;          // input that produced maxError
    double maxErrorVal2 = 0.0f;         // input that produced maxError2
    // The integer result must match exactly unless the tolerance is unbounded.
    cl_ulong maxiError = f->double_ulps == INFINITY ? CL_ULONG_MAX : 0;
    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
    uint64_t step = bufferSize / sizeof( double );
    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( double )) + 1);

    logFunctionInfo(f->name,sizeof(cl_double),gTestFastRelaxed);
    if( gWimpyMode )
    {
        step = (1ULL<<32) * gWimpyReductionFactor / (512);
    }

    // The long double reference math requires extended x87 precision.
    Force64BitFPUPrecision();

    // Init the kernels -- one program/kernel per vector width, built in parallel.
    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode };
    if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                &build_info ) ))
    {
        return error;
    }

    for( i = 0; i < (1ULL<<32); i += step )
    {
        // Init input array: doubles derived from consecutive (or strided) 32-bit patterns.
        double *p = (double *)gIn;
        if( gWimpyMode )
        {
            for( j = 0; j < bufferSize / sizeof( double ); j++ )
                p[j] = DoubleFromUInt32((uint32_t) i + j * scale);
        }
        else
        {
            for( j = 0; j < bufferSize / sizeof( double ); j++ )
                p[j] = DoubleFromUInt32((uint32_t) i + j);
        }
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            return error;
        }

        // Write garbage into output arrays so stale data can't masquerade as results.
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            uint32_t pattern = 0xffffdead;
            memset_pattern4(gOut[j], &pattern, bufferSize);
            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL) ))
            {
                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
                goto exit;
            }
            memset_pattern4(gOut2[j], &pattern, bufferSize);
            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL) ))
            {
                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", error, j );
                goto exit;
            }
        }

        // Run the kernels.
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;     // round up
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
            {
                vlog_error( "FAILED -- could not execute kernel\n" );
                goto exit;
            }
        }

        // Get that moving.
        if( (error = clFlush(gQueue) ))
            vlog( "clFlush failed\n" );

        // Calculate the correctly rounded reference result on the host.
        double *r = (double *)gOut_Ref;
        int *r2 = (int *)gOut_Ref2;
        double *s = (double *)gIn;
        for( j = 0; j < bufferSize / sizeof( double ); j++ )
            r[j] = (double) f->dfunc.f_fpI( s[j], r2+j );

        // Read the data back (blocking, so this also waits for the kernels).
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)) )
            {
                vlog_error( "ReadArray failed %d\n", error );
                goto exit;
            }
            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, bufferSize, gOut2[j], 0, NULL, NULL)) )
            {
                vlog_error( "ReadArray2 failed %d\n", error );
                goto exit;
            }
        }

        if( gSkipCorrectnessTesting )
            break;

        // Verify data.
        uint64_t *t = (uint64_t *)gOut_Ref;
        int32_t *t2 = (int32_t *)gOut_Ref2;
        for( j = 0; j < bufferSize / sizeof( double ); j++ )
        {
            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
            {
                uint64_t *q = (uint64_t *)(gOut[k]);
                int32_t *q2 = (int32_t *)(gOut2[k]);

                // If we aren't getting the bit-exact correctly rounded result
                if( t[j] != q[j] || t2[j] != q2[j] )
                {
                    double test = ((double*) q)[j];
                    int correct2 = INT_MIN;
                    long double correct = f->dfunc.f_fpI( s[j], &correct2 );
                    float err = Ulp_Error_Double( test, correct );
                    cl_long iErr = (long long) q2[j] - (long long) correct2;
                    int fail = ! (fabsf(err) <= f->double_ulps && abs_cl_long( iErr ) <= maxiError );
                    if( ftz )
                    {
                        // retry per section 6.5.3.2: subnormal results may flush to zero
                        if( IsDoubleResultSubnormal(correct, f->double_ulps ) )
                        {
                            fail = fail && ! ( test == 0.0f && iErr == 0 );
                            if( ! fail )
                                err = 0.0f;
                        }

                        // retry per section 6.5.3.3: subnormal inputs may be flushed,
                        // so also accept results computed from +0.0 or -0.0
                        if( IsDoubleSubnormal( s[j] ) )
                        {
                            int correct5, correct6;
                            long double correct3 = f->dfunc.f_fpI( 0.0, &correct5 );
                            long double correct4 = f->dfunc.f_fpI( -0.0, &correct6 );
                            float err2 = Ulp_Error_Double( test, correct3 );
                            float err3 = Ulp_Error_Double( test, correct4 );
                            cl_long iErr2 = (long long) q2[j] - (long long) correct5;
                            cl_long iErr3 = (long long) q2[j] - (long long) correct6;

                            // Did +0 work?
                            if( fabsf(err2) <= f->double_ulps && abs_cl_long( iErr2 ) <= maxiError )
                            {
                                err = err2;
                                iErr = iErr2;
                                fail = 0;
                            }
                            // Did -0 work?
                            else if(fabsf(err3) <= f->double_ulps && abs_cl_long( iErr3 ) <= maxiError)
                            {
                                err = err3;
                                iErr = iErr3;
                                fail = 0;
                            }

                            // retry per section 6.5.3.4
                            // BUGFIX: examine the double results of the +/-0
                            // substitutions (correct3/correct4); the old code passed
                            // correct2, which is the *int* result, not a double value.
                            if( fail && (IsDoubleResultSubnormal( correct3, f->double_ulps ) || IsDoubleResultSubnormal( correct4, f->double_ulps )) )
                            {
                                fail = fail && ! ( test == 0.0f && (abs_cl_long( iErr2 ) <= maxiError || abs_cl_long( iErr3 ) <= maxiError) );
                                if( ! fail )
                                {
                                    err = 0.0f;
                                    iErr = 0;
                                }
                            }
                        }
                    }
                    if( fabsf(err ) > maxError )
                    {
                        maxError = fabsf(err);
                        maxErrorVal = s[j];
                    }
                    if( llabs(iErr) > maxError2 )
                    {
                        maxError2 = llabs(iErr );
                        maxErrorVal2 = s[j];
                    }

                    if( fail )
                    {
                        vlog_error( "\nERROR: %sD%s: {%f, %d} ulp error at %.13la: *{%.13la, %d} vs. {%.13la, %d}\n", f->name, sizeNames[k], err, (int) iErr, ((double*) gIn)[j], ((double*) gOut_Ref)[j], ((int*) gOut_Ref2)[j], test, q2[j] );
                        error = -1;
                        goto exit;
                    }
                }
            }
        }

        // Progress indicator.
        if( 0 == (i & 0x0fffffff) )
        {
            if (gVerboseBruteForce)
            {
                // BUGFIX: i and step are uint64_t -- %u/%zu mismatched the
                // argument types (undefined behavior); use %llu, and %zu for size_t.
                vlog("base:%14llu step:%10llu bufferSize:%10zu \n", (unsigned long long) i, (unsigned long long) step, bufferSize);
            } else
            {
                vlog("." );
            }
            fflush(stdout);
        }
    }

    if( ! gSkipCorrectnessTesting )
    {
        if( gWimpyMode )
            vlog( "Wimp pass" );
        else
            vlog( "passed" );
    }

    // Optional performance measurement over random inputs.
    if( gMeasureTimes )
    {
        // Init input array
        double *p = (double *)gIn;
        for( j = 0; j < bufferSize / sizeof( double ); j++ )
            p[j] = DoubleFromUInt32(genrand_int32(d));
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            return error;
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gOutBuffer2[j] ), &gOutBuffer2[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 2, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            double sum = 0.0;
            double bestTime = INFINITY;
            for( k = 0; k < PERF_LOOP_COUNT; k++ )
            {
                uint64_t startTime = GetTime();
                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
                {
                    vlog_error( "FAILED -- could not execute kernel\n" );
                    goto exit;
                }

                // Make sure OpenCL is done
                if( (error = clFinish(gQueue) ) )
                {
                    vlog_error( "Error %d at clFinish\n", error );
                    goto exit;
                }

                uint64_t endTime = GetTime();
                double time = SubtractTime( endTime, startTime );
                sum += time;
                if( time < bestTime )
                    bestTime = time;
            }
            if( gReportAverageTimes )
                bestTime = sum / PERF_LOOP_COUNT;
            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sd%s", f->name, sizeNames[j] );
        }
        // No-op today (j already equals gMaxVectorSizeIndex); kept for layout
        // parity with test flavors whose perf loop can end early.
        for( ; j < gMaxVectorSizeIndex; j++ )
            vlog( "\t -- " );
    }

    if( ! gSkipCorrectnessTesting )
        vlog( "\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal );
    vlog( "\n" );

exit:
    // Release
    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
    {
        clReleaseKernel(kernels[k]);
        clReleaseProgram(programs[k]);
    }

    return error;
}

View File

@@ -0,0 +1,694 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "Utility.h"
#include <string.h>
#include "FunctionList.h"
// Entry points implemented below in this file.
int TestFunc_Float_UInt(const Func *f, MTdata);
int TestFunc_Double_ULong(const Func *f, MTdata);

// Dispatch table for functions whose argument is an unsigned integer and
// whose result is floating point (float(uint) / double(ulong), e.g. nan()).
#if defined( __cplusplus)
extern "C"
#endif
const vtbl _unary_u = { "unary_u", TestFunc_Float_UInt, TestFunc_Double_ULong };
// Kernel builders for functions of the form  float f( uint ) / double f( ulong ).
static int BuildKernel( const char *name, int vectorSize, cl_kernel *k, cl_program *p );
static int BuildKernelDouble( const char *name, int vectorSize, cl_kernel *k, cl_program *p );

// Build the single-precision test kernel (float out, uint in) for one
// vector width. vectorSize indexes sizeNames[]/sizeValues[].
// Returns MakeKernel's status (0 on success).
static int BuildKernel( const char *name, int vectorSize, cl_kernel *k, cl_program *p )
{
    // Generic kernel used for every width except 3.
    const char *c[] = {
        "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global uint", sizeNames[vectorSize], "* in)\n"
        "{\n"
        "   int i = get_global_id(0);\n"
        "   out[i] = ", name, "( in[i] );\n"
        "}\n"
    };

    // Width-3 kernel: vload3/vstore3, with the last work-item handling the
    // 1- or 2-element buffer tail (unused lanes padded with 0xdead).
    const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global uint* in)\n"
        "{\n"
        "   size_t i = get_global_id(0);\n"
        "   if( i + 1 < get_global_size(0) )\n"
        "   {\n"
        "       uint3 u0 = vload3( 0, in + 3 * i );\n"
        "       float3 f0 = ", name, "( u0 );\n"
        "       vstore3( f0, 0, out + 3*i );\n"
        "   }\n"
        "   else\n"
        "   {\n"
        "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
        "       uint3 u0;\n"
        "       float3 f0;\n"
        "       switch( parity )\n"
        "       {\n"
        "           case 1:\n"
        "               u0 = (uint3)( in[3*i], 0xdead, 0xdead ); \n"
        "               break;\n"
        "           case 0:\n"
        "               u0 = (uint3)( in[3*i], in[3*i+1], 0xdead ); \n"
        "               break;\n"
        "       }\n"
        "       f0 = ", name, "( u0 );\n"
        "       switch( parity )\n"
        "       {\n"
        "           case 0:\n"
        "               out[3*i+1] = f0.y; \n"
        "               // fall through\n"
        "           case 1:\n"
        "               out[3*i] = f0.x; \n"
        "               break;\n"
        "       }\n"
        "   }\n"
        "}\n"
    };

    // Select the width-3 variant when this index maps to a 3-vector.
    const char **kern = c;
    size_t kernSize = sizeof(c)/sizeof(c[0]);
    if( sizeValues[vectorSize] == 3 )
    {
        kern = c3;
        kernSize = sizeof(c3)/sizeof(c3[0]);
    }

    char testName[32];
    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
    return MakeKernel(kern, (cl_uint) kernSize, testName, k, p);
}
// Build the double-precision test kernel (double out, ulong in) for one
// vector width. Same structure as BuildKernel() plus the cl_khr_fp64 pragma.
// Returns MakeKernel's status (0 on success).
static int BuildKernelDouble( const char *name, int vectorSize, cl_kernel *k, cl_program *p )
{
    // Generic kernel used for every width except 3.
    const char *c[] = {
        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
        "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global ulong", sizeNames[vectorSize], "* in)\n"
        "{\n"
        "   int i = get_global_id(0);\n"
        "   out[i] = ", name, "( in[i] );\n"
        "}\n"
    };

    // Width-3 kernel: vload3/vstore3, with the last work-item handling the
    // 1- or 2-element buffer tail (unused lanes padded with 0xdead...UL).
    const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
        "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global ulong* in)\n"
        "{\n"
        "   size_t i = get_global_id(0);\n"
        "   if( i + 1 < get_global_size(0) )\n"
        "   {\n"
        "       ulong3 u0 = vload3( 0, in + 3 * i );\n"
        "       double3 f0 = ", name, "( u0 );\n"
        "       vstore3( f0, 0, out + 3*i );\n"
        "   }\n"
        "   else\n"
        "   {\n"
        "       size_t parity = i & 1;   // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
        "       ulong3 u0;\n"
        "       switch( parity )\n"
        "       {\n"
        "           case 1:\n"
        "               u0 = (ulong3)( in[3*i], 0xdeaddeaddeaddeadUL, 0xdeaddeaddeaddeadUL ); \n"
        "               break;\n"
        "           case 0:\n"
        "               u0 = (ulong3)( in[3*i], in[3*i+1], 0xdeaddeaddeaddeadUL ); \n"
        "               break;\n"
        "       }\n"
        "       double3 f0 = ", name, "( u0 );\n"
        "       switch( parity )\n"
        "       {\n"
        "           case 0:\n"
        "               out[3*i+1] = f0.y; \n"
        "               // fall through\n"
        "           case 1:\n"
        "               out[3*i] = f0.x; \n"
        "               break;\n"
        "       }\n"
        "   }\n"
        "}\n"
    };

    // Select the width-3 variant when this index maps to a 3-vector.
    const char **kern = c;
    size_t kernSize = sizeof(c)/sizeof(c[0]);
    if( sizeValues[vectorSize] == 3 )
    {
        kern = c3;
        kernSize = sizeof(c3)/sizeof(c3[0]);
    }

    char testName[32];
    snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
    return MakeKernel(kern, (cl_uint) kernSize, testName, k, p);
}
// Work descriptor handed to the ThreadPool_Do() build callbacks; the pool
// builds one program/kernel per vector size in parallel. Field order matters:
// call sites initialize this struct positionally.
typedef struct BuildKernelInfo
{
    cl_uint offset; // the first vector size to build
    cl_kernel *kernels;       // output: kernels[vector size index]
    cl_program *programs;     // output: programs[vector size index]
    const char *nameInCode;   // function name as spelled in OpenCL C source
}BuildKernelInfo;
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
// Thread-pool callback: build the single-precision kernel for one vector
// size. job_id selects the width relative to the first width requested.
static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
{
    BuildKernelInfo *arg = (BuildKernelInfo *) p;
    cl_uint vectorSize = arg->offset + job_id;
    return BuildKernel( arg->nameInCode, vectorSize,
                        arg->kernels + vectorSize, arg->programs + vectorSize );
}
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
// Thread-pool callback: build the double-precision kernel for one vector
// size. job_id selects the width relative to the first width requested.
static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
{
    BuildKernelInfo *arg = (BuildKernelInfo *) p;
    cl_uint vectorSize = arg->offset + job_id;
    return BuildKernelDouble( arg->nameInCode, vectorSize,
                              arg->kernels + vectorSize, arg->programs + vectorSize );
}
// Test an OpenCL function of the form  float f( uint )  against the host
// reference f->func.f_u, sweeping the full 2**32 input space (a reduced
// stride in wimpy mode) for each configured vector width.
// Returns 0 on success, nonzero on failure.
int TestFunc_Float_UInt(const Func *f, MTdata d)
{
    uint64_t i;
    uint32_t j, k;
    int error;
    cl_program programs[ VECTOR_SIZE_COUNT ];
    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
    float maxError = 0.0f;              // worst ulp error seen
    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
    float maxErrorVal = 0.0f;           // input that produced maxError
    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
    uint64_t step = bufferSize / sizeof( float );
    // BUGFIX: this is the float flavor of the test, so the element count must
    // use sizeof(float); the previous sizeof(double) (copy-paste from the
    // double flavor) halved the count and doubled the wimpy-mode stride.
    int scale = (int)((1ULL<<32) / (16 * bufferSize / sizeof( float )) + 1);
    int isRangeLimited = 0;             // half_sin/cos/tan have a restricted domain
    float float_ulps;
    float half_sin_cos_tan_limit = 0;

    logFunctionInfo(f->name,sizeof(cl_float),gTestFastRelaxed);
    if( gWimpyMode )
    {
        step = (1ULL<<32) * gWimpyReductionFactor / (512);
    }

    if( gIsEmbedded)
        float_ulps = f->float_embedded_ulps;
    else
        float_ulps = f->float_ulps;

    // Init the kernels -- one program/kernel per vector width, built in parallel.
    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode };
    if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
        return error;

    if( 0 == strcmp( f->name, "half_sin") || 0 == strcmp( f->name, "half_cos") )
    {
        isRangeLimited = 1;
        half_sin_cos_tan_limit = 1.0f + float_ulps * (FLT_EPSILON/2.0f);     // out of range results from finite inputs must be in [-1,1]
    }
    else if( 0 == strcmp( f->name, "half_tan"))
    {
        isRangeLimited = 1;
        half_sin_cos_tan_limit = INFINITY;      // out of range result from finite inputs must be numeric
    }

    for( i = 0; i < (1ULL<<32); i += step )
    {
        // Init input array: consecutive values, or strided ones in wimpy mode.
        uint32_t *p = (uint32_t *)gIn;
        if( gWimpyMode )
        {
            for( j = 0; j < bufferSize / sizeof( float ); j++ )
                p[j] = (uint32_t) i + j * scale;
        }
        else
        {
            for( j = 0; j < bufferSize / sizeof( float ); j++ )
                p[j] = (uint32_t) i + j;
        }
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL)))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            return error;
        }

        // Write garbage into the output arrays so stale data can't masquerade as results.
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            uint32_t pattern = 0xffffdead;
            memset_pattern4(gOut[j], &pattern, bufferSize);
            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL)))
            {
                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
                goto exit;
            }
        }

        // Run the kernels.
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;     // round up
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ))){ LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }
            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)))
            {
                vlog_error( "FAILURE -- could not execute kernel\n" );
                goto exit;
            }
        }

        // Get that moving.
        if( (error = clFlush(gQueue) ))
            vlog( "clFlush failed\n" );

        // Calculate the correctly rounded reference result on the host.
        float *r = (float*) gOut_Ref;
        cl_uint *s = (cl_uint*) gIn;
        for( j = 0; j < bufferSize / sizeof( float ); j++ )
            r[j] = (float) f->func.f_u( s[j] );

        // Read the data back (blocking, so this also waits for the kernels).
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)))
            {
                vlog_error( "ReadArray failed %d\n", error );
                goto exit;
            }
        }

        if( gSkipCorrectnessTesting )
            break;

        // Verify data.
        uint32_t *t = (uint32_t*) gOut_Ref;
        for( j = 0; j < bufferSize / sizeof( float ); j++ )
        {
            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
            {
                uint32_t *q = (uint32_t*)(gOut[k]);

                // If we aren't getting the bit-exact correctly rounded result
                if( t[j] != q[j] )
                {
                    float test = ((float*) q)[j];
                    double correct = f->func.f_u( s[j] );
                    float err = Ulp_Error( test, correct );
                    int fail = ! (fabsf(err) <= float_ulps);

                    // half_sin/cos/tan are only valid between +-2**16, Inf, NaN
                    // NOTE(review): s[j] is a cl_uint, so fabsf(s[j]) tests the
                    // integer value converted to float rather than a float input's
                    // magnitude -- confirm this is the intended domain check here.
                    if( isRangeLimited && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) && fabsf(s[j]) < INFINITY )
                    {
                        if( fabsf( test ) <= half_sin_cos_tan_limit )
                        {
                            err = 0;
                            fail = 0;
                        }
                    }

                    if( fail )
                    {
                        if( ftz )
                        {
                            // retry per section 6.5.3.2: subnormal results may flush to zero
                            if( IsFloatResultSubnormal(correct, float_ulps) )
                            {
                                fail = fail && ( test != 0.0f );
                                if( ! fail )
                                    err = 0.0f;
                            }
                        }
                    }
                    if( fabsf(err ) > maxError )
                    {
                        maxError = fabsf(err);
                        maxErrorVal = s[j];
                    }
                    if( fail )
                    {
                        vlog_error( "\n%s%s: %f ulp error at 0x%8.8x: *%a vs. %a\n", f->name, sizeNames[k], err, ((uint32_t*) gIn)[j], ((float*) gOut_Ref)[j], test );
                        error = -1;
                        goto exit;
                    }
                }
            }
        }

        // Progress indicator.
        if( 0 == (i & 0x0fffffff) )
        {
            if (gVerboseBruteForce)
            {
                // BUGFIX: i and step are uint64_t -- %u/%zu mismatched the
                // argument types (undefined behavior); use %llu, and %zu for size_t.
                vlog("base:%14llu step:%10llu bufferSize:%10zu \n", (unsigned long long) i, (unsigned long long) step, bufferSize);
            } else
            {
                vlog("." );
            }
            fflush(stdout);
        }
    }

    if( ! gSkipCorrectnessTesting )
    {
        if( gWimpyMode )
            vlog( "Wimp pass" );
        else
            vlog( "passed" );
    }

    // Optional performance measurement over function-appropriate random inputs.
    if( gMeasureTimes )
    {
        // Init input array: keep inputs in a sensible domain for the function family.
        uint32_t *p = (uint32_t*)gIn;
        if( strstr( f->name, "exp" ) || strstr( f->name, "sin" ) || strstr( f->name, "cos" ) || strstr( f->name, "tan" ) )
            for( j = 0; j < bufferSize / sizeof( float ); j++ )
                ((float*)p)[j] = (float) genrand_real1(d);
        else if( strstr( f->name, "log" ) )
            for( j = 0; j < bufferSize / sizeof( float ); j++ )
                p[j] = genrand_int32(d) & 0x7fffffff;
        else
            for( j = 0; j < bufferSize / sizeof( float ); j++ )
                p[j] = genrand_int32(d);
        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            return error;
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            double sum = 0.0;
            double bestTime = INFINITY;
            for( k = 0; k < PERF_LOOP_COUNT; k++ )
            {
                uint64_t startTime = GetTime();
                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
                {
                    vlog_error( "FAILURE -- could not execute kernel\n" );
                    goto exit;
                }

                // Make sure OpenCL is done
                if( (error = clFinish(gQueue) ) )
                {
                    vlog_error( "Error %d at clFinish\n", error );
                    goto exit;
                }

                uint64_t endTime = GetTime();
                double time = SubtractTime( endTime, startTime );
                sum += time;
                if( time < bestTime )
                    bestTime = time;
            }
            if( gReportAverageTimes )
                bestTime = sum / PERF_LOOP_COUNT;
            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( float ) );
            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
        }
    }

    if( ! gSkipCorrectnessTesting )
        vlog( "\t%8.2f @ %a", maxError, maxErrorVal );
    vlog( "\n" );

exit:
    // Release
    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
    {
        clReleaseKernel(kernels[k]);
        clReleaseProgram(programs[k]);
    }

    return error;
}
// Draw 64 random bits as two 32-bit Mersenne Twister draws.
// BUGFIX: the two genrand_int32() calls previously sat in one expression,
// where their evaluation order is unspecified -- the generated 64-bit
// stream could differ between compilers. Sequence them explicitly
// (low word first) so results are reproducible everywhere.
static cl_ulong random64( MTdata d )
{
    cl_ulong lo = genrand_int32(d);
    cl_ulong hi = genrand_int32(d);
    return lo | (hi << 32);
}
// Brute-force correctness (and optional performance) test for functions that
// map a 64-bit unsigned integer to a double (dispatched via f->dfunc.f_u).
//
// Builds one kernel per enabled vector size, sweeps a 2^32-sized iteration
// space in buffer-sized (or wimpy-reduced) steps feeding random 64-bit
// inputs, and compares each device result bitwise against the host
// long-double reference; on mismatch a ulp tolerance (f->double_ulps) is
// applied, with the FTZ/subnormal retry permitted by OpenCL spec s6.5.3.2.
//
// Parameters:
//   f - function descriptor (name, reference function, ulp tolerance, ftz)
//   d - Mersenne Twister state used to generate the random inputs
// Returns 0 on success, a CL error code on API failure, or -1 on a
// correctness failure.  All kernels/programs built here are released on
// every exit path after the build step.
int TestFunc_Double_ULong(const Func *f, MTdata d)
{
    uint64_t i;
    uint32_t j, k;
    int error;
    cl_program programs[ VECTOR_SIZE_COUNT ];
    cl_kernel kernels[ VECTOR_SIZE_COUNT ];
    float maxError = 0.0f;                  // worst |ulp| error seen so far
    int ftz = f->ftz || gForceFTZ;          // flush-to-zero behavior allowed?
    double maxErrorVal = 0.0;               // input value that produced maxError
    size_t bufferSize = (gWimpyMode)? gWimpyBufferSize: BUFFER_SIZE;
    uint64_t step = bufferSize / sizeof( cl_double );

    logFunctionInfo(f->name,sizeof(cl_double),gTestFastRelaxed);

    if( gWimpyMode )
    {
        step = (1ULL<<32) * gWimpyReductionFactor / (512);
    }

    // x87 must run at 64-bit precision so the long double reference is exact
    Force64BitFPUPrecision();

    // Init the kernels: build one program/kernel per enabled vector size,
    // in parallel on the thread pool.
    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, f->nameInCode };
    if( (error = ThreadPool_Do( BuildKernel_DoubleFn,
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                &build_info ) ))
    {
        // Nothing reliably built yet -- return without touching kernels[].
        return error;
    }

    for( i = 0; i < (1ULL<<32); i += step )
    {
        // Init input array with fresh random 64-bit values
        cl_ulong *p = (cl_ulong *)gIn;
        for( j = 0; j < bufferSize / sizeof( cl_ulong ); j++ )
            p[j] = random64(d);

        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL)))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            goto exit;  // release kernels/programs before returning
        }

        // Write garbage into output arrays so stale data cannot pass verification
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            uint32_t pattern = 0xffffdead;
            memset_pattern4(gOut[j], &pattern, bufferSize);
            if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL)))
            {
                vlog_error( "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j );
                goto exit;
            }
        }

        // Run the kernels
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
            // round up so a partial trailing vector is still processed
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ))){ LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)))
            {
                vlog_error( "FAILURE -- could not execute kernel\n" );
                goto exit;
            }
        }

        // Get that moving while we compute the reference on the host
        if( (error = clFlush(gQueue) ))
            vlog( "clFlush failed\n" );

        // Calculate the correctly rounded reference result
        double *r = (double*) gOut_Ref;
        cl_ulong *s = (cl_ulong*) gIn;
        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
            r[j] = (double) f->dfunc.f_u( s[j] );

        // Read the data back (the blocking read also synchronizes with the device)
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL)))
            {
                vlog_error( "ReadArray failed %d\n", error );
                goto exit;
            }
        }

        if( gSkipCorrectnessTesting )
            break;

        // Verify data: bitwise equality first; only on mismatch fall back to
        // the ulp tolerance (and, in FTZ mode, the subnormal-flush exception).
        uint64_t *t = (uint64_t*) gOut_Ref;
        for( j = 0; j < bufferSize / sizeof( cl_double ); j++ )
        {
            for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
            {
                uint64_t *q = (uint64_t*)(gOut[k]);

                // If we aren't getting the correctly rounded result
                if( t[j] != q[j] )
                {
                    double test = ((double*) q)[j];
                    long double correct = f->dfunc.f_u( s[j] );
                    float err = Ulp_Error_Double(test, correct);
                    int fail = ! (fabsf(err) <= f->double_ulps);

                    if( fail )
                    {
                        if( ftz )
                        {
                            // retry per section 6.5.3.2: a subnormal reference
                            // result may legally be flushed to zero
                            if( IsDoubleResultSubnormal(correct, f->double_ulps) )
                            {
                                fail = fail && ( test != 0.0 );
                                if( ! fail )
                                    err = 0.0f;
                            }
                        }
                    }
                    if( fabsf(err ) > maxError )
                    {
                        maxError = fabsf(err);
                        maxErrorVal = s[j];
                    }
                    if( fail )
                    {
                        vlog_error( "\n%s%sD: %f ulp error at 0x%16.16llx: *%.13la vs. %.13la\n", f->name, sizeNames[k], err, ((uint64_t*) gIn)[j], ((double*) gOut_Ref)[j], test );
                        error = -1;
                        goto exit;
                    }
                }
            }
        }

        // Progress report roughly every 2^28 inputs
        if( 0 == (i & 0x0fffffff) )
        {
            if (gVerboseBruteForce)
            {
                // i and step are uint64_t: print with %llu + explicit casts
                // (%u / %zu on a 64-bit argument is undefined behavior).
                vlog("base:%14llu step:%10llu bufferSize:%10zu \n",
                     (unsigned long long) i, (unsigned long long) step, bufferSize);
            } else
            {
                vlog("." );
            }
            fflush(stdout);
        }
    }

    if( ! gSkipCorrectnessTesting )
    {
        if( gWimpyMode )
            vlog( "Wimp pass" );
        else
            vlog( "passed" );
    }

    if( gMeasureTimes )
    {
        // Init input array for the timing runs
        double *p = (double*) gIn;
        for( j = 0; j < bufferSize / sizeof( double ); j++ )
            p[j] = random64(d);

        if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL) ))
        {
            vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
            goto exit;  // release kernels/programs before returning
        }

        // Run the kernels, timing PERF_LOOP_COUNT executions per vector size
        for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
            if( ( error = clSetKernelArg(kernels[j], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(programs[j]); goto exit; }
            if( ( error = clSetKernelArg( kernels[j], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(programs[j]); goto exit; }

            double sum = 0.0;
            double bestTime = INFINITY;
            for( k = 0; k < PERF_LOOP_COUNT; k++ )
            {
                uint64_t startTime = GetTime();
                if( (error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
                {
                    vlog_error( "FAILURE -- could not execute kernel\n" );
                    goto exit;
                }

                // Make sure OpenCL is done
                if( (error = clFinish(gQueue) ) )
                {
                    vlog_error( "Error %d at clFinish\n", error );
                    goto exit;
                }

                uint64_t endTime = GetTime();
                double time = SubtractTime( endTime, startTime );
                sum += time;
                if( time < bestTime )
                    bestTime = time;
            }

            if( gReportAverageTimes )
                bestTime = sum / PERF_LOOP_COUNT;
            double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (bufferSize / sizeof( double ) );
            vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
        }
        // Pad the report columns for vector sizes that were not run
        for( ; j < gMaxVectorSizeIndex; j++ )
            vlog( "\t -- " );
    }

    if( ! gSkipCorrectnessTesting )
        vlog( "\t%8.2f @ %a", maxError, maxErrorVal );
    vlog( "\n" );

exit:
    // Release per-vector-size CL objects built above
    for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
    {
        clReleaseKernel(kernels[k]);
        clReleaseProgram(programs[k]);
    }
    return error;
}