Initial open source release of OpenCL 2.2 CTS.

2026-03-19 06:09:01 +00:00 · 2017-05-16 18:25:37 +05:30
parent 6911ba5116
commit 2821bf1323
1035 changed files with 343518 additions and 0 deletions
--- a/test_conformance/subgroups/CMakeLists.txt
+++ b/test_conformance/subgroups/CMakeLists.txt
@@ -0,0 +1,20 @@
+set(MODULE_NAME SUBGROUPS)
+
+set(${MODULE_NAME}_SOURCES
+    main.cpp
+    test_barrier.cpp
+    test_queries.cpp
+    test_workitem.cpp
+    test_workgroup.cpp
+    ../../test_common/harness/errorHelpers.c
+    ../../test_common/harness/testHarness.c
+    ../../test_common/harness/kernelHelpers.c
+    ../../test_common/harness/typeWrappers.cpp
+    ../../test_common/harness/mt19937.c
+    ../../test_common/harness/msvc9.c
+    ../../test_common/harness/ThreadPool.c
+    ../../test_common/harness/conversions.c
+    ../../test_common/harness/parseParameters.cpp
+)
+
+include(../CMakeCommon.txt)
--- a/test_conformance/subgroups/Jamfile
+++ b/test_conformance/subgroups/Jamfile
@@ -0,0 +1,26 @@
+project
+    : requirements
+      <toolset>gcc:<cflags>-xc++
+      <toolset>msvc:<cflags>"/TP"
+    ;
+exe test_subgroups
+    : main.cpp
+      test_queries.cpp
+      test_workitem.cpp
+      test_workgroup.cpp
+      test_barrier.cpp
+      ../../test_common/harness/errorHelpers.c
+      ../../test_common/harness/threadTesting.c
+      ../../test_common/harness/testHarness.c
+      ../../test_common/harness/mt19937.c
+      ../../test_common/harness/conversions.c
+      ../../test_common/harness/kernelHelpers.c
+      ../../test_common/harness/mt19937.c
+    : <target-os>windows:<source>../../test_common/harness/msvc9.c
+    ;
+install dist
+    : test_subgroups
+    : <variant>debug:<location>$(DIST)/debug/tests/test_conformance/subgroups
+      <variant>release:<location>$(DIST)/release/tests/test_conformance/subgroups
+    ;
+
--- a/test_conformance/subgroups/main.cpp
+++ b/test_conformance/subgroups/main.cpp
@@ -0,0 +1,60 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <string.h>
+#include "procs.h"
+#include "../../test_common/harness/testHarness.h"
+
+MTdata gMTdata;
+
+basefn basefn_list[] = {
+    test_sub_group_info,
+    test_work_item_functions,
+    test_work_group_functions,
+    test_barrier_functions,
+};
+
+const char *basefn_names[] = {
+    "sub_group_info",
+    "work_item_functions",
+    "work_group_functions",
+    "barrier_functions",
+};
+
+ct_assert((sizeof(basefn_names) / sizeof(basefn_names[0])) == (sizeof(basefn_list) / sizeof(basefn_list[0])));
+
+static const int num_fns = sizeof(basefn_names) / sizeof(char *);
+
+static int
+checkSubGroupsExtension(cl_device_id device)
+{
+    if (!is_extension_available(device, "cl_khr_subgroups")) {
+        log_info("Device does not support 'cl_khr_subgroups'. Skipping the test.\n");
+        return CL_INVALID_DEVICE;
+    }
+
+    return CL_SUCCESS;
+}
+
+int
+main(int argc, const char *argv[])
+{
+    gMTdata = init_genrand(0);
+    return runTestHarnessWithCheck(argc, argv, num_fns, basefn_list, basefn_names, false, false, NULL, checkSubGroupsExtension);
+}
+
--- a/test_conformance/subgroups/procs.h
+++ b/test_conformance/subgroups/procs.h
@@ -0,0 +1,43 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#ifndef _procs_h
+#define _procs_h
+
+#include "../../test_common/harness/testHarness.h"
+#include "../../test_common/harness/kernelHelpers.h"
+#include "../../test_common/harness/errorHelpers.h"
+#include "../../test_common/harness/conversions.h"
+#include "../../test_common/harness/threadTesting.h"
+#include "../../test_common/harness/typeWrappers.h"
+#include "../../test_common/harness/mt19937.h"
+
+extern MTdata gMTdata;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int test_sub_group_info(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_work_item_functions(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_work_group_functions(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_barrier_functions(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_pipe_functions(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*_procs_h*/
--- a/test_conformance/subgroups/subhelpers.h
+++ b/test_conformance/subgroups/subhelpers.h
@@ -0,0 +1,286 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#ifndef SUBHELPERS_H
+#define SUBHELPERS_H
+
+#include "testHarness.h"
+#include "kernelHelpers.h"
+#include "typeWrappers.h"
+
+#include <limits>
+#include <vector>
+
+// Some template helpers
+template <typename Ty> struct TypeName;
+template <> struct TypeName<cl_half> { static const char * val() { return "half"; } };
+template <> struct TypeName<cl_uint> { static const char * val() { return "uint"; } };
+template <> struct TypeName<cl_int> { static const char * val() { return "int"; } };
+template <> struct TypeName<cl_ulong> { static const char * val() { return "ulong"; } };
+template <> struct TypeName<cl_long> { static const char * val() { return "long"; } };
+template <> struct TypeName<float> { static const char * val() { return "float"; } };
+template <> struct TypeName<double> { static const char * val() { return "double"; } };
+
+template <typename Ty> struct TypeDef;
+template <> struct TypeDef<cl_half> { static const char * val() { return "typedef half Type;\n"; } };
+template <> struct TypeDef<cl_uint> { static const char * val() { return "typedef uint Type;\n"; } };
+template <> struct TypeDef<cl_int> { static const char * val() { return "typedef int Type;\n"; } };
+template <> struct TypeDef<cl_ulong> { static const char * val() { return "typedef ulong Type;\n"; } };
+template <> struct TypeDef<cl_long> { static const char * val() { return "typedef long Type;\n"; } };
+template <> struct TypeDef<float> { static const char * val() { return "typedef float Type;\n"; } };
+template <> struct TypeDef<double> { static const char * val() { return "typedef double Type;\n"; } };
+
+template <typename Ty, int Which> struct TypeIdentity;
+// template <> struct TypeIdentity<cl_half,0> { static cl_half val() { return (cl_half)0.0; } };
+// template <> struct TypeIdentity<cl_half,0> { static cl_half val() { return -(cl_half)65536.0; } };
+// template <> struct TypeIdentity<cl_half,0> { static cl_half val() { return (cl_half)65536.0; } };
+
+template <> struct TypeIdentity<cl_uint,0> { static cl_uint val() { return (cl_uint)0; } };
+template <> struct TypeIdentity<cl_uint,1> { static cl_uint val() { return (cl_uint)0; } };
+template <> struct TypeIdentity<cl_uint,2> { static cl_uint val() { return (cl_uint)0xffffffff; } };
+
+template <> struct TypeIdentity<cl_int,0> { static cl_int val() { return (cl_int)0 ; } };
+template <> struct TypeIdentity<cl_int,1> { static cl_int val() { return (cl_int)0x80000000; } };
+template <> struct TypeIdentity<cl_int,2> { static cl_int val() { return (cl_int)0x7fffffff; } };
+
+template <> struct TypeIdentity<cl_ulong,0> { static cl_ulong val() { return (cl_ulong)0 ; } };
+template <> struct TypeIdentity<cl_ulong,1> { static cl_ulong val() { return (cl_ulong)0 ; } };
+template <> struct TypeIdentity<cl_ulong,2> { static cl_ulong val() { return (cl_ulong)0xffffffffffffffffULL ; } };
+
+template <> struct TypeIdentity<cl_long,0> { static cl_long val() { return (cl_long)0; } };
+template <> struct TypeIdentity<cl_long,1> { static cl_long val() { return (cl_long)0x8000000000000000ULL; } };
+template <> struct TypeIdentity<cl_long,2> { static cl_long val() { return (cl_long)0x7fffffffffffffffULL; } };
+
+
+template <> struct TypeIdentity<float,0> { static float val() { return 0.F; } };
+template <> struct TypeIdentity<float,1> { static float val() { return -std::numeric_limits<float>::infinity(); } };
+template <> struct TypeIdentity<float,2> { static float val() { return std::numeric_limits<float>::infinity(); } };
+
+template <> struct TypeIdentity<double,0> { static double val() { return 0.L; } };
+
+template <> struct TypeIdentity<double,1> { static double val() { return -std::numeric_limits<double>::infinity(); } };
+template <> struct TypeIdentity<double,2> { static double val() { return std::numeric_limits<double>::infinity(); } };
+
+template <typename Ty> struct TypeCheck;
+template <> struct TypeCheck<cl_uint> { static bool val(cl_device_id) { return true; } };
+template <> struct TypeCheck<cl_int> { static bool val(cl_device_id) { return true; } };
+
+static bool
+int64_ok(cl_device_id device)
+{
+    char profile[128];
+    int error;
+
+    error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(profile), (void *)&profile, NULL);
+    if (error) {
+        log_info("clGetDeviceInfo failed with CL_DEVICE_PROFILE\n");
+    return false;
+    }
+
+    if (strcmp(profile, "EMBEDDED_PROFILE") == 0)
+     return is_extension_available(device, "cles_khr_int64");
+
+    return true;
+}
+
+template <> struct TypeCheck<cl_ulong> { static bool val(cl_device_id device) { return int64_ok(device); } };
+template <> struct TypeCheck<cl_long> { static bool val(cl_device_id device) { return int64_ok(device); } };
+template <> struct TypeCheck<cl_float> { static bool val(cl_device_id) { return true; } };
+template <> struct TypeCheck<cl_half> {
+    static bool val(cl_device_id device) { return is_extension_available(device, "cl_khr_fp16"); }
+};
+template <> struct TypeCheck<double> {
+    static bool val(cl_device_id device) {
+        int error;
+        cl_device_fp_config c;
+        error = clGetDeviceInfo(device, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(c), (void *)&c, NULL);
+        if (error) {
+            log_info("clGetDeviceInfo failed with CL_DEVICE_DOUBLE_FP_CONFIG\n");
+            return false;
+        }
+        return c != 0;
+    }
+};
+
+
+// Run a test kernel to compute the result of a built-in on an input
+static int
+run_kernel(cl_context context, cl_command_queue queue, cl_kernel kernel, size_t global, size_t local,
+           void *idata, size_t isize, void *mdata, size_t msize,
+       void *odata, size_t osize, size_t tsize=0)
+{
+    clMemWrapper in;
+    clMemWrapper xy;
+    clMemWrapper out;
+    clMemWrapper tmp;
+    int error;
+
+    in = clCreateBuffer(context, CL_MEM_READ_ONLY, isize, NULL, &error);
+    test_error(error, "clCreateBuffer failed");
+
+    xy = clCreateBuffer(context, CL_MEM_WRITE_ONLY, msize, NULL, &error);
+    test_error(error, "clCreateBuffer failed");
+
+    out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, osize, NULL, &error);
+    test_error(error, "clCreateBuffer failed");
+
+    if (tsize) {
+        tmp = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, tsize, NULL, &error);
+        test_error(error, "clCreateBuffer failed");
+    }
+
+    error = clSetKernelArg(kernel, 0, sizeof(in), (void *)&in);
+    test_error(error, "clSetKernelArg failed");
+
+    error = clSetKernelArg(kernel, 1, sizeof(xy), (void *)&xy);
+    test_error(error, "clSetKernelArg failed");
+
+    error = clSetKernelArg(kernel, 2, sizeof(out), (void *)&out);
+    test_error(error, "clSetKernelArg failed");
+
+    if (tsize) {
+        error = clSetKernelArg(kernel, 3, sizeof(tmp), (void *)&tmp);
+        test_error(error, "clSetKernelArg failed");
+    }
+
+    error = clEnqueueWriteBuffer(queue, in, CL_FALSE, 0, isize, idata, 0, NULL, NULL);
+    test_error(error, "clEnqueueWriteBuffer failed");
+
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+    test_error(error, "clEnqueueNDRangeKernel failed");
+
+    error = clEnqueueReadBuffer(queue, xy, CL_FALSE, 0, msize, mdata, 0, NULL, NULL);
+    test_error(error, "clEnqueueReadBuffer failed");
+
+    error = clEnqueueReadBuffer(queue, out, CL_FALSE, 0, osize, odata, 0, NULL, NULL);
+    test_error(error, "clEnqueueReadBuffer failed");
+
+    error = clFinish(queue);
+    test_error(error, "clFinish failed");
+
+    return error;
+}
+
+// Driver for testing a single built in function
+template <typename Ty, typename Fns, size_t GSIZE, size_t LSIZE, size_t TSIZE=0>
+struct test {
+    static int
+    run(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements, const char *kname, const char *src, int dynscl=0)
+    {
+        size_t tmp;
+        int error;
+        int subgroup_size, num_subgroups;
+        size_t realSize;
+        size_t global;
+        size_t local;
+        const char *kstrings[3];
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+        cl_platform_id platform;
+        cl_int sgmap[2*GSIZE];
+        Ty mapin[LSIZE];
+        Ty mapout[LSIZE];
+
+    // Make sure a test of type Ty is supported by the device
+        if (!TypeCheck<Ty>::val(device))
+            return 0;
+
+        error = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), (void *)&platform, NULL);
+        test_error(error, "clGetDeviceInfo failed for CL_DEVICE_PLATFORM");
+
+        kstrings[0] = "#pragma OPENCL EXTENSION cl_khr_subgroups : enable\n"
+                  "#define XY(M,I) M[I].x = get_sub_group_local_id(); M[I].y = get_sub_group_id();\n";
+        kstrings[1] = TypeDef<Ty>::val();
+        kstrings[2] = src;
+        error = create_single_kernel_helper_with_build_options(context, &program, &kernel, 3, kstrings, kname, "-cl-std=CL2.0");
+        if (error != 0)
+            return error;
+
+        // Determine some local dimensions to use for the test.
+        global = GSIZE;
+        error = get_max_common_work_group_size(context, kernel, GSIZE, &local);
+        test_error(error, "get_max_common_work_group_size failed");
+
+        // Limit it a bit so we have muliple work groups
+        // Ideally this will still be large enough to give us multiple subgroups
+        if (local > LSIZE)
+            local = LSIZE;
+
+    // Get the sub group info
+        clGetKernelSubGroupInfoKHR_fn clGetKernelSubGroupInfoKHR_ptr;
+        clGetKernelSubGroupInfoKHR_ptr = (clGetKernelSubGroupInfoKHR_fn)clGetExtensionFunctionAddressForPlatform(platform,
+        "clGetKernelSubGroupInfoKHR");
+        if (clGetKernelSubGroupInfoKHR_ptr == NULL) {
+            log_error("ERROR: clGetKernelSubGroupInfoKHR function not available");
+            return -1;
+        }
+
+        error = clGetKernelSubGroupInfoKHR_ptr(kernel, device, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,
+                                           sizeof(local), (void *)&local, sizeof(tmp), (void *)&tmp, NULL);
+        test_error(error, "clGetKernelSubGroupInfoKHR failed for CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR");
+        subgroup_size = (int)tmp;
+
+        error = clGetKernelSubGroupInfoKHR_ptr(kernel, device, CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR,
+                                           sizeof(local), (void *)&local, sizeof(tmp), (void *)&tmp, NULL);
+        test_error(error, "clGetKernelSubGroupInfoKHR failed for CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR");
+        num_subgroups = (int)tmp;
+
+    // Make sure the number of sub groups is what we expect
+        if (num_subgroups != (local + subgroup_size - 1)/ subgroup_size) {
+            log_error("ERROR: unexpected number of subgroups (%d) returned by clGetKernelSubGroupInfoKHR\n", num_subgroups);
+            return -1;
+        }
+
+        std::vector<Ty> idata;
+        std::vector<Ty> odata;
+        size_t input_array_size = GSIZE;
+        size_t output_array_size = GSIZE;
+
+        if (dynscl != 0) {
+          input_array_size = (int)global / (int)local * num_subgroups * dynscl;
+          output_array_size = (int)global / (int)local * dynscl;
+        }
+
+        idata.resize(input_array_size);
+        odata.resize(output_array_size);
+
+    // Run the kernel once on zeroes to get the map
+    memset(&idata[0], 0, input_array_size * sizeof(Ty));
+        error = run_kernel(context, queue, kernel, global, local,
+                           &idata[0], input_array_size * sizeof(Ty),
+               sgmap, global*sizeof(cl_int)*2,
+               &odata[0], output_array_size * sizeof(Ty),
+               TSIZE*sizeof(Ty));
+    if (error)
+        return error;
+
+    // Generate the desired input for the kernel
+        Fns::gen(&idata[0], mapin, sgmap, subgroup_size, (int)local, (int)global / (int)local);
+
+        error = run_kernel(context, queue, kernel, global, local,
+                           &idata[0], input_array_size * sizeof(Ty),
+               sgmap, global*sizeof(cl_int)*2,
+               &odata[0], output_array_size * sizeof(Ty),
+               TSIZE*sizeof(Ty));
+    if (error)
+        return error;
+
+
+    // Check the result
+    return Fns::chk(&idata[0], &odata[0], mapin, mapout, sgmap, subgroup_size, (int)local, (int)global / (int)local);
+    }
+};
+
+#endif
--- a/test_conformance/subgroups/test_barrier.cpp
+++ b/test_conformance/subgroups/test_barrier.cpp
@@ -0,0 +1,147 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+#include "subhelpers.h"
+#include "../../test_common/harness/conversions.h"
+#include "../../test_common/harness/typeWrappers.h"
+
+static const char * lbar_source =
+"__kernel void test_lbar(const __global Type *in, __global int2 *xy, __global Type *out)\n"
+"{\n"
+"    __local int tmp[200];\n"
+"    int gid = get_global_id(0);\n"
+"    int nid = get_sub_group_size();\n"
+"    int lid = get_sub_group_local_id();\n"
+"    xy[gid].x = lid;\n"
+"    xy[gid].y = get_sub_group_id();\n"
+"    if (get_sub_group_id() == 0) {\n"
+"        tmp[lid] = in[gid];\n"
+"        sub_group_barrier(CLK_LOCAL_MEM_FENCE);\n"
+"        out[gid] = tmp[nid-1-lid];\n"
+"    } else {\n"
+"        out[gid] = -in[gid];\n"
+"    }\n"
+"}\n";
+
+static const char * gbar_source =
+"__kernel void test_gbar(const __global Type *in, __global int2 *xy, __global Type *out, __global Type *tmp)\n"
+"{\n"
+"    int gid = get_global_id(0);\n"
+"    int nid = get_sub_group_size();\n"
+"    int lid = get_sub_group_local_id();\n"
+"    int tof = get_group_id(0)*get_max_sub_group_size();\n"
+"    xy[gid].x = lid;\n"
+"    xy[gid].y = get_sub_group_id();\n"
+"    if (get_sub_group_id() == 0) {\n"
+"        tmp[tof+lid] = in[gid];\n"
+"        sub_group_barrier(CLK_GLOBAL_MEM_FENCE);\n"
+"        out[gid] = tmp[tof+nid-1-lid];\n"
+"    } else {\n"
+"        out[gid] = -in[gid];\n"
+"    }\n"
+"}\n";
+
+// barrier test functions
+template <int Which>
+struct BAR {
+    static void gen(cl_int *x, cl_int *t, cl_int *m, int ns, int nw, int ng)
+    {
+        int i, ii, j, k, n;
+        int nj = (nw + ns - 1)/ns;
+        int e;
+
+        ii = 0;
+        for (k=0; k<ng; ++k) {
+            for (j=0; j<nj; ++j) {
+                ii = j*ns;
+                n = ii + ns > nw ? nw - ii : ns;
+
+                for (i=0;i<n;++i)
+                    t[ii+i] = genrand_int32(gMTdata);
+            }
+
+            // Now map into work group using map from device
+            for (j=0;j<nw;++j) {
+                i = m[2*j+1]*ns + m[2*j];
+                x[j] = t[i];
+            }
+
+            x += nw;
+        m += 2*nw;
+        }
+    }
+
+    static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m, int ns, int nw, int ng)
+    {
+        int ii, i, j, k, n;
+        int nj = (nw + ns - 1)/ns;
+        cl_int tr, rr;
+
+    if (Which == 0)
+            log_info("  sub_group_barrier(CLK_LOCAL_MEM_FENCE)...\n");
+    else
+            log_info("  sub_group_barrier(CLK_GLOBAL_MEM_FENCE)...\n");
+
+        for (k=0; k<ng; ++k) {
+            // Map to array indexed to array indexed by local ID and sub group
+            for (j=0; j<nw; ++j) {
+                i = m[2*j+1]*ns + m[2*j];
+                mx[i] = x[j];
+                my[i] = y[j];
+            }
+
+            for (j=0; j<nj; ++j) {
+                ii = j*ns;
+                n = ii + ns > nw ? nw - ii : ns;
+
+                for (i=0; i<n; ++i) {
+                    tr = j == 0 ?  mx[ii + n - 1 - i] : -mx[ii + i];
+                    rr = my[ii + i];
+
+                    if (tr != rr) {
+                        log_error("ERROR: sub_group_barrier mismatch for local id %d in sub group %d in group %d\n",
+                                   i, j, k);
+                        return -1;
+                    }
+                }
+            }
+
+            x += nw;
+            y += nw;
+        m += 2*nw;
+        }
+
+        return 0;
+    }
+};
+
+
+// Entry point from main
+int
+test_barrier_functions(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+    int error;
+
+    // Adjust these individually below if desired/needed
+#define G 2000
+#define L 200
+
+    error = test<cl_int, BAR<0>, G, L>::run(device, context, queue, num_elements, "test_lbar", lbar_source);
+    error = test<cl_int, BAR<1>, G, L, G>::run(device, context, queue, num_elements, "test_gbar", gbar_source);
+
+    return error;
+}
+
--- a/test_conformance/subgroups/test_queries.cpp
+++ b/test_conformance/subgroups/test_queries.cpp
@@ -0,0 +1,136 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+
+typedef struct {
+    cl_uint maxSubGroupSize;
+    cl_uint numSubGroups;
+} result_data;
+
+static const char * query_kernel_source =
+"#pragma OPENCL EXTENSION cl_khr_subgroups : enable\n"
+"\n"
+"typedef struct {\n"
+"    uint maxSubGroupSize;\n"
+"    uint numSubGroups;\n"
+"} result_data;\n"
+"\n"
+"__kernel void query_kernel( __global result_data *outData )\n"
+"{\n"
+"    int gid = get_global_id( 0 );\n"
+"    outData[gid].maxSubGroupSize = get_max_sub_group_size();\n"
+"    outData[gid].numSubGroups = get_num_sub_groups();\n"
+"}";
+
+int
+test_sub_group_info(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+    static const size_t gsize0 = 80;
+    int i, error;
+    size_t realSize;
+    size_t kernel_max_subgroup_size, kernel_subgroup_count;
+    size_t global[] = {gsize0,14,10};
+    size_t local[] = {0,0,0};
+    result_data result[gsize0];
+
+    cl_uint max_dimensions;
+
+    error = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(max_dimensions), &max_dimensions, NULL);
+    test_error(error,  "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS");
+
+    cl_platform_id platform;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper out;
+
+    error = create_single_kernel_helper_with_build_options(context, &program, &kernel, 1, &query_kernel_source, "query_kernel", "-cl-std=CL2.0");
+    if (error != 0)
+        return error;
+
+    // Determine some local dimensions to use for the test.
+    if (max_dimensions == 1) {
+        error = get_max_common_work_group_size(context, kernel, global[0], &local[0]);
+        test_error(error, "get_max_common_work_group_size failed");
+    } else if (max_dimensions == 2) {
+        error = get_max_common_2D_work_group_size(context, kernel, global, local);
+        test_error(error, "get_max_common_2D_work_group_size failed");
+    } else {
+        error = get_max_common_3D_work_group_size(context, kernel, global, local);
+        test_error(error, "get_max_common_3D_work_group_size failed");
+    }
+
+    error = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), (void *)&platform, NULL);
+    test_error(error, "clDeviceInfo failed for CL_DEVICE_PLATFORM");
+
+    clGetKernelSubGroupInfoKHR_fn clGetKernelSubGroupInfoKHR_ptr;
+    clGetKernelSubGroupInfoKHR_ptr = (clGetKernelSubGroupInfoKHR_fn)clGetExtensionFunctionAddressForPlatform(platform, "clGetKernelSubGroupInfoKHR");
+    if (clGetKernelSubGroupInfoKHR_ptr == NULL) {
+        log_error("ERROR: clGetKernelSubGroupInfoKHR function not available");
+        return -1;
+    }
+
+    error = clGetKernelSubGroupInfoKHR_ptr(kernel, device, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,
+            sizeof(local), (void *)&local, sizeof(kernel_max_subgroup_size), (void *)&kernel_max_subgroup_size, &realSize);
+    test_error(error, "clGetKernelSubGroupInfoKHR failed for CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR");
+    log_info("The CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR for the kernel is %d.\n", (int)kernel_max_subgroup_size);
+
+    if (realSize != sizeof(kernel_max_subgroup_size)) {
+        log_error( "ERROR: Returned size of max sub group size not valid! (Expected %d, got %d)\n", (int)sizeof(kernel_max_subgroup_size), (int)realSize );
+        return -1;
+    }
+
+    error = clGetKernelSubGroupInfoKHR_ptr(kernel, device, CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR,
+            sizeof(local), (void *)&local, sizeof(kernel_subgroup_count), (void *)&kernel_subgroup_count, &realSize);
+    test_error(error, "clGetKernelSubGroupInfoKHR failed for CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR");
+    log_info("The CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR for the kernel is %d.\n", (int)kernel_subgroup_count);
+
+    if (realSize != sizeof(kernel_subgroup_count)) {
+        log_error( "ERROR: Returned size of sub group count not valid! (Expected %d, got %d)\n", (int)sizeof(kernel_subgroup_count), (int)realSize );
+        return -1;
+    }
+
+    // Verify that the kernel gets the same max_subgroup_size and subgroup_count
+    out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(result), NULL, &error);
+    test_error(error, "clCreateBuffer failed");
+
+    error = clSetKernelArg(kernel, 0, sizeof(out), &out);
+    test_error(error, "clSetKernelArg failed");
+
+    error = clEnqueueNDRangeKernel(queue, kernel, max_dimensions, NULL, global, local, 0, NULL, NULL);
+    test_error(error, "clEnqueueNDRangeKernel failed");
+
+    error = clEnqueueReadBuffer(queue, out, CL_FALSE, 0, sizeof(result), &result, 0, NULL, NULL);
+    test_error(error, "clEnqueueReadBuffer failed");
+
+    error = clFinish(queue);
+    test_error(error, "clFinish failed");
+
+    for (i=0; i<(int)gsize0; ++i) {
+        if (result[i].maxSubGroupSize != (cl_uint)kernel_max_subgroup_size) {
+            log_error("ERROR: get_max_subgroup_size() doesn't match result from clGetKernelSubGroupInfoKHR, %u vs %u\n",
+                      result[i].maxSubGroupSize, (cl_uint)kernel_max_subgroup_size);
+            return -1;
+        }
+        if (result[i].numSubGroups != (cl_uint)kernel_subgroup_count) {
+            log_error("ERROR: get_num_sub_groups() doesn't match result from clGetKernelSubGroupInfoKHR, %u vs %u\n",
+                      result[i].numSubGroups, (cl_uint)kernel_subgroup_count);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
--- a/test_conformance/subgroups/test_workgroup.cpp
+++ b/test_conformance/subgroups/test_workgroup.cpp
@@ -0,0 +1,867 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+#include "subhelpers.h"
+#include "../../test_common/harness/conversions.h"
+#include "../../test_common/harness/typeWrappers.h"
+
+static const char * any_source =
+"__kernel void test_any(const __global Type *in, __global int2 *xy, __global Type *out)\n"
+"{\n"
+"    int gid = get_global_id(0);\n"
+"    XY(xy,gid);\n"
+"    out[gid] = sub_group_any(in[gid]);\n"
+"}\n";
+
+static const char * all_source =
+"__kernel void test_all(const __global Type *in, __global int2 *xy, __global Type *out)\n"
+"{\n"
+"    int gid = get_global_id(0);\n"
+"    XY(xy,gid);\n"
+"    out[gid] = sub_group_all(in[gid]);\n"
+"}\n";
+
+static const char * bcast_source =
+"__kernel void test_bcast(const __global Type *in, __global int2 *xy, __global Type *out)\n"
+"{\n"
+"    int gid = get_global_id(0);\n"
+"    XY(xy,gid);\n"
+"    Type x = in[gid];\n"
+"    size_t loid = (size_t)((int)x % 100);\n"
+"    out[gid] = sub_group_broadcast(x, loid);\n"
+"}\n";
+
+static const char * redadd_source =
+"__kernel void test_redadd(const __global Type *in, __global int2 *xy, __global Type *out)\n"
+"{\n"
+"    int gid = get_global_id(0);\n"
+"    XY(xy,gid);\n"
+"    out[gid] = sub_group_reduce_add(in[gid]);\n"
+"}\n";
+
+static const char * redmax_source =
+"__kernel void test_redmax(const __global Type *in, __global int2 *xy, __global Type *out)\n"
+"{\n"
+"    int gid = get_global_id(0);\n"
+"    XY(xy,gid);\n"
+"    out[gid] = sub_group_reduce_max(in[gid]);\n"
+"}\n";
+
+static const char * redmin_source =
+"__kernel void test_redmin(const __global Type *in, __global int2 *xy, __global Type *out)\n"
+"{\n"
+"    int gid = get_global_id(0);\n"
+"    XY(xy,gid);\n"
+"    out[gid] = sub_group_reduce_min(in[gid]);\n"
+"}\n";
+
+static const char * scinadd_source =
+"__kernel void test_scinadd(const __global Type *in, __global int2 *xy, __global Type *out)\n"
+"{\n"
+"    int gid = get_global_id(0);\n"
+"    XY(xy,gid);\n"
+"    out[gid] = sub_group_scan_inclusive_add(in[gid]);\n"
+"}\n";
+
+static const char * scinmax_source =
+"__kernel void test_scinmax(const __global Type *in, __global int2 *xy, __global Type *out)\n"
+"{\n"
+"    int gid = get_global_id(0);\n"
+"    XY(xy,gid);\n"
+"    out[gid] = sub_group_scan_inclusive_max(in[gid]);\n"
+"}\n";
+
+static const char * scinmin_source =
+"__kernel void test_scinmin(const __global Type *in, __global int2 *xy, __global Type *out)\n"
+"{\n"
+"    int gid = get_global_id(0);\n"
+"    XY(xy,gid);\n"
+"    out[gid] = sub_group_scan_inclusive_min(in[gid]);\n"
+"}\n";
+
+static const char * scexadd_source =
+"__kernel void test_scexadd(const __global Type *in, __global int2 *xy, __global Type *out)\n"
+"{\n"
+"    int gid = get_global_id(0);\n"
+"    XY(xy,gid);\n"
+"    out[gid] = sub_group_scan_exclusive_add(in[gid]);\n"
+"}\n";
+
+static const char * scexmax_source =
+"__kernel void test_scexmax(const __global Type *in, __global int2 *xy, __global Type *out)\n"
+"{\n"
+"    int gid = get_global_id(0);\n"
+"    XY(xy,gid);\n"
+"    out[gid] = sub_group_scan_exclusive_max(in[gid]);\n"
+"}\n";
+
+static const char * scexmin_source =
+"__kernel void test_scexmin(const __global Type *in, __global int2 *xy, __global Type *out)\n"
+"{\n"
+"    int gid = get_global_id(0);\n"
+"    XY(xy,gid);\n"
+"    out[gid] = sub_group_scan_exclusive_min(in[gid]);\n"
+"}\n";
+
+// These need to stay in sync with the kernel source below
+#define NUM_LOC 49
+#define INST_LOC_MASK 0x7f
+#define INST_OP_SHIFT 0
+#define INST_OP_MASK 0xf
+#define INST_LOC_SHIFT 4
+#define INST_VAL_SHIFT 12
+#define INST_VAL_MASK 0x7ffff
+#define INST_END 0x0
+#define INST_STORE 0x1
+#define INST_WAIT 0x2
+#define INST_COUNT 0x3
+
+static const char * ifp_source =
+"#define NUM_LOC 49\n"
+"#define INST_LOC_MASK 0x7f\n"
+"#define INST_OP_SHIFT 0\n"
+"#define INST_OP_MASK 0xf\n"
+"#define INST_LOC_SHIFT 4\n"
+"#define INST_VAL_SHIFT 12\n"
+"#define INST_VAL_MASK 0x7ffff\n"
+"#define INST_END 0x0\n"
+"#define INST_STORE 0x1\n"
+"#define INST_WAIT 0x2\n"
+"#define INST_COUNT 0x3\n"
+"\n"
+"__kernel void\n"
+"test_ifp(const __global int *in, __global int2 *xy, __global int *out)\n"
+"{\n"
+"    __local atomic_int loc[NUM_LOC];\n"
+"\n"
+"    // Don't run if there is only one sub group\n"
+"    if (get_num_sub_groups() == 1)\n"
+"        return;\n"
+"\n"
+"    // First initialize loc[]\n"
+"    int lid = (int)get_local_id(0);\n"
+"\n"
+"    if (lid < NUM_LOC)\n"
+"        atomic_init(loc+lid, 0);\n"
+"\n"
+"    work_group_barrier(CLK_LOCAL_MEM_FENCE);\n"
+"\n"
+"    // Compute pointer to this sub group's \"instructions\"\n"
+"    const __global int *pc = in +\n"
+"        ((int)get_group_id(0)*(int)get_enqueued_num_sub_groups() +\n"
+"         (int)get_sub_group_id()) *\n"
+"        (NUM_LOC+1);\n"
+"\n"
+"    // Set up to \"run\"\n"
+"    bool ok = (int)get_sub_group_local_id() == 0;\n"
+"    bool run = true;\n"
+"\n"
+"    while (run) {\n"
+"        int inst = *pc++;\n"
+"        int iop = (inst >> INST_OP_SHIFT) & INST_OP_MASK;\n"
+"        int iloc = (inst >> INST_LOC_SHIFT) & INST_LOC_MASK;\n"
+"        int ival = (inst >> INST_VAL_SHIFT) & INST_VAL_MASK;\n"
+"\n"
+"        switch (iop) {\n"
+"        case INST_STORE:\n"
+"            if (ok)\n"
+"                atomic_store(loc+iloc, ival);\n"
+"            break;\n"
+"        case INST_WAIT:\n"
+"            if (ok) {\n"
+"                while (atomic_load(loc+iloc) != ival)\n"
+"                    ;\n"
+"            }\n"
+"            break;\n"
+"        case INST_COUNT:\n"
+"            if (ok) {\n"
+"                int i;\n"
+"                for (i=0;i<ival;++i)\n"
+"                    atomic_fetch_add(loc+iloc, 1);\n"
+"            }\n"
+"            break;\n"
+"        case INST_END:\n"
+"            run = false;\n"
+"            break;\n"
+"        }\n"
+"\n"
+"        sub_group_barrier(CLK_LOCAL_MEM_FENCE);\n"
+"    }\n"
+"\n"
+"    work_group_barrier(CLK_LOCAL_MEM_FENCE);\n"
+"\n"
+"    // Save this group's result\n"
+"    __global int *op = out + (int)get_group_id(0)*NUM_LOC;\n"
+"    if (lid < NUM_LOC)\n"
+"        op[lid] = atomic_load(loc+lid);\n"
+"}\n";
+
+// Any/All test functions
+template <int Which>
+struct AA {
+    static void gen(cl_int *x, cl_int *t, cl_int *m, int ns, int nw, int ng)
+    {
+        int i, ii, j, k, n;
+        int nj = (nw + ns - 1)/ns;
+        int e;
+
+        ii = 0;
+        for (k=0; k<ng; ++k) {
+            for (j=0; j<nj; ++j) {
+                ii = j*ns;
+                n = ii + ns > nw ? nw - ii : ns;
+                e = (int)(genrand_int32(gMTdata) % 3);
+
+                // Initialize data matrix indexed by local id and sub group id
+                switch (e) {
+                case 0:
+                    memset(&t[ii], 0, n*sizeof(cl_int));
+                    break;
+                case 1:
+                    memset(&t[ii], 0, n*sizeof(cl_int));
+                    i = (int)(genrand_int32(gMTdata) % (cl_uint)n);
+                    t[ii + i] = 41;
+                    break;
+                case 2:
+                    memset(&t[ii], 0xff, n*sizeof(cl_int));
+                    break;
+                }
+            }
+
+            // Now map into work group using map from device
+            for (j=0;j<nw;++j) {
+                i = m[2*j+1]*ns + m[2*j];
+                x[j] = t[i];
+            }
+
+            x += nw;
+        m += 2*nw;
+        }
+    }
+
+    static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m, int ns, int nw, int ng)
+    {
+        int ii, i, j, k, n;
+        int nj = (nw + ns - 1)/ns;
+        cl_int taa, raa;
+
+        log_info("  sub_group_%s...\n", Which == 0 ? "any" : "all");
+
+        for (k=0; k<ng; ++k) {
+            // Map to array indexed to array indexed by local ID and sub group
+            for (j=0; j<nw; ++j) {
+                i = m[2*j+1]*ns + m[2*j];
+                mx[i] = x[j];
+                my[i] = y[j];
+            }
+
+            for (j=0; j<nj; ++j) {
+                ii = j*ns;
+                n = ii + ns > nw ? nw - ii : ns;
+
+                // Compute target
+                if (Which == 0) {
+                    taa = 0;
+                    for (i=0; i<n; ++i)
+                        taa |=  mx[ii + i] != 0;
+                } else {
+                    taa = 1;
+                    for (i=0; i<n; ++i)
+                        taa &=  mx[ii + i] != 0;
+                }
+
+                // Check result
+                for (i=0; i<n; ++i) {
+                    raa = my[ii+i] != 0;
+                    if (raa != taa) {
+                        log_error("ERROR: sub_group_%s mismatch for local id %d in sub group %d in group %d\n",
+                                   Which == 0 ? "any" : "all", i, j, k);
+                        return -1;
+                    }
+                }
+            }
+
+            x += nw;
+            y += nw;
+            m += 2*nw;
+        }
+
+        return 0;
+    }
+};
+
+// Reduce functions
+template <typename Ty, int Which>
+struct RED {
+    static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
+    {
+        int i, ii, j, k, n;
+        int nj = (nw + ns - 1)/ns;
+
+        ii = 0;
+        for (k=0; k<ng; ++k) {
+            for (j=0; j<nj; ++j) {
+                ii = j*ns;
+                n = ii + ns > nw ? nw - ii : ns;
+
+                for (i=0; i<n; ++i)
+                    t[ii+i] = (Ty)((int)(genrand_int32(gMTdata) & 0x7fffffff) % ns + 1);
+            }
+
+            // Now map into work group using map from device
+            for (j=0;j<nw;++j) {
+                i = m[2*j+1]*ns + m[2*j];
+                x[j] = t[i];
+            }
+
+            x += nw;
+        m += 2*nw;
+        }
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw, int ng)
+    {
+        int ii, i, j, k, n;
+        int nj = (nw + ns - 1)/ns;
+        Ty tr, rr;
+
+        log_info("  sub_group_reduce_%s(%s)...\n", Which == 0 ? "add" : (Which == 1 ? "max" : "min"), TypeName<Ty>::val());
+
+        for (k=0; k<ng; ++k) {
+            // Map to array indexed to array indexed by local ID and sub group
+            for (j=0; j<nw; ++j) {
+                i = m[2*j+1]*ns + m[2*j];
+                mx[i] = x[j];
+                my[i] = y[j];
+            }
+
+            for (j=0; j<nj; ++j) {
+                ii = j*ns;
+                n = ii + ns > nw ? nw - ii : ns;
+
+                // Compute target
+                if (Which == 0) {
+                    // add
+                    tr = mx[ii];
+                    for (i=1; i<n; ++i)
+                        tr +=  mx[ii + i];
+                } else if (Which == 1) {
+                    // max
+                    tr = mx[ii];
+                    for (i=1; i<n; ++i)
+                        tr = tr > mx[ii + i] ? tr : mx[ii + i];
+                } else if (Which == 2) {
+                    // min
+                    tr = mx[ii];
+                    for (i=1; i<n; ++i)
+                        tr = tr > mx[ii + i] ? mx[ii + i] : tr;
+                }
+
+                // Check result
+                for (i=0; i<n; ++i) {
+                    rr = my[ii+i];
+                    if (rr != tr) {
+                        log_error("ERROR: sub_group_reduce_%s(%s) mismatch for local id %d in sub group %d in group %d\n",
+                                   Which == 0 ? "add" : (Which == 1 ? "max" : "min"), TypeName<Ty>::val(), i, j, k);
+                        return -1;
+                    }
+                }
+            }
+
+            x += nw;
+            y += nw;
+            m += 2*nw;
+        }
+
+        return 0;
+    }
+};
+
+// Scan Inclusive functions
+template <typename Ty, int Which>
+struct SCIN {
+    static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
+    {
+        int i, ii, j, k, n;
+        int nj = (nw + ns - 1)/ns;
+
+        ii = 0;
+        for (k=0; k<ng; ++k) {
+            for (j=0; j<nj; ++j) {
+                ii = j*ns;
+                n = ii + ns > nw ? nw - ii : ns;
+
+                for (i=0; i<n; ++i)
+                    // t[ii+i] = (Ty)((int)(genrand_int32(gMTdata) & 0x7fffffff) % ns + 1);
+                    t[ii+i] = (Ty)i;
+            }
+
+            // Now map into work group using map from device
+            for (j=0;j<nw;++j) {
+                i = m[2*j+1]*ns + m[2*j];
+                x[j] = t[i];
+            }
+
+            x += nw;
+        m += 2*nw;
+        }
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw, int ng)
+    {
+        int ii, i, j, k, n;
+        int nj = (nw + ns - 1)/ns;
+        Ty tr, rr;
+
+        log_info("  sub_group_scan_inclusive_%s(%s)...\n",  Which == 0 ? "add" : (Which == 1 ? "max" : "min"), TypeName<Ty>::val());
+
+        for (k=0; k<ng; ++k) {
+            // Map to array indexed to array indexed by local ID and sub group
+            for (j=0; j<nw; ++j) {
+                i = m[2*j+1]*ns + m[2*j];
+                mx[i] = x[j];
+                my[i] = y[j];
+            }
+
+            for (j=0; j<nj; ++j) {
+                ii = j*ns;
+                n = ii + ns > nw ? nw - ii : ns;
+
+                // Check result
+                for (i=0; i<n; ++i) {
+                    if (Which == 0) {
+                        tr = i == 0 ? mx[ii] : tr + mx[ii + i];
+                    } else if (Which == 1) {
+                        tr = i == 0 ? mx[ii] : (tr > mx[ii + i] ? tr : mx[ii + i]);
+                    } else {
+                        tr = i == 0 ? mx[ii] : (tr > mx[ii + i] ? mx[ii + i] : tr);
+                    }
+
+                    rr = my[ii+i];
+                    if (rr != tr) {
+                        log_error("ERROR: sub_group_scan_inclusive_%s(%s) mismatch for local id %d in sub group %d in group %d\n",
+                                   Which == 0 ? "add" : (Which == 1 ? "max" : "min"), TypeName<Ty>::val(), i, j, k);
+                        return -1;
+                    }
+                }
+            }
+
+            x += nw;
+            y += nw;
+            m += 2*nw;
+        }
+
+        return 0;
+    }
+};
+
+// Scan Exclusive functions
+template <typename Ty, int Which>
+struct SCEX {
+    static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
+    {
+        int i, ii, j, k, n;
+        int nj = (nw + ns - 1)/ns;
+
+        ii = 0;
+        for (k=0; k<ng; ++k) {
+            for (j=0; j<nj; ++j) {
+                ii = j*ns;
+                n = ii + ns > nw ? nw - ii : ns;
+
+                for (i=0; i<n; ++i)
+                    t[ii+i] = (Ty)((int)(genrand_int32(gMTdata) & 0x7fffffff) % ns + 1);
+            }
+
+            // Now map into work group using map from device
+            for (j=0;j<nw;++j) {
+                i = m[2*j+1]*ns + m[2*j];
+                x[j] = t[i];
+            }
+
+            x += nw;
+        m += 2*nw;
+        }
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw, int ng)
+    {
+        int ii, i, j, k, n;
+        int nj = (nw + ns - 1)/ns;
+        Ty tr, trt, rr;
+
+        log_info("  sub_group_scan_exclusive_%s(%s)...\n", Which == 0 ? "add" : (Which == 1 ? "max" : "min"), TypeName<Ty>::val());
+
+        for (k=0; k<ng; ++k) {
+            // Map to array indexed to array indexed by local ID and sub group
+            for (j=0; j<nw; ++j) {
+                i = m[2*j+1]*ns + m[2*j];
+                mx[i] = x[j];
+                my[i] = y[j];
+            }
+
+            for (j=0; j<nj; ++j) {
+                ii = j*ns;
+                n = ii + ns > nw ? nw - ii : ns;
+
+                // Check result
+                for (i=0; i<n; ++i) {
+                    if (Which == 0) {
+                        tr = i == 0 ? TypeIdentity<Ty,Which>::val() : tr + trt;
+                    } else if (Which == 1) {
+                        tr = i == 0 ? TypeIdentity<Ty,Which>::val() : (trt > tr ? trt : tr);
+                    } else {
+                        tr = i == 0 ? TypeIdentity<Ty,Which>::val() : (trt > tr ? tr : trt);
+                    }
+                    trt = mx[ii+i];
+                    rr = my[ii+i];
+
+                    if (rr != tr) {
+                        log_error("ERROR: sub_group_scan_exclusive_%s(%s) mismatch for local id %d in sub group %d in group %d\n",
+                                   Which == 0 ? "add" : (Which == 1 ? "max" : "min"), TypeName<Ty>::val(), i, j, k);
+                        return -1;
+                    }
+                }
+            }
+
+            x += nw;
+            y += nw;
+            m += 2*nw;
+        }
+
+        return 0;
+    }
+};
+
+// Broadcast functios
+template <typename Ty>
+struct BC {
+    static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
+    {
+        int i, ii, j, k, l, n;
+        int nj = (nw + ns - 1)/ns;
+        int d = ns > 100 ? 100 : ns;
+
+        ii = 0;
+        for (k=0; k<ng; ++k) {
+            for (j=0; j<nj; ++j) {
+                ii = j*ns;
+                n = ii + ns > nw ? nw - ii : ns;
+                l = (int)(genrand_int32(gMTdata) & 0x7fffffff) % (d > n ? n : d);
+
+                for (i=0; i<n; ++i)
+                    t[ii+i] = (Ty)((int)(genrand_int32(gMTdata) & 0x7fffffff) % 100 * 100 + l);
+            }
+
+            // Now map into work group using map from device
+            for (j=0;j<nw;++j) {
+                i = m[2*j+1]*ns + m[2*j];
+                x[j] = t[i];
+            }
+
+            x += nw;
+        m += 2*nw;
+        }
+    }
+
+    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw, int ng)
+    {
+        int ii, i, j, k, l, n;
+        int nj = (nw + ns - 1)/ns;
+        Ty tr, rr;
+
+        log_info("  sub_group_broadcast(%s)...\n", TypeName<Ty>::val());
+
+        for (k=0; k<ng; ++k) {
+            // Map to array indexed to array indexed by local ID and sub group
+            for (j=0; j<nw; ++j) {
+                i = m[2*j+1]*ns + m[2*j];
+                mx[i] = x[j];
+                my[i] = y[j];
+            }
+
+            for (j=0; j<nj; ++j) {
+                ii = j*ns;
+                n = ii + ns > nw ? nw - ii : ns;
+                l = (int)mx[ii] % 100;
+                tr = mx[ii+l];
+
+                // Check result
+                for (i=0; i<n; ++i) {
+                    rr = my[ii+i];
+                    if (rr != tr) {
+                        log_error("ERROR: sub_group_broadcast(%s) mismatch for local id %d in sub group %d in group %d\n",
+                                   TypeName<Ty>::val(), i, j, k);
+                        return -1;
+                    }
+                }
+            }
+
+            x += nw;
+            y += nw;
+            m += 2*nw;
+        }
+
+        return 0;
+    }
+};
+
+// Independent forward progress stuff
+// Note:
+//   Output needs num_groups * NUM_LOC elements
+//   local_size must be > NUM_LOC
+//   Input needs num_groups * num_sub_groups * (NUM_LOC+1) elements
+
+static inline int
+inst(int op, int loc, int val)
+{
+    return (val << INST_VAL_SHIFT) | (loc << INST_LOC_SHIFT) | (op << INST_OP_SHIFT);
+}
+
+void gen_insts(cl_int *x, cl_int *p, int n)
+{
+    int i, j0, j1;
+    int val;
+    int ii[NUM_LOC];
+
+    // Create a random permutation of 0...NUM_LOC-1
+    ii[0] = 0;
+    for (i=1; i<NUM_LOC;++i) {
+        j0 = random_in_range(0, i, gMTdata);
+        if (j0 != i)
+            ii[i] = ii[j0];
+        ii[j0] = i;
+    }
+
+    // Initialize "instruction pointers"
+    memset(p, 0, n*4);
+
+    for (i=0; i<NUM_LOC; ++i) {
+        // Randomly choose 2 different sub groups
+        // One does a random amount of work, and the other waits for it
+        j0 = random_in_range(0, n-1, gMTdata);
+
+        do
+            j1 = random_in_range(0, n-1, gMTdata);
+        while (j1 == j0);
+
+        // Randomly choose a wait value and assign "instructions"
+        val = random_in_range(100, 200 + 10*NUM_LOC, gMTdata);
+        x[j0*(NUM_LOC+1) + p[j0]] = inst(INST_COUNT, ii[i], val);
+        x[j1*(NUM_LOC+1) + p[j1]] = inst(INST_WAIT,  ii[i], val);
+        ++p[j0];
+        ++p[j1];
+    }
+
+    // Last "inst" for each sub group is END
+    for (i=0; i<n; ++i)
+        x[i*(NUM_LOC+1) + p[i]] = inst(INST_END, 0, 0);
+}
+
+// Execute one group's "instructions"
+void run_insts(cl_int *x, cl_int *p, int n)
+{
+    int i, nend;
+    bool scont;
+    cl_int loc[NUM_LOC];
+
+    // Initialize result and "instruction pointers"
+    memset(loc, 0, sizeof(loc));
+    memset(p, 0, 4*n);
+
+    // Repetitively loop over subgroups with each executing "instructions" until blocked
+    // The loop terminates when all subgroups have hit the "END instruction"
+    do {
+        nend = 0;
+        for (i=0; i<n; ++i) {
+            do {
+                cl_int inst = x[i*(NUM_LOC+1) + p[i]];
+                cl_int iop = (inst >> INST_OP_SHIFT) & INST_OP_MASK;
+                cl_int iloc = (inst >> INST_LOC_SHIFT) & INST_LOC_MASK;
+                cl_int ival = (inst >> INST_VAL_SHIFT) & INST_VAL_MASK;
+                scont = false;
+
+                switch (iop) {
+                case INST_STORE:
+                    loc[iloc] = ival;
+                    ++p[i];
+                    scont = true;
+                    break;
+                case INST_WAIT:
+                    if (loc[iloc] == ival) {
+                        ++p[i];
+                        scont = true;
+                    }
+                    break;
+                case INST_COUNT:
+                    loc[iloc] += ival;
+                    ++p[i];
+                    scont = true;
+                    break;
+                case INST_END:
+                    ++nend;
+                    break;
+                }
+            } while (scont);
+        }
+    } while (nend < n);
+
+    // Return result, reusing "p"
+    memcpy(p, loc, sizeof(loc));
+}
+
+
+struct IFP {
+    static void gen(cl_int *x, cl_int *t, cl_int *, int ns, int nw, int ng)
+    {
+        int k;
+        int nj = (nw + ns - 1) / ns;
+
+        // We need at least 2 sub groups per group for this test
+        if (nj == 1)
+            return;
+
+        for (k=0; k<ng; ++k) {
+            gen_insts(x, t, nj);
+            x += nj * (NUM_LOC+1);
+        }
+    }
+
+    static int chk(cl_int *x, cl_int *y, cl_int *t, cl_int *, cl_int *, int ns, int nw, int ng)
+    {
+        int i, k;
+        int nj = (nw + ns - 1) / ns;
+
+        // We need at least 2 sub groups per group for this tes
+        if (nj == 1)
+            return 0;
+
+        log_info("  independent forward progress...\n");
+
+        for (k=0; k<ng; ++k) {
+            run_insts(x, t, nj);
+            for (i=0; i<NUM_LOC; ++i) {
+                if (t[i] != y[i]) {
+                    log_error("ERROR: mismatch at element %d in work group %d\n", i, k);
+                    return -1;
+                }
+            }
+            x += nj * (NUM_LOC+1);
+            y += NUM_LOC;
+        }
+
+        return 0;
+    }
+};
+
+
+// Entry point from main
+int
+test_work_group_functions(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+    int error;
+
+    // Adjust these individually below if desired/needed
+#define G 2000
+#define L 200
+
+    error = test<int, AA<0>, G, L>::run(device, context, queue, num_elements, "test_any", any_source);
+    error |= test<int, AA<1>, G, L>::run(device, context, queue, num_elements, "test_all", all_source);
+
+    // error |= test<cl_half, BC<cl_half>, G, L>::run(device, context, queue, num_elements, "test_bcast", bcast_source);
+    error |= test<cl_uint, BC<cl_uint>, G, L>::run(device, context, queue, num_elements, "test_bcast", bcast_source);
+    error |= test<cl_int, BC<cl_int>, G, L>::run(device, context, queue, num_elements, "test_bcast", bcast_source);
+    error |= test<cl_ulong, BC<cl_ulong>, G, L>::run(device, context, queue, num_elements, "test_bcast", bcast_source);
+    error |= test<cl_long, BC<cl_long>, G, L>::run(device, context, queue, num_elements, "test_bcast", bcast_source);
+    error |= test<float, BC<float>, G, L>::run(device, context, queue, num_elements, "test_bcast", bcast_source);
+    error |= test<double, BC<double>, G, L>::run(device, context, queue, num_elements, "test_bcast", bcast_source);
+
+    // error |= test<cl_half, RED<cl_half,0>, G, L>::run(device, context, queue, num_elements, "test_redadd", redadd_source);
+    error |= test<cl_uint, RED<cl_uint,0>, G, L>::run(device, context, queue, num_elements, "test_redadd", redadd_source);
+    error |= test<cl_int, RED<cl_int,0>, G, L>::run(device, context, queue, num_elements, "test_redadd", redadd_source);
+    error |= test<cl_ulong, RED<cl_ulong,0>, G, L>::run(device, context, queue, num_elements, "test_redadd", redadd_source);
+    error |= test<cl_long, RED<cl_long,0>, G, L>::run(device, context, queue, num_elements, "test_redadd", redadd_source);
+    error |= test<float, RED<float,0>, G, L>::run(device, context, queue, num_elements, "test_redadd", redadd_source);
+    error |= test<double, RED<double,0>, G, L>::run(device, context, queue, num_elements, "test_redadd", redadd_source);
+
+    // error |= test<cl_half, RED<cl_half,1>, G, L>::run(device, context, queue, num_elements, "test_redmax", redmax_source);
+    error |= test<cl_uint, RED<cl_uint,1>, G, L>::run(device, context, queue, num_elements, "test_redmax", redmax_source);
+    error |= test<cl_int, RED<cl_int,1>, G, L>::run(device, context, queue, num_elements, "test_redmax", redmax_source);
+    error |= test<cl_ulong, RED<cl_ulong,1>, G, L>::run(device, context, queue, num_elements, "test_redmax", redmax_source);
+    error |= test<cl_long, RED<cl_long,1>, G, L>::run(device, context, queue, num_elements, "test_redmax", redmax_source);
+    error |= test<float, RED<float,1>, G, L>::run(device, context, queue, num_elements, "test_redmax", redmax_source);
+    error |= test<double, RED<double,1>, G, L>::run(device, context, queue, num_elements, "test_redmax", redmax_source);
+
+    // error |= test<cl_half, RED<cl_half,2>, G, L>::run(device, context, queue, num_elements, "test_redmin", redmin_source);
+    error |= test<cl_uint, RED<cl_uint,2>, G, L>::run(device, context, queue, num_elements, "test_redmin", redmin_source);
+    error |= test<cl_int, RED<cl_int,2>, G, L>::run(device, context, queue, num_elements, "test_redmin", redmin_source);
+    error |= test<cl_ulong, RED<cl_ulong,2>, G, L>::run(device, context, queue, num_elements, "test_redmin", redmin_source);
+    error |= test<cl_long, RED<cl_long,2>, G, L>::run(device, context, queue, num_elements, "test_redmin", redmin_source);
+    error |= test<float, RED<float,2>, G, L>::run(device, context, queue, num_elements, "test_redmin", redmin_source);
+    error |= test<double, RED<double,2>, G, L>::run(device, context, queue, num_elements, "test_redmin", redmin_source);
+
+    // error |= test<cl_half, SCIN<cl_half,0>, G, L>::run(device, context, queue, num_elements, "test_scinadd", scinadd_source);
+    error |= test<cl_uint, SCIN<cl_uint,0>, G, L>::run(device, context, queue, num_elements, "test_scinadd", scinadd_source);
+    error |= test<cl_int, SCIN<cl_int,0>, G, L>::run(device, context, queue, num_elements, "test_scinadd", scinadd_source);
+    error |= test<cl_ulong, SCIN<cl_ulong,0>, G, L>::run(device, context, queue, num_elements, "test_scinadd", scinadd_source);
+    error |= test<cl_long, SCIN<cl_long,0>, G, L>::run(device, context, queue, num_elements, "test_scinadd", scinadd_source);
+    error |= test<float, SCIN<float,0>, G, L>::run(device, context, queue, num_elements, "test_scinadd", scinadd_source);
+    error |= test<double, SCIN<double,0>, G, L>::run(device, context, queue, num_elements, "test_scinadd", scinadd_source);
+
+    // error |= test<cl_half, SCIN<cl_half,1>, G, L>::run(device, context, queue, num_elements, "test_scinmax", scinmax_source);
+    error |= test<cl_uint, SCIN<cl_uint,1>, G, L>::run(device, context, queue, num_elements, "test_scinmax", scinmax_source);
+    error |= test<cl_int, SCIN<cl_int,1>, G, L>::run(device, context, queue, num_elements, "test_scinmax", scinmax_source);
+    error |= test<cl_ulong, SCIN<cl_ulong,1>, G, L>::run(device, context, queue, num_elements, "test_scinmax", scinmax_source);
+    error |= test<cl_long, SCIN<cl_long,1>, G, L>::run(device, context, queue, num_elements, "test_scinmax", scinmax_source);
+    error |= test<float, SCIN<float,1>, G, L>::run(device, context, queue, num_elements, "test_scinmax", scinmax_source);
+    error |= test<double, SCIN<double,1>, G, L>::run(device, context, queue, num_elements, "test_scinmax", scinmax_source);
+
+    // error |= test<cl_half, SCIN<cl_half,2>, G, L>::run(device, context, queue, num_elements, "test_scinmin", scinmin_source);
+    error |= test<cl_uint, SCIN<cl_uint,2>, G, L>::run(device, context, queue, num_elements, "test_scinmin", scinmin_source);
+    error |= test<cl_int, SCIN<cl_int,2>, G, L>::run(device, context, queue, num_elements, "test_scinmin", scinmin_source);
+    error |= test<cl_ulong, SCIN<cl_ulong,2>, G, L>::run(device, context, queue, num_elements, "test_scinmin", scinmin_source);
+    error |= test<cl_long, SCIN<cl_long,2>, G, L>::run(device, context, queue, num_elements, "test_scinmin", scinmin_source);
+    error |= test<float, SCIN<float,2>, G, L>::run(device, context, queue, num_elements, "test_scinmin", scinmin_source);
+    error |= test<double, SCIN<double,2>, G, L>::run(device, context, queue, num_elements, "test_scinmin", scinmin_source);
+
+    // error |= test<cl_half, SCEX<cl_half,0>, G, L>::run(device, context, queue, num_elements, "test_scexadd", scexadd_source);
+    error |= test<cl_uint, SCEX<cl_uint,0>, G, L>::run(device, context, queue, num_elements, "test_scexadd", scexadd_source);
+    error |= test<cl_int, SCEX<cl_int,0>, G, L>::run(device, context, queue, num_elements, "test_scexadd", scexadd_source);
+    error |= test<cl_ulong, SCEX<cl_ulong,0>, G, L>::run(device, context, queue, num_elements, "test_scexadd", scexadd_source);
+    error |= test<cl_long, SCEX<cl_long,0>, G, L>::run(device, context, queue, num_elements, "test_scexadd", scexadd_source);
+    error |= test<float, SCEX<float,0>, G, L>::run(device, context, queue, num_elements, "test_scexadd", scexadd_source);
+    error |= test<double, SCEX<double,0>, G, L>::run(device, context, queue, num_elements, "test_scexadd", scexadd_source);
+
+    // error |= test<cl_half, SCEX<cl_half,1>, G, L>::run(device, context, queue, num_elements, "test_scexmax", scexmax_source);
+    error |= test<cl_uint, SCEX<cl_uint,1>, G, L>::run(device, context, queue, num_elements, "test_scexmax", scexmax_source);
+    error |= test<cl_int, SCEX<cl_int,1>, G, L>::run(device, context, queue, num_elements, "test_scexmax", scexmax_source);
+    error |= test<cl_ulong, SCEX<cl_ulong,1>, G, L>::run(device, context, queue, num_elements, "test_scexmax", scexmax_source);
+    error |= test<cl_long, SCEX<cl_long,1>, G, L>::run(device, context, queue, num_elements, "test_scexmax", scexmax_source);
+    error |= test<float, SCEX<float,1>, G, L>::run(device, context, queue, num_elements, "test_scexmax", scexmax_source);
+    error |= test<double, SCEX<double,1>, G, L>::run(device, context, queue, num_elements, "test_scexmax", scexmax_source);
+
+    // error |= test<cl_half, SCEX<cl_half,2>, G, L>::run(device, context, queue, num_elements, "test_scexmin", scexmin_source);
+    error |= test<cl_uint, SCEX<cl_uint,2>, G, L>::run(device, context, queue, num_elements, "test_scexmin", scexmin_source);
+    error |= test<cl_int, SCEX<cl_int,2>, G, L>::run(device, context, queue, num_elements, "test_scexmin", scexmin_source);
+    error |= test<cl_ulong, SCEX<cl_ulong,2>, G, L>::run(device, context, queue, num_elements, "test_scexmin", scexmin_source);
+    error |= test<cl_long, SCEX<cl_long,2>, G, L>::run(device, context, queue, num_elements, "test_scexmin", scexmin_source);
+    error |= test<float, SCEX<float,2>, G, L>::run(device, context, queue, num_elements, "test_scexmin", scexmin_source);
+    error |= test<double, SCEX<double,2>, G, L>::run(device, context, queue, num_elements, "test_scexmin", scexmin_source);
+
+    error |= test<cl_int, IFP, G, L>::run(device, context, queue, num_elements, "test_ifp", ifp_source, NUM_LOC + 1);
+    return error;
+}
+
--- a/test_conformance/subgroups/test_workitem.cpp
+++ b/test_conformance/subgroups/test_workitem.cpp
@@ -0,0 +1,252 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+#include "../../test_common/harness/conversions.h"
+#include "../../test_common/harness/typeWrappers.h"
+
+struct get_test_data {
+    cl_uint subGroupSize;
+    cl_uint maxSubGroupSize;
+    cl_uint numSubGroups;
+    cl_uint enqNumSubGroups;
+    cl_uint subGroupId;
+    cl_uint subGroupLocalId;
+    bool operator==(get_test_data x) {
+        return subGroupSize == x.subGroupSize &&
+               maxSubGroupSize == x.maxSubGroupSize &&
+               numSubGroups == x.numSubGroups &&
+               subGroupId == x.subGroupId &&
+               subGroupLocalId == x.subGroupLocalId;
+    }
+};
+
+static const char * get_test_source =
+"#pragma OPENCL EXTENSION cl_khr_subgroups : enable\n"
+"\n"
+"typedef struct {\n"
+"    uint subGroupSize;\n"
+"    uint maxSubGroupSize;\n"
+"    uint numSubGroups;\n"
+"    uint enqNumSubGroups;\n"
+"    uint subGroupId;\n"
+"    uint subGroupLocalId;\n"
+"} get_test_data;\n"
+"\n"
+"__kernel void get_test( __global get_test_data *outData )\n"
+"{\n"
+"    int gid = get_global_id( 0 );\n"
+"    outData[gid].subGroupSize = get_sub_group_size();\n"
+"    outData[gid].maxSubGroupSize = get_max_sub_group_size();\n"
+"    outData[gid].numSubGroups = get_num_sub_groups();\n"
+"    outData[gid].enqNumSubGroups = get_enqueued_num_sub_groups();\n"
+"    outData[gid].subGroupId = get_sub_group_id();\n"
+"    outData[gid].subGroupLocalId = get_sub_group_local_id();\n"
+"}";
+
+static int
+check_group(const get_test_data *result, int nw, cl_uint ensg, int maxwgs)
+{
+    int first = -1;
+    int last = -1;
+    int i, j;
+    cl_uint hit[32];
+
+    for (i=0; i<nw; ++i) {
+        if (result[i].subGroupId == 0 && result[i].subGroupLocalId == 0)
+            first = i;
+        if (result[i].subGroupId == result[0].numSubGroups-1 && result[i].subGroupLocalId == 0)
+            last = i;
+        if (first != -1 && last != -1)
+            break;
+    }
+
+    if (first == -1 || last == -1) {
+        log_error("ERROR: expected sub group id's are missing\n");
+        return -1;
+    }
+
+    // Check them
+    if (result[first].subGroupSize == 0) {
+        log_error("ERROR: get_sub_group_size() returned 0\n");
+        return -1;
+    }
+    if (result[first].maxSubGroupSize == 0 || result[first].maxSubGroupSize > maxwgs) {
+        log_error("ERROR: get_max_subgroup_size() returned incorrect result: %u\n", result[first].maxSubGroupSize);
+        return -1;
+    }
+    if (result[first].subGroupSize > result[first].maxSubGroupSize) {
+        log_error("ERROR: get_sub_group_size() > get_max_sub_group_size()\n");
+        return -1;
+    }
+    if (result[last].subGroupSize > result[first].subGroupSize) {
+        log_error("ERROR: last sub group larger than first sub group\n");
+        return -1;
+    }
+    if (result[first].numSubGroups == 0 || result[first].numSubGroups > ensg) {
+        log_error("ERROR: get_num_sub_groups() returned incorrect result:  %u \n", result[first].numSubGroups);
+        return -1;
+    }
+
+    memset(hit, 0, sizeof(hit));
+    for (i=0; i<nw; ++i) {
+        if (result[i].maxSubGroupSize != result[first].maxSubGroupSize ||
+            result[i].numSubGroups != result[first].numSubGroups) {
+            log_error("ERROR: unexpected variation in get_*_sub_group_*()\n");
+            return -1;
+        }
+        if (result[i].subGroupId >= result[first].numSubGroups) {
+            log_error("ERROR: get_sub_group_id() returned out of range value: %u\n", result[i].subGroupId);
+            return -1;
+        }
+        if (result[i].enqNumSubGroups != ensg) {
+            log_error("ERROR: get_enqueued_num_sub_groups() returned incorrect value: %u\n", result[i].enqNumSubGroups);
+            return -1;
+        }
+        if (result[first].numSubGroups > 1) {
+            if (result[i].subGroupId < result[first].numSubGroups-1) {
+                if (result[i].subGroupSize != result[first].subGroupSize) {
+                    log_error("ERROR: unexpected variation in get_*_sub_group_*()\n");
+                    return -1;
+                }
+                if (result[i].subGroupLocalId >= result[first].subGroupSize) {
+                    log_error("ERROR: get_sub_group_local_id() returned out of bounds value: %u \n", result[i].subGroupLocalId);
+                    return -1;
+                }
+            } else {
+                if (result[i].subGroupSize != result[last].subGroupSize) {
+                    log_error("ERROR: unexpected variation in get_*_sub_group_*()\n");
+                    return -1;
+                }
+                if (result[i].subGroupLocalId >= result[last].subGroupSize) {
+                    log_error("ERROR: get_sub_group_local_id() returned out of bounds value: %u \n", result[i].subGroupLocalId);
+                    return -1;
+                }
+            }
+        } else {
+            if (result[i].subGroupSize != result[first].subGroupSize) {
+                log_error("ERROR: unexpected variation in get_*_sub_group_*()\n");
+                return -1;
+            }
+            if (result[i].subGroupLocalId >= result[first].subGroupSize) {
+                log_error("ERROR: get_sub_group_local_id() returned out of bounds value: %u \n", result[i].subGroupLocalId);
+                return -1;
+            }
+        }
+
+        j = (result[first].subGroupSize + 31)/32 * result[i].subGroupId + (result[i].subGroupLocalId >> 5);
+        if (j < sizeof(hit)/4) {
+            cl_uint b = 1U << (result[i].subGroupLocalId & 0x1fU);
+            if ((hit[j] & b) != 0) {
+                log_error("ERROR: get_sub_group_local_id() repeated a result in the same sub group\n");
+                return -1;
+            }
+            hit[j] |= b;
+        }
+    }
+
+    return 0;
+}
+
+int
+test_work_item_functions(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+    static const size_t lsize = 200;
+    int error;
+    int i, j, k, q, r, nw;
+    int maxwgs;
+    cl_uint ensg;
+    size_t global;
+    size_t local;
+    get_test_data result[lsize*6];
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper out;
+
+    error = create_single_kernel_helper_with_build_options(context, &program, &kernel, 1, &get_test_source, "get_test", "-cl-std=CL2.0");
+    if (error != 0)
+        return error;
+
+    error = get_max_allowed_work_group_size(context, kernel, &local, NULL);
+    if (error != 0)
+        return error;
+
+    maxwgs = (int)local;
+
+    // Limit it a bit so we have muliple work groups
+    // Ideally this will still be large enough to give us multiple subgroups
+    if (local > lsize)
+        local = lsize;
+
+    // Create our buffer
+    out = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(result), NULL, &error);
+    test_error(error, "clCreateBuffer failed");
+
+    // Set argument
+    error = clSetKernelArg(kernel, 0, sizeof(out), &out);
+    test_error(error, "clSetKernelArg failed");
+
+    global = local * 5;
+
+    // Make sure we have a flexible range
+    global += 3 * local / 4;
+
+    // Collect the data
+    memset((void *)&result, 0xf0, sizeof(result));
+
+    error = clEnqueueWriteBuffer(queue, out, CL_FALSE, 0, sizeof(result), (void *)&result, 0, NULL, NULL);
+    test_error(error, "clEnqueueWriteBuffer failed");
+
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+    test_error(error, "clEnqueueNDRangeKernel failed");
+
+    error = clEnqueueReadBuffer(queue, out, CL_FALSE, 0, sizeof(result), (void *)&result, 0, NULL, NULL);
+    test_error(error, "clEnqueueReadBuffer failed");
+
+    error = clFinish(queue);
+    test_error(error, "clFinish failed");
+
+    nw = (int)local;
+    ensg = result[0].enqNumSubGroups;
+
+    // Check the first group
+    error = check_group(result, nw, ensg, maxwgs);
+    if (error)
+        return error;
+
+    q = (int)global / nw;
+    r = (int)global % nw;
+
+    // Check the remaining work groups including the last if it is the same size
+    for (k=1; k<q; ++k) {
+        for (j=0; j<nw; ++j) {
+            i = k*nw + j;
+            if (!(result[i] == result[i-nw])) {
+                log_error("ERROR: sub group mapping is not identical for all work groups\n");
+                return -1;
+            }
+        }
+    }
+
+    // Check the last group if it wasn't the same size
+    if (r != 0) {
+        error = check_group(result + q*nw, r, ensg, maxwgs);
+        if (error)
+            return error;
+    }
+
+    return 0;
+}
+