New subgroups - full changes set (#1074)

* Extended subgroups - extended types

* Extended subgroups - non uniform vote tests

* Extended subgroups - non uniform arithmetic tests

* Extended subgroups - ballot tests

* Extended subgroups - clustered reduce tests

* Extended subgroups - shuffle tests

* Extended subgroups - formatting issues

* Extended subgroups - review fixes

* Extended subgroups - review fixes

Fixed: removed additional breaks, kernel_sstr

* Extended subgroups - fix macos build error

* Extended subgroups - review fixes

Fixed: mac os build error

* Extended subgroups - data type verification example

* Extended subgroups - error unification

* Extended subgroups - fix header years

* Extended subgroups - use is_half_nan

* Extended subgroups - compare half as float

* Review fixes mostly for ballot functions.

- Modify kernels for better handling active/inactive workitems
- Modify gen/chk functions for handling non uniform workgroup sizes
- Introduce new variables naming convention
- minor fixes

* Extended subgroups - simplification data generation for ballot lsb/msb functions

* Extended subgroups - minor fixes

* Extended subgroups - move common code to function

* Extended subgroups - formatting errors fix

* Extended subgroups - fix build error

* Extended subgroups - sub_group_elect more sophisticated

Define a mask which is a 4-byte pattern where a bit value of 1 means the work item is active.
If a work item in the subgroup matches the pattern, then it runs sub_group_elect()

* Extended subgroups - fix Ubuntu build error

* Extended subgroups - voting function review fixes

* adjust all function for using masks
* remove calculate templates
* merge code to one common template
* check results only in active workitems
* normalize values on host side
* minor fixes

* Extended subgroups - fix typos

* Set of fixes and improvements after review

* define WorkGroupParams to stop extended parameters list in function
* better work items mask handling (WorkGroupParams)
* narrow values of data input generation to avoid overflows (arithmetic func)
* implement work item masks for arithmetic functions
* enable half type testing for reduction/scan/broadcast
* minor fixes

* Extended subgroups - fix Linux issues

* Extended subgroups - fix sub_group_local_id data type

* Extended subgroups - use vector instead of array.

* Extended subgroups - change names to subgroup

* Extended subgroups - uncomment code, fix build

* Extended subgroups - build fix, use cl_half_from_float func

* Extended subgroups - remove is_half_nan

* Extended subgroups - do no use undef min/max

* Extended subgroups - use parenthesis, fix formatting
This commit is contained in:
Grzegorz Wawiorko
2021-04-06 18:25:48 +02:00
committed by GitHub
parent c5e4ca6c91
commit 71bef8563e
18 changed files with 5075 additions and 957 deletions

View File

@@ -5,8 +5,16 @@ set(${MODULE_NAME}_SOURCES
test_barrier.cpp test_barrier.cpp
test_queries.cpp test_queries.cpp
test_workitem.cpp test_workitem.cpp
test_workgroup.cpp test_subgroup.cpp
test_ifp.cpp test_ifp.cpp
test_subgroup_extended_types.cpp
subgroup_common_kernels.cpp
test_subgroup_non_uniform_vote.cpp
test_subgroup_non_uniform_arithmetic.cpp
test_subgroup_ballot.cpp
test_subgroup_clustered_reduce.cpp
test_subgroup_shuffle.cpp
test_subgroup_shuffle_relative.cpp
) )
include(../CMakeCommon.txt) include(../CMakeCommon.txt)

View File

@@ -27,12 +27,19 @@ test_definition test_list[] = {
ADD_TEST_VERSION(sub_group_info_core, Version(2, 1)), ADD_TEST_VERSION(sub_group_info_core, Version(2, 1)),
ADD_TEST_VERSION(work_item_functions_ext, Version(2, 0)), ADD_TEST_VERSION(work_item_functions_ext, Version(2, 0)),
ADD_TEST_VERSION(work_item_functions_core, Version(2, 1)), ADD_TEST_VERSION(work_item_functions_core, Version(2, 1)),
ADD_TEST_VERSION(work_group_functions_ext, Version(2, 0)), ADD_TEST_VERSION(subgroup_functions_ext, Version(2, 0)),
ADD_TEST_VERSION(work_group_functions_core, Version(2, 1)), ADD_TEST_VERSION(subgroup_functions_core, Version(2, 1)),
ADD_TEST_VERSION(barrier_functions_ext, Version(2, 0)), ADD_TEST_VERSION(barrier_functions_ext, Version(2, 0)),
ADD_TEST_VERSION(barrier_functions_core, Version(2, 1)), ADD_TEST_VERSION(barrier_functions_core, Version(2, 1)),
ADD_TEST_VERSION(ifp_ext, Version(2, 0)), ADD_TEST_VERSION(ifp_ext, Version(2, 0)),
ADD_TEST_VERSION(ifp_core, Version(2, 1)) ADD_TEST_VERSION(ifp_core, Version(2, 1)),
ADD_TEST(subgroup_functions_extended_types),
ADD_TEST(subgroup_functions_non_uniform_vote),
ADD_TEST(subgroup_functions_non_uniform_arithmetic),
ADD_TEST(subgroup_functions_ballot),
ADD_TEST(subgroup_functions_clustered_reduce),
ADD_TEST(subgroup_functions_shuffle),
ADD_TEST(subgroup_functions_shuffle_relative)
}; };
const int test_num = ARRAY_SIZE(test_list); const int test_num = ARRAY_SIZE(test_list);

View File

@@ -37,14 +37,12 @@ extern int test_work_item_functions_core(cl_device_id device,
cl_context context, cl_context context,
cl_command_queue queue, cl_command_queue queue,
int num_elements); int num_elements);
extern int test_work_group_functions_ext(cl_device_id device, extern int test_subgroup_functions_ext(cl_device_id device, cl_context context,
cl_context context, cl_command_queue queue,
cl_command_queue queue, int num_elements);
int num_elements); extern int test_subgroup_functions_core(cl_device_id device, cl_context context,
extern int test_work_group_functions_core(cl_device_id device, cl_command_queue queue,
cl_context context, int num_elements);
cl_command_queue queue,
int num_elements);
extern int test_barrier_functions_ext(cl_device_id device, cl_context context, extern int test_barrier_functions_ext(cl_device_id device, cl_context context,
cl_command_queue queue, int num_elements); cl_command_queue queue, int num_elements);
extern int test_barrier_functions_core(cl_device_id device, cl_context context, extern int test_barrier_functions_core(cl_device_id device, cl_context context,
@@ -56,5 +54,31 @@ extern int test_ifp_ext(cl_device_id device, cl_context context,
cl_command_queue queue, int num_elements); cl_command_queue queue, int num_elements);
extern int test_ifp_core(cl_device_id device, cl_context context, extern int test_ifp_core(cl_device_id device, cl_context context,
cl_command_queue queue, int num_elements); cl_command_queue queue, int num_elements);
extern int test_subgroup_functions_extended_types(cl_device_id device,
cl_context context,
cl_command_queue queue,
int num_elements);
extern int test_subgroup_functions_non_uniform_vote(cl_device_id device,
cl_context context,
cl_command_queue queue,
int num_elements);
extern int test_subgroup_functions_non_uniform_arithmetic(
cl_device_id device, cl_context context, cl_command_queue queue,
int num_elements);
extern int test_subgroup_functions_ballot(cl_device_id device,
cl_context context,
cl_command_queue queue,
int num_elements);
extern int test_subgroup_functions_clustered_reduce(cl_device_id device,
cl_context context,
cl_command_queue queue,
int num_elements);
extern int test_subgroup_functions_shuffle(cl_device_id device,
cl_context context,
cl_command_queue queue,
int num_elements);
extern int test_subgroup_functions_shuffle_relative(cl_device_id device,
cl_context context,
cl_command_queue queue,
int num_elements);
#endif /*_procs_h*/ #endif /*_procs_h*/

View File

@@ -0,0 +1,106 @@
//
// Copyright (c) 2021 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "subgroup_common_kernels.h"
const char* bcast_source =
"__kernel void test_bcast(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" Type x = in[gid];\n"
" uint which_sub_group_local_id = xy[gid].z;\n"
" out[gid] = sub_group_broadcast(x, which_sub_group_local_id);\n"
"}\n";
const char* redadd_source = "__kernel void test_redadd(const __global Type "
"*in, __global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_reduce_add(in[gid]);\n"
"}\n";
const char* redmax_source = "__kernel void test_redmax(const __global Type "
"*in, __global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_reduce_max(in[gid]);\n"
"}\n";
const char* redmin_source = "__kernel void test_redmin(const __global Type "
"*in, __global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_reduce_min(in[gid]);\n"
"}\n";
const char* scinadd_source =
"__kernel void test_scinadd(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_inclusive_add(in[gid]);\n"
"}\n";
const char* scinmax_source =
"__kernel void test_scinmax(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_inclusive_max(in[gid]);\n"
"}\n";
const char* scinmin_source =
"__kernel void test_scinmin(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_inclusive_min(in[gid]);\n"
"}\n";
const char* scexadd_source =
"__kernel void test_scexadd(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_exclusive_add(in[gid]);\n"
"}\n";
const char* scexmax_source =
"__kernel void test_scexmax(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_exclusive_max(in[gid]);\n"
"}\n";
const char* scexmin_source =
"__kernel void test_scexmin(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_exclusive_min(in[gid]);\n"
"}\n";

View File

@@ -0,0 +1,32 @@
//
// Copyright (c) 2021 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef SUBGROUPKERNELSOURCES_H
#define SUBGROUPKERNELSOURCES_H
#include "subhelpers.h"
extern const char* bcast_source;
extern const char* redadd_source;
extern const char* redmax_source;
extern const char* redmin_source;
extern const char* scinadd_source;
extern const char* scinmax_source;
extern const char* scinmin_source;
extern const char* scexadd_source;
extern const char* scexmax_source;
extern const char* scexmin_source;
#endif

View File

@@ -0,0 +1,911 @@
//
// Copyright (c) 2020 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef SUBGROUPCOMMONTEMPLATES_H
#define SUBGROUPCOMMONTEMPLATES_H
#include "typeWrappers.h"
#include <bitset>
#include "CL/cl_half.h"
#include "subhelpers.h"
#include <set>
typedef std::bitset<128> bs128;
static cl_uint4 generate_bit_mask(cl_uint subgroup_local_id,
const std::string &mask_type,
cl_uint max_sub_group_size)
{
bs128 mask128;
cl_uint4 mask;
cl_uint pos = subgroup_local_id;
if (mask_type == "eq") mask128.set(pos);
if (mask_type == "le" || mask_type == "lt")
{
for (cl_uint i = 0; i <= pos; i++) mask128.set(i);
if (mask_type == "lt") mask128.reset(pos);
}
if (mask_type == "ge" || mask_type == "gt")
{
for (cl_uint i = pos; i < max_sub_group_size; i++) mask128.set(i);
if (mask_type == "gt") mask128.reset(pos);
}
// convert std::bitset<128> to uint4
auto const uint_mask = bs128{ static_cast<unsigned long>(-1) };
mask.s0 = (mask128 & uint_mask).to_ulong();
mask128 >>= 32;
mask.s1 = (mask128 & uint_mask).to_ulong();
mask128 >>= 32;
mask.s2 = (mask128 & uint_mask).to_ulong();
mask128 >>= 32;
mask.s3 = (mask128 & uint_mask).to_ulong();
return mask;
}
// DESCRIPTION :
// sub_group_broadcast - each work_item registers it's own value.
// All work_items in subgroup takes one value from only one (any) work_item
// sub_group_broadcast_first - same as type 0. All work_items in
// subgroup takes only one value from only one chosen (the smallest subgroup ID)
// work_item
// sub_group_non_uniform_broadcast - same as type 0 but
// only 4 work_items from subgroup enter the code (are active)
template <typename Ty, SubgroupsBroadcastOp operation> struct BC
{
static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
{
int i, ii, j, k, n;
int ng = test_params.global_workgroup_size;
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int nj = (nw + ns - 1) / ns;
int d = ns > 100 ? 100 : ns;
int non_uniform_size = ng % nw;
ng = ng / nw;
int last_subgroup_size = 0;
ii = 0;
log_info(" sub_group_%s(%s)...\n", operation_names(operation),
TypeManager<Ty>::name());
if (non_uniform_size)
{
log_info(" non uniform work group size mode ON\n");
ng++;
}
for (k = 0; k < ng; ++k)
{ // for each work_group
if (non_uniform_size && k == ng - 1)
{
set_last_workgroup_params(non_uniform_size, nj, ns, nw,
last_subgroup_size);
}
for (j = 0; j < nj; ++j)
{ // for each subgroup
ii = j * ns;
if (last_subgroup_size && j == nj - 1)
{
n = last_subgroup_size;
}
else
{
n = ii + ns > nw ? nw - ii : ns;
}
int bcast_if = 0;
int bcast_elseif = 0;
int bcast_index = (int)(genrand_int32(gMTdata) & 0x7fffffff)
% (d > n ? n : d);
// l - calculate subgroup local id from which value will be
// broadcasted (one the same value for whole subgroup)
if (operation != SubgroupsBroadcastOp::broadcast)
{
// reduce brodcasting index in case of non_uniform and
// last workgroup last subgroup
if (last_subgroup_size && j == nj - 1
&& last_subgroup_size < NR_OF_ACTIVE_WORK_ITEMS)
{
bcast_if = bcast_index % last_subgroup_size;
bcast_elseif = bcast_if;
}
else
{
bcast_if = bcast_index % NR_OF_ACTIVE_WORK_ITEMS;
bcast_elseif = NR_OF_ACTIVE_WORK_ITEMS
+ bcast_index % (n - NR_OF_ACTIVE_WORK_ITEMS);
}
}
for (i = 0; i < n; ++i)
{
if (operation == SubgroupsBroadcastOp::broadcast)
{
int midx = 4 * ii + 4 * i + 2;
m[midx] = (cl_int)bcast_index;
}
else
{
if (i < NR_OF_ACTIVE_WORK_ITEMS)
{
// index of the third
// element int the vector.
int midx = 4 * ii + 4 * i + 2;
// storing information about
// broadcasting index -
// earlier calculated
m[midx] = (cl_int)bcast_if;
}
else
{ // index of the third
// element int the vector.
int midx = 4 * ii + 4 * i + 3;
m[midx] = (cl_int)bcast_elseif;
}
}
// calculate value for broadcasting
cl_ulong number = genrand_int64(gMTdata);
set_value(t[ii + i], number);
}
}
// Now map into work group using map from device
for (j = 0; j < nw; ++j)
{ // for each element in work_group
// calculate index as number of subgroup
// plus subgroup local id
x[j] = t[j];
}
x += nw;
m += 4 * nw;
}
}
static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
const WorkGroupParams &test_params)
{
int ii, i, j, k, l, n;
int ng = test_params.global_workgroup_size;
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int nj = (nw + ns - 1) / ns;
Ty tr, rr;
int non_uniform_size = ng % nw;
ng = ng / nw;
int last_subgroup_size = 0;
if (non_uniform_size) ng++;
for (k = 0; k < ng; ++k)
{ // for each work_group
if (non_uniform_size && k == ng - 1)
{
set_last_workgroup_params(non_uniform_size, nj, ns, nw,
last_subgroup_size);
}
for (j = 0; j < nw; ++j)
{ // inside the work_group
mx[j] = x[j]; // read host inputs for work_group
my[j] = y[j]; // read device outputs for work_group
}
for (j = 0; j < nj; ++j)
{ // for each subgroup
ii = j * ns;
if (last_subgroup_size && j == nj - 1)
{
n = last_subgroup_size;
}
else
{
n = ii + ns > nw ? nw - ii : ns;
}
// Check result
if (operation == SubgroupsBroadcastOp::broadcast_first)
{
int lowest_active_id = -1;
for (i = 0; i < n; ++i)
{
lowest_active_id = i < NR_OF_ACTIVE_WORK_ITEMS
? 0
: NR_OF_ACTIVE_WORK_ITEMS;
// findout if broadcasted
// value is the same
tr = mx[ii + lowest_active_id];
// findout if broadcasted to all
rr = my[ii + i];
if (!compare(rr, tr))
{
log_error(
"ERROR: sub_group_broadcast_first(%s) "
"mismatch "
"for local id %d in sub group %d in group "
"%d\n",
TypeManager<Ty>::name(), i, j, k);
return TEST_FAIL;
}
}
}
else
{
for (i = 0; i < n; ++i)
{
if (operation == SubgroupsBroadcastOp::broadcast)
{
int midx = 4 * ii + 4 * i + 2;
l = (int)m[midx];
tr = mx[ii + l];
}
else
{
if (i < NR_OF_ACTIVE_WORK_ITEMS)
{ // take index of array where info
// which work_item will be
// broadcast its value is stored
int midx = 4 * ii + 4 * i + 2;
// take subgroup local id of
// this work_item
l = (int)m[midx];
// take value generated on host
// for this work_item
tr = mx[ii + l];
}
else
{
int midx = 4 * ii + 4 * i + 3;
l = (int)m[midx];
tr = mx[ii + l];
}
}
rr = my[ii + i]; // read device outputs for
// work_item in the subgroup
if (!compare(rr, tr))
{
log_error("ERROR: sub_group_%s(%s) "
"mismatch for local id %d in sub "
"group %d in group %d - got %lu "
"expected %lu\n",
operation_names(operation),
TypeManager<Ty>::name(), i, j, k, rr, tr);
return TEST_FAIL;
}
}
}
}
x += nw;
y += nw;
m += 4 * nw;
}
log_info(" sub_group_%s(%s)... passed\n", operation_names(operation),
TypeManager<Ty>::name());
return TEST_PASS;
}
};
static float to_float(subgroups::cl_half x) { return cl_half_to_float(x.data); }
static subgroups::cl_half to_half(float x)
{
subgroups::cl_half value;
value.data = cl_half_from_float(x, CL_HALF_RTE);
return value;
}
// for integer types
template <typename Ty> inline Ty calculate(Ty a, Ty b, ArithmeticOp operation)
{
switch (operation)
{
case ArithmeticOp::add_: return a + b;
case ArithmeticOp::max_: return a > b ? a : b;
case ArithmeticOp::min_: return a < b ? a : b;
case ArithmeticOp::mul_: return a * b;
case ArithmeticOp::and_: return a & b;
case ArithmeticOp::or_: return a | b;
case ArithmeticOp::xor_: return a ^ b;
case ArithmeticOp::logical_and: return a && b;
case ArithmeticOp::logical_or: return a || b;
case ArithmeticOp::logical_xor: return !a ^ !b;
default: log_error("Unknown operation request"); break;
}
return 0;
}
// Specialize for floating points.
template <>
inline cl_double calculate(cl_double a, cl_double b, ArithmeticOp operation)
{
switch (operation)
{
case ArithmeticOp::add_: {
return a + b;
}
case ArithmeticOp::max_: {
return a > b ? a : b;
}
case ArithmeticOp::min_: {
return a < b ? a : b;
}
case ArithmeticOp::mul_: {
return a * b;
}
default: log_error("Unknown operation request"); break;
}
return 0;
}
template <>
inline cl_float calculate(cl_float a, cl_float b, ArithmeticOp operation)
{
switch (operation)
{
case ArithmeticOp::add_: {
return a + b;
}
case ArithmeticOp::max_: {
return a > b ? a : b;
}
case ArithmeticOp::min_: {
return a < b ? a : b;
}
case ArithmeticOp::mul_: {
return a * b;
}
default: log_error("Unknown operation request"); break;
}
return 0;
}
template <>
inline subgroups::cl_half calculate(subgroups::cl_half a, subgroups::cl_half b,
ArithmeticOp operation)
{
switch (operation)
{
case ArithmeticOp::add_: return to_half(to_float(a) + to_float(b));
case ArithmeticOp::max_:
return to_float(a) > to_float(b) || is_half_nan(b.data) ? a : b;
case ArithmeticOp::min_:
return to_float(a) < to_float(b) || is_half_nan(b.data) ? a : b;
case ArithmeticOp::mul_: return to_half(to_float(a) * to_float(b));
default: log_error("Unknown operation request"); break;
}
return to_half(0);
}
template <typename Ty> bool is_floating_point()
{
return std::is_floating_point<Ty>::value
|| std::is_same<Ty, subgroups::cl_half>::value;
}
template <typename Ty, ArithmeticOp operation>
void genrand(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
{
int nj = (nw + ns - 1) / ns;
for (int k = 0; k < ng; ++k)
{
for (int j = 0; j < nj; ++j)
{
int ii = j * ns;
int n = ii + ns > nw ? nw - ii : ns;
for (int i = 0; i < n; ++i)
{
cl_ulong out_value;
double y;
if (operation == ArithmeticOp::mul_
|| operation == ArithmeticOp::add_)
{
// work around to avoid overflow, do not use 0 for
// multiplication
out_value = (genrand_int32(gMTdata) % 4) + 1;
}
else
{
out_value = genrand_int64(gMTdata) % (32 * n);
if ((operation == ArithmeticOp::logical_and
|| operation == ArithmeticOp::logical_or
|| operation == ArithmeticOp::logical_xor)
&& ((out_value >> 32) & 1) == 0)
out_value = 0; // increase probability of false
}
set_value(t[ii + i], out_value);
}
}
// Now map into work group using map from device
for (int j = 0; j < nw; ++j)
{
x[j] = t[j];
}
x += nw;
m += 4 * nw;
}
}
template <typename Ty, ShuffleOp operation> struct SHF
{
static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
{
int i, ii, j, k, l, n, delta;
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
int nj = (nw + ns - 1) / ns;
int d = ns > 100 ? 100 : ns;
ii = 0;
ng = ng / nw;
log_info(" sub_group_%s(%s)...\n", operation_names(operation),
TypeManager<Ty>::name());
for (k = 0; k < ng; ++k)
{ // for each work_group
for (j = 0; j < nj; ++j)
{ // for each subgroup
ii = j * ns;
n = ii + ns > nw ? nw - ii : ns;
for (i = 0; i < n; ++i)
{
int midx = 4 * ii + 4 * i + 2;
l = (int)(genrand_int32(gMTdata) & 0x7fffffff)
% (d > n ? n : d);
switch (operation)
{
case ShuffleOp::shuffle:
case ShuffleOp::shuffle_xor:
// storing information about shuffle index
m[midx] = (cl_int)l;
break;
case ShuffleOp::shuffle_up:
delta = l; // calculate delta for shuffle up
if (i - delta < 0)
{
delta = i;
}
m[midx] = (cl_int)delta;
break;
case ShuffleOp::shuffle_down:
delta = l; // calculate delta for shuffle down
if (i + delta >= n)
{
delta = n - 1 - i;
}
m[midx] = (cl_int)delta;
break;
default: break;
}
cl_ulong number = genrand_int64(gMTdata);
set_value(t[ii + i], number);
}
}
// Now map into work group using map from device
for (j = 0; j < nw; ++j)
{ // for each element in work_group
x[j] = t[j];
}
x += nw;
m += 4 * nw;
}
}
static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
const WorkGroupParams &test_params)
{
int ii, i, j, k, l, n;
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
int nj = (nw + ns - 1) / ns;
Ty tr, rr;
ng = ng / nw;
for (k = 0; k < ng; ++k)
{ // for each work_group
for (j = 0; j < nw; ++j)
{ // inside the work_group
mx[j] = x[j]; // read host inputs for work_group
my[j] = y[j]; // read device outputs for work_group
}
for (j = 0; j < nj; ++j)
{ // for each subgroup
ii = j * ns;
n = ii + ns > nw ? nw - ii : ns;
for (i = 0; i < n; ++i)
{ // inside the subgroup
// shuffle index storage
int midx = 4 * ii + 4 * i + 2;
l = (int)m[midx];
rr = my[ii + i];
switch (operation)
{
// shuffle basic - treat l as index
case ShuffleOp::shuffle: tr = mx[ii + l]; break;
// shuffle up - treat l as delta
case ShuffleOp::shuffle_up: tr = mx[ii + i - l]; break;
// shuffle up - treat l as delta
case ShuffleOp::shuffle_down:
tr = mx[ii + i + l];
break;
// shuffle xor - treat l as mask
case ShuffleOp::shuffle_xor:
tr = mx[ii + (i ^ l)];
break;
default: break;
}
if (!compare(rr, tr))
{
log_error("ERROR: sub_group_%s(%s) mismatch for "
"local id %d in sub group %d in group %d\n",
operation_names(operation),
TypeManager<Ty>::name(), i, j, k);
return TEST_FAIL;
}
}
}
x += nw;
y += nw;
m += 4 * nw;
}
log_info(" sub_group_%s(%s)... passed\n", operation_names(operation),
TypeManager<Ty>::name());
return TEST_PASS;
}
};
template <typename Ty, ArithmeticOp operation> struct SCEX_NU
{
static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
{
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask;
ng = ng / nw;
std::string func_name;
work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive"
: func_name = "sub_group_scan_exclusive";
log_info(" %s_%s(%s)...\n", func_name.c_str(),
operation_names(operation), TypeManager<Ty>::name());
log_info(" test params: global size = %d local size = %d subgroups "
"size = %d work item mask = 0x%x \n",
test_params.global_workgroup_size, nw, ns, work_items_mask);
genrand<Ty, operation>(x, t, m, ns, nw, ng);
}
static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
const WorkGroupParams &test_params)
{
int ii, i, j, k, n;
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask;
int nj = (nw + ns - 1) / ns;
Ty tr, rr;
ng = ng / nw;
std::string func_name;
work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive"
: func_name = "sub_group_scan_exclusive";
uint32_t use_work_items_mask;
// for uniform case take into consideration all workitems
use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask;
for (k = 0; k < ng; ++k)
{ // for each work_group
// Map to array indexed to array indexed by local ID and sub group
for (j = 0; j < nw; ++j)
{ // inside the work_group
mx[j] = x[j]; // read host inputs for work_group
my[j] = y[j]; // read device outputs for work_group
}
for (j = 0; j < nj; ++j)
{
ii = j * ns;
n = ii + ns > nw ? nw - ii : ns;
std::set<int> active_work_items;
for (i = 0; i < n; ++i)
{
uint32_t check_work_item = 1 << (i % 32);
if (use_work_items_mask & check_work_item)
{
active_work_items.insert(i);
}
}
if (active_work_items.empty())
{
log_info(" No acitve workitems in workgroup id = %d "
"subgroup id = %d - no calculation\n",
k, j);
continue;
}
else if (active_work_items.size() == 1)
{
log_info(" One active workitem in workgroup id = %d "
"subgroup id = %d - no calculation\n",
k, j);
continue;
}
else
{
tr = TypeManager<Ty>::identify_limits(operation);
int idx = 0;
for (const int &active_work_item : active_work_items)
{
rr = my[ii + active_work_item];
if (idx == 0) continue;
if (!compare_ordered(rr, tr))
{
log_error(
"ERROR: %s_%s(%s) "
"mismatch for local id %d in sub group %d in "
"group %d Expected: %d Obtained: %d\n",
func_name.c_str(), operation_names(operation),
TypeManager<Ty>::name(), i, j, k, tr, rr);
return TEST_FAIL;
}
tr = calculate<Ty>(tr, mx[ii + active_work_item],
operation);
idx++;
}
}
}
x += nw;
y += nw;
m += 4 * nw;
}
log_info(" %s_%s(%s)... passed\n", func_name.c_str(),
operation_names(operation), TypeManager<Ty>::name());
return TEST_PASS;
}
};
// Test for scan inclusive non uniform functions
template <typename Ty, ArithmeticOp operation> struct SCIN_NU
{
static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
{
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask;
ng = ng / nw;
std::string func_name;
work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive"
: func_name = "sub_group_scan_inclusive";
genrand<Ty, operation>(x, t, m, ns, nw, ng);
log_info(" %s_%s(%s)...\n", func_name.c_str(),
operation_names(operation), TypeManager<Ty>::name());
log_info(" test params: global size = %d local size = %d subgroups "
"size = %d work item mask = 0x%x \n",
test_params.global_workgroup_size, nw, ns, work_items_mask);
}
static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
const WorkGroupParams &test_params)
{
int ii, i, j, k, n;
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask;
int nj = (nw + ns - 1) / ns;
Ty tr, rr;
ng = ng / nw;
std::string func_name;
work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive"
: func_name = "sub_group_scan_inclusive";
uint32_t use_work_items_mask;
// for uniform case take into consideration all workitems
use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask;
// std::bitset<32> mask32(use_work_items_mask);
// for (int k) mask32.count();
for (k = 0; k < ng; ++k)
{ // for each work_group
// Map to array indexed to array indexed by local ID and sub group
for (j = 0; j < nw; ++j)
{ // inside the work_group
mx[j] = x[j]; // read host inputs for work_group
my[j] = y[j]; // read device outputs for work_group
}
for (j = 0; j < nj; ++j)
{
ii = j * ns;
n = ii + ns > nw ? nw - ii : ns;
std::set<int> active_work_items;
int catch_frist_active = -1;
for (i = 0; i < n; ++i)
{
uint32_t check_work_item = 1 << (i % 32);
if (use_work_items_mask & check_work_item)
{
if (catch_frist_active == -1)
{
catch_frist_active = i;
}
active_work_items.insert(i);
}
}
if (active_work_items.empty())
{
log_info(" No acitve workitems in workgroup id = %d "
"subgroup id = %d - no calculation\n",
k, j);
continue;
}
else
{
tr = TypeManager<Ty>::identify_limits(operation);
for (const int &active_work_item : active_work_items)
{
rr = my[ii + active_work_item];
if (active_work_items.size() == 1)
{
tr = mx[ii + catch_frist_active];
}
else
{
tr = calculate<Ty>(tr, mx[ii + active_work_item],
operation);
}
if (!compare_ordered<Ty>(rr, tr))
{
log_error(
"ERROR: %s_%s(%s) "
"mismatch for local id %d in sub group %d "
"in "
"group %d Expected: %d Obtained: %d\n",
func_name.c_str(), operation_names(operation),
TypeManager<Ty>::name(), active_work_item, j, k,
tr, rr);
return TEST_FAIL;
}
}
}
}
x += nw;
y += nw;
m += 4 * nw;
}
log_info(" %s_%s(%s)... passed\n", func_name.c_str(),
operation_names(operation), TypeManager<Ty>::name());
return TEST_PASS;
}
};
// Test for reduce non uniform functions.
// gen() fills the input buffer with random data; chk() recomputes the
// reduction on the host over the work items selected by the work-item mask
// and verifies that every active work item received that value.
template <typename Ty, ArithmeticOp operation> struct RED_NU
{
    // Generate random input. x = device input, t = staging buffer,
    // m = work-item map buffer (4 ints per work item).
    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
    {
        int nw = test_params.local_workgroup_size;
        int ns = test_params.subgroup_size;
        int ng = test_params.global_workgroup_size;
        uint32_t work_items_mask = test_params.work_items_mask;
        ng = ng / nw; // number of work groups
        // A non-zero mask selects the non-uniform variant of the built-in.
        std::string func_name = work_items_mask
            ? "sub_group_non_uniform_reduce"
            : "sub_group_reduce";
        log_info(" %s_%s(%s)...\n", func_name.c_str(),
                 operation_names(operation), TypeManager<Ty>::name());
        log_info(" test params: global size = %d local size = %d subgroups "
                 "size = %d work item mask = 0x%x \n",
                 test_params.global_workgroup_size, nw, ns, work_items_mask);
        genrand<Ty, operation>(x, t, m, ns, nw, ng);
    }
    // Verify device output y against a host-side reference reduction of x.
    // mx/my are host scratch buffers of one work group each.
    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
                   const WorkGroupParams &test_params)
    {
        int nw = test_params.local_workgroup_size;
        int ns = test_params.subgroup_size;
        int ng = test_params.global_workgroup_size;
        uint32_t work_items_mask = test_params.work_items_mask;
        int nj = (nw + ns - 1) / ns; // subgroups per work group
        ng = ng / nw;                // number of work groups
        Ty tr, rr;
        std::string func_name = work_items_mask
            ? "sub_group_non_uniform_reduce"
            : "sub_group_reduce";
        // Mask of zero means the uniform built-in: all work items active.
        // (Loop-invariant, so computed once instead of per work group.)
        uint32_t use_work_items_mask =
            !work_items_mask ? 0xFFFFFFFF : work_items_mask;
        for (int k = 0; k < ng; ++k)
        {
            // Map to array indexed to array indexed by local ID and sub
            // group
            for (int j = 0; j < nw; ++j)
            {
                mx[j] = x[j];
                my[j] = y[j];
            }
            for (int j = 0; j < nj; ++j)
            {
                int ii = j * ns;
                // Last subgroup of the work group may be partial.
                int n = ii + ns > nw ? nw - ii : ns;
                std::set<int> active_work_items;
                int first_active = -1;
                for (int i = 0; i < n; ++i)
                {
                    uint32_t check_work_item = 1 << (i % 32);
                    if (use_work_items_mask & check_work_item)
                    {
                        active_work_items.insert(i);
                        if (first_active == -1)
                        {
                            // First active item seeds the accumulator.
                            first_active = i;
                            tr = mx[ii + i];
                        }
                        else
                        {
                            tr = calculate<Ty>(tr, mx[ii + i], operation);
                        }
                    }
                }
                if (active_work_items.empty())
                {
                    log_info(" No active workitems in workgroup id = %d "
                             "subgroup id = %d - no calculation\n",
                             k, j);
                    continue;
                }
                // Every active work item must hold the same reduced value.
                for (const int &active_work_item : active_work_items)
                {
                    rr = my[ii + active_work_item];
                    if (!compare_ordered<Ty>(rr, tr))
                    {
                        // NOTE(review): %d assumes an integral Ty; the value
                        // columns may print garbage for float/half types.
                        log_error("ERROR: %s_%s(%s) "
                                  "mismatch for local id %d in sub group %d in "
                                  "group %d Expected: %d Obtained: %d\n",
                                  func_name.c_str(), operation_names(operation),
                                  TypeManager<Ty>::name(), active_work_item, j,
                                  k, tr, rr);
                        return TEST_FAIL;
                    }
                }
            }
            x += nw;
            y += nw;
            m += 4 * nw;
        }
        log_info(" %s_%s(%s)... passed\n", func_name.c_str(),
                 operation_names(operation), TypeManager<Ty>::name());
        return TEST_PASS;
    }
};
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -59,10 +59,15 @@ static const char *gbar_source =
// barrier test functions // barrier test functions
template <int Which> struct BAR template <int Which> struct BAR
{ {
static void gen(cl_int *x, cl_int *t, cl_int *m, int ns, int nw, int ng) static void gen(cl_int *x, cl_int *t, cl_int *m,
const WorkGroupParams &test_params)
{ {
int i, ii, j, k, n; int i, ii, j, k, n;
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
int nj = (nw + ns - 1) / ns; int nj = (nw + ns - 1) / ns;
ng = ng / nw;
int e; int e;
ii = 0; ii = 0;
@@ -79,8 +84,7 @@ template <int Which> struct BAR
// Now map into work group using map from device // Now map into work group using map from device
for (j = 0; j < nw; ++j) for (j = 0; j < nw; ++j)
{ {
i = m[2 * j + 1] * ns + m[2 * j]; x[j] = t[j];
x[j] = t[i];
} }
x += nw; x += nw;
@@ -89,10 +93,14 @@ template <int Which> struct BAR
} }
static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m, static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m,
int ns, int nw, int ng) const WorkGroupParams &test_params)
{ {
int ii, i, j, k, n; int ii, i, j, k, n;
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
int nj = (nw + ns - 1) / ns; int nj = (nw + ns - 1) / ns;
ng = ng / nw;
cl_int tr, rr; cl_int tr, rr;
if (Which == 0) if (Which == 0)
@@ -105,9 +113,8 @@ template <int Which> struct BAR
// Map to array indexed to array indexed by local ID and sub group // Map to array indexed to array indexed by local ID and sub group
for (j = 0; j < nw; ++j) for (j = 0; j < nw; ++j)
{ {
i = m[2 * j + 1] * ns + m[2 * j]; mx[j] = x[j];
mx[i] = x[j]; my[j] = y[j];
my[i] = y[j];
} }
for (j = 0; j < nj; ++j) for (j = 0; j < nj; ++j)
@@ -123,8 +130,9 @@ template <int Which> struct BAR
if (tr != rr) if (tr != rr)
{ {
log_error("ERROR: sub_group_barrier mismatch for local " log_error("ERROR: sub_group_barrier mismatch for local "
"id %d in sub group %d in group %d\n", "id %d in sub group %d in group %d expected "
i, j, k); "%d got %d\n",
i, j, k, tr, rr);
return -1; return -1;
} }
} }
@@ -144,18 +152,18 @@ int test_barrier_functions(cl_device_id device, cl_context context,
cl_command_queue queue, int num_elements, cl_command_queue queue, int num_elements,
bool useCoreSubgroups) bool useCoreSubgroups)
{ {
int error; int error = TEST_PASS;
// Adjust these individually below if desired/needed // Adjust these individually below if desired/needed
#define G 2000 constexpr size_t global_work_size = 2000;
#define L 200 constexpr size_t local_work_size = 200;
WorkGroupParams test_params(global_work_size, local_work_size);
error = test<cl_int, BAR<0>, G, L>::run(device, context, queue, test_params.use_core_subgroups = useCoreSubgroups;
num_elements, "test_lbar", error = test<cl_int, BAR<0>>::run(device, context, queue, num_elements,
lbar_source, 0, useCoreSubgroups); "test_lbar", lbar_source, test_params);
error = test<cl_int, BAR<1>, G, L, G>::run( error |= test<cl_int, BAR<1>, global_work_size>::run(
device, context, queue, num_elements, "test_gbar", gbar_source, 0, device, context, queue, num_elements, "test_gbar", gbar_source,
useCoreSubgroups); test_params);
return error; return error;
} }

View File

@@ -46,7 +46,7 @@ static const char *ifp_source =
"#define INST_COUNT 0x3\n" "#define INST_COUNT 0x3\n"
"\n" "\n"
"__kernel void\n" "__kernel void\n"
"test_ifp(const __global int *in, __global int2 *xy, __global int *out)\n" "test_ifp(const __global int *in, __global int4 *xy, __global int *out)\n"
"{\n" "{\n"
" __local atomic_int loc[NUM_LOC];\n" " __local atomic_int loc[NUM_LOC];\n"
"\n" "\n"
@@ -225,10 +225,15 @@ void run_insts(cl_int *x, cl_int *p, int n)
struct IFP struct IFP
{ {
static void gen(cl_int *x, cl_int *t, cl_int *, int ns, int nw, int ng) static void gen(cl_int *x, cl_int *t, cl_int *,
const WorkGroupParams &test_params)
{ {
int k; int k;
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
int nj = (nw + ns - 1) / ns; int nj = (nw + ns - 1) / ns;
ng = ng / nw;
// We need at least 2 sub groups per group for this test // We need at least 2 sub groups per group for this test
if (nj == 1) return; if (nj == 1) return;
@@ -240,11 +245,15 @@ struct IFP
} }
} }
static int chk(cl_int *x, cl_int *y, cl_int *t, cl_int *, cl_int *, int ns, static int chk(cl_int *x, cl_int *y, cl_int *t, cl_int *, cl_int *,
int nw, int ng) const WorkGroupParams &test_params)
{ {
int i, k; int i, k;
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
int nj = (nw + ns - 1) / ns; int nj = (nw + ns - 1) / ns;
ng = ng / nw;
// We need at least 2 sub groups per group for this tes // We need at least 2 sub groups per group for this tes
if (nj == 1) return 0; if (nj == 1) return 0;
@@ -275,14 +284,17 @@ struct IFP
int test_ifp(cl_device_id device, cl_context context, cl_command_queue queue, int test_ifp(cl_device_id device, cl_context context, cl_command_queue queue,
int num_elements, bool useCoreSubgroups) int num_elements, bool useCoreSubgroups)
{ {
int error; int error = TEST_PASS;
// Global/local work group sizes
// Adjust these individually below if desired/needed // Adjust these individually below if desired/needed
#define G 2000 constexpr size_t global_work_size = 2000;
#define L 200 constexpr size_t local_work_size = 200;
error = test<cl_int, IFP, G, L>::run(device, context, queue, num_elements, WorkGroupParams test_params(global_work_size, local_work_size);
"test_ifp", ifp_source, NUM_LOC + 1, test_params.use_core_subgroups = useCoreSubgroups;
useCoreSubgroups); test_params.dynsc = NUM_LOC + 1;
error = test<cl_int, IFP>::run(device, context, queue, num_elements,
"test_ifp", ifp_source, test_params);
return error; return error;
} }

View File

@@ -0,0 +1,217 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "procs.h"
#include "subhelpers.h"
#include "subgroup_common_kernels.h"
#include "subgroup_common_templates.h"
#include "harness/conversions.h"
#include "harness/typeWrappers.h"
namespace {
// Any/All test functions
// Any/All test functions.
// gen() seeds each subgroup with one of three patterns (all zero, a single
// non-zero element, all ones); chk() compares the device's any/all vote
// against a host-side reference.
template <NonUniformVoteOp operation> struct AA
{
    static void gen(cl_int *x, cl_int *t, cl_int *m,
                    const WorkGroupParams &test_params)
    {
        int ng = test_params.global_workgroup_size;
        int nw = test_params.local_workgroup_size;
        int ns = test_params.subgroup_size;
        int nj = (nw + ns - 1) / ns;
        ng = ng / nw;
        log_info(" sub_group_%s...\n", operation_names(operation));
        for (int k = 0; k < ng; ++k)
        {
            for (int j = 0; j < nj; ++j)
            {
                int ii = j * ns;
                int n = ii + ns > nw ? nw - ii : ns;
                // Pick one of three per-subgroup patterns at random.
                int pattern = (int)(genrand_int32(gMTdata) % 3);
                if (pattern == 0)
                {
                    // All elements zero.
                    memset(&t[ii], 0, n * sizeof(cl_int));
                }
                else if (pattern == 1)
                {
                    // Exactly one non-zero element at a random position.
                    memset(&t[ii], 0, n * sizeof(cl_int));
                    int pos = (int)(genrand_int32(gMTdata) % (cl_uint)n);
                    t[ii + pos] = 41;
                }
                else
                {
                    // All elements non-zero (every bit set).
                    memset(&t[ii], 0xff, n * sizeof(cl_int));
                }
            }
            // Copy the generated patterns into this work group's input.
            for (int j = 0; j < nw; ++j) x[j] = t[j];
            x += nw;
            m += 4 * nw;
        }
    }
    static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m,
                   const WorkGroupParams &test_params)
    {
        int ng = test_params.global_workgroup_size;
        int nw = test_params.local_workgroup_size;
        int ns = test_params.subgroup_size;
        int nj = (nw + ns - 1) / ns;
        ng = ng / nw;
        for (int k = 0; k < ng; ++k)
        {
            // Mirror the device buffers into host-side scratch arrays.
            for (int j = 0; j < nw; ++j)
            {
                mx[j] = x[j];
                my[j] = y[j];
            }
            for (int j = 0; j < nj; ++j)
            {
                int ii = j * ns;
                int n = ii + ns > nw ? nw - ii : ns;
                // Host reference: logical OR (any) / AND (all) over the
                // subgroup's inputs.
                cl_int expected = 0;
                if (operation == NonUniformVoteOp::any)
                {
                    for (int i = 0; i < n; ++i) expected |= mx[ii + i] != 0;
                }
                if (operation == NonUniformVoteOp::all)
                {
                    expected = 1;
                    for (int i = 0; i < n; ++i) expected &= mx[ii + i] != 0;
                }
                // Every work item must report the subgroup-wide vote.
                for (int i = 0; i < n; ++i)
                {
                    cl_int device_result = my[ii + i] != 0;
                    if (device_result != expected)
                    {
                        log_error("ERROR: sub_group_%s mismatch for local id "
                                  "%d in sub group %d in group %d\n",
                                  operation_names(operation), i, j, k);
                        return TEST_FAIL;
                    }
                }
            }
            x += nw;
            y += nw;
            m += 4 * nw;
        }
        log_info(" sub_group_%s... passed\n", operation_names(operation));
        return TEST_PASS;
    }
};
// OpenCL C kernel: each work item records its mapping via XY and stores the
// subgroup-wide sub_group_any() vote of its input element.
static const char *any_source = "__kernel void test_any(const __global Type "
                                "*in, __global int4 *xy, __global Type *out)\n"
                                "{\n"
                                " int gid = get_global_id(0);\n"
                                " XY(xy,gid);\n"
                                " out[gid] = sub_group_any(in[gid]);\n"
                                "}\n";
// OpenCL C kernel: same shape as above but with the sub_group_all() vote.
static const char *all_source = "__kernel void test_all(const __global Type "
                                "*in, __global int4 *xy, __global Type *out)\n"
                                "{\n"
                                " int gid = get_global_id(0);\n"
                                " XY(xy,gid);\n"
                                " out[gid] = sub_group_all(in[gid]);\n"
                                "}\n";
// Runs broadcast plus every (reduce / inclusive-scan / exclusive-scan) x
// (add / max / min) combination for type T.
// Error codes are OR-combined so all sub-tests run even after a failure.
template <typename T>
int run_broadcast_scan_reduction_for_type(RunTestForType rft)
{
    int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>(
        "test_bcast", bcast_source);
    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("test_redadd",
                                                            redadd_source);
    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("test_redmax",
                                                            redmax_source);
    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("test_redmin",
                                                            redmin_source);
    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>("test_scinadd",
                                                             scinadd_source);
    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>("test_scinmax",
                                                             scinmax_source);
    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>("test_scinmin",
                                                             scinmin_source);
    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>("test_scexadd",
                                                             scexadd_source);
    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>("test_scexmax",
                                                             scexmax_source);
    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>("test_scexmin",
                                                             scexmin_source);
    return error;
}
}
// Entry point from main: runs the any/all vote tests and the
// broadcast/scan/reduction suite for every applicable scalar type.
int test_subgroup_functions(cl_device_id device, cl_context context,
                            cl_command_queue queue, int num_elements,
                            bool useCoreSubgroups)
{
    // Work sizes shared by all sub-cases; adjust here if needed.
    constexpr size_t global_work_size = 2000;
    constexpr size_t local_work_size = 200;
    WorkGroupParams test_params(global_work_size, local_work_size);
    // Propagate the core-vs-extension selection to the test harness;
    // previously the parameter was accepted but silently ignored (the
    // barrier tests at the top of this change set do assign it).
    test_params.use_core_subgroups = useCoreSubgroups;
    RunTestForType rft(device, context, queue, num_elements, test_params);
    int error =
        rft.run_impl<cl_int, AA<NonUniformVoteOp::any>>("test_any", any_source);
    error |=
        rft.run_impl<cl_int, AA<NonUniformVoteOp::all>>("test_all", all_source);
    error |= run_broadcast_scan_reduction_for_type<cl_int>(rft);
    error |= run_broadcast_scan_reduction_for_type<cl_uint>(rft);
    error |= run_broadcast_scan_reduction_for_type<cl_long>(rft);
    error |= run_broadcast_scan_reduction_for_type<cl_ulong>(rft);
    error |= run_broadcast_scan_reduction_for_type<cl_float>(rft);
    error |= run_broadcast_scan_reduction_for_type<cl_double>(rft);
    error |= run_broadcast_scan_reduction_for_type<subgroups::cl_half>(rft);
    return error;
}
// Core-profile entry point: runs the suite assuming built-in (OpenCL 2.x
// core) subgroup support, so no extension query is performed.
int test_subgroup_functions_core(cl_device_id device, cl_context context,
                                 cl_command_queue queue, int num_elements)
{
    return test_subgroup_functions(device, context, queue, num_elements, true);
}
// Extension entry point: skips unless the device exposes cl_khr_subgroups,
// then runs the suite through the extension path.
int test_subgroup_functions_ext(cl_device_id device, cl_context context,
                                cl_command_queue queue, int num_elements)
{
    // Guard clause: without the extension there is nothing to test.
    if (!is_extension_available(device, "cl_khr_subgroups"))
    {
        log_info(
            "Device does not support 'cl_khr_subgroups'. Skipping the test.\n");
        return TEST_SKIPPED_ITSELF;
    }
    return test_subgroup_functions(device, context, queue, num_elements, false);
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,340 @@
//
// Copyright (c) 2021 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "procs.h"
#include "subhelpers.h"
#include "subgroup_common_templates.h"
#include "harness/typeWrappers.h"
// Cluster size used by every clustered-reduce kernel below; the string form
// is spliced directly into the OpenCL C source at preprocessing time.
#define CLUSTER_SIZE 4
#define CLUSTER_SIZE_STR "4"
namespace {
// Each kernel follows the same pattern: record the work-item mapping via XY,
// store sizeof(Type) into xy[gid].w only if the built-in returns the same
// type it consumes (host chk() verifies this), then write the clustered
// reduction result to out[gid].
static const char *redadd_clustered_source =
    "__kernel void test_redadd_clustered(const __global Type *in, __global "
    "int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " xy[gid].w = 0;\n"
    " if (sizeof(in[gid]) == "
    "sizeof(sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR ")))\n"
    " {xy[gid].w = sizeof(in[gid]);}\n"
    " out[gid] = sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR
    ");\n"
    "}\n";
static const char *redmax_clustered_source =
    "__kernel void test_redmax_clustered(const __global Type *in, __global "
    "int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " xy[gid].w = 0;\n"
    " if (sizeof(in[gid]) == "
    "sizeof(sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR ")))\n"
    " {xy[gid].w = sizeof(in[gid]);}\n"
    " out[gid] = sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR
    ");\n"
    "}\n";
static const char *redmin_clustered_source =
    "__kernel void test_redmin_clustered(const __global Type *in, __global "
    "int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " xy[gid].w = 0;\n"
    " if (sizeof(in[gid]) == "
    "sizeof(sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR ")))\n"
    " {xy[gid].w = sizeof(in[gid]);}\n"
    " out[gid] = sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR
    ");\n"
    "}\n";
static const char *redmul_clustered_source =
    "__kernel void test_redmul_clustered(const __global Type *in, __global "
    "int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " xy[gid].w = 0;\n"
    " if (sizeof(in[gid]) == "
    "sizeof(sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR ")))\n"
    " {xy[gid].w = sizeof(in[gid]);}\n"
    " out[gid] = sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR
    ");\n"
    "}\n";
static const char *redand_clustered_source =
    "__kernel void test_redand_clustered(const __global Type *in, __global "
    "int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " xy[gid].w = 0;\n"
    " if (sizeof(in[gid]) == "
    "sizeof(sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR ")))\n"
    " {xy[gid].w = sizeof(in[gid]);}\n"
    " out[gid] = sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR
    ");\n"
    "}\n";
static const char *redor_clustered_source =
    "__kernel void test_redor_clustered(const __global Type *in, __global int4 "
    "*xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " xy[gid].w = 0;\n"
    " if (sizeof(in[gid]) == "
    "sizeof(sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR ")))\n"
    " {xy[gid].w = sizeof(in[gid]);}\n"
    " out[gid] = sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR
    ");\n"
    "}\n";
static const char *redxor_clustered_source =
    "__kernel void test_redxor_clustered(const __global Type *in, __global "
    "int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " xy[gid].w = 0;\n"
    " if (sizeof(in[gid]) == "
    "sizeof(sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR ")))\n"
    " {xy[gid].w = sizeof(in[gid]);}\n"
    " out[gid] = sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR
    ");\n"
    "}\n";
// Logical variants: same structure, operating on the truth value of the
// inputs rather than their bit patterns.
static const char *redand_clustered_logical_source =
    "__kernel void test_redand_clustered_logical(const __global Type *in, "
    "__global int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " xy[gid].w = 0;\n"
    " if (sizeof(in[gid]) == "
    "sizeof(sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR
    ")))\n"
    " {xy[gid].w = sizeof(in[gid]);}\n"
    " out[gid] = "
    "sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR ");\n"
    "}\n";
static const char *redor_clustered_logical_source =
    "__kernel void test_redor_clustered_logical(const __global Type *in, "
    "__global int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " xy[gid].w = 0;\n"
    " if (sizeof(in[gid]) == "
    "sizeof(sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR
    ")))\n"
    " {xy[gid].w = sizeof(in[gid]);}\n"
    " out[gid] = "
    "sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR ");\n"
    "}\n";
static const char *redxor_clustered_logical_source =
    "__kernel void test_redxor_clustered_logical(const __global Type *in, "
    "__global int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " xy[gid].w = 0;\n"
    " if ( sizeof(in[gid]) == "
    "sizeof(sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR
    ")))\n"
    " {xy[gid].w = sizeof(in[gid]);}\n"
    " out[gid] = "
    "sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR ");\n"
    "}\n";
// DESCRIPTION:
// Test for reduce cluster functions.
// gen() produces random input; chk() recomputes each cluster's reduction on
// the host and also validates the data-type-size sanity value the kernels
// store in xy[gid].w.
template <typename Ty, ArithmeticOp operation> struct RED_CLU
{
    // Generate random input for the clustered reduction test.
    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
    {
        int nw = test_params.local_workgroup_size;
        int ns = test_params.subgroup_size;
        int ng = test_params.global_workgroup_size;
        ng = ng / nw; // number of work groups
        // Cast: sizeof yields size_t, which does not match %d on LP64.
        log_info(" sub_group_clustered_reduce_%s(%s, %d bytes) ...\n",
                 operation_names(operation), TypeManager<Ty>::name(),
                 (int)sizeof(Ty));
        genrand<Ty, operation>(x, t, m, ns, nw, ng);
    }
    // Verify device output y against a host-side per-cluster reduction of x.
    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
                   const WorkGroupParams &test_params)
    {
        int nw = test_params.local_workgroup_size;
        int ns = test_params.subgroup_size;
        int ng = test_params.global_workgroup_size;
        int nj = (nw + ns - 1) / ns; // subgroups per work group
        ng = ng / nw;                // number of work groups
        for (int k = 0; k < ng; ++k)
        {
            std::vector<cl_int> data_type_sizes;
            // Map to array indexed to array indexed by local ID and sub group
            for (int j = 0; j < nw; ++j)
            {
                mx[j] = x[j];
                my[j] = y[j];
                data_type_sizes.push_back(m[4 * j + 3]);
            }
            // The kernel recorded sizeof(Type) only if the built-in's return
            // type matched its input type; any mismatch fails the test.
            for (cl_int dts : data_type_sizes)
            {
                if (dts != sizeof(Ty))
                {
                    log_error("ERROR: sub_group_clustered_reduce_%s(%s) "
                              "wrong data type size detected, expected: %d, "
                              "used by device %d, in group %d\n",
                              operation_names(operation),
                              TypeManager<Ty>::name(), (int)sizeof(Ty), dts, k);
                    return TEST_FAIL;
                }
            }
            for (int j = 0; j < nj; ++j)
            {
                int ii = j * ns;
                int n = ii + ns > nw ? nw - ii : ns;
                // Round up so a subgroup size that is not a multiple of
                // CLUSTER_SIZE still gets a slot for its trailing partial
                // cluster. The previous ns / CLUSTER_SIZE under-allocated
                // and clusters_results[i / CLUSTER_SIZE] could write past
                // the end of the vector.
                int clusters_counter =
                    (ns + CLUSTER_SIZE - 1) / CLUSTER_SIZE;
                std::vector<Ty> clusters_results(clusters_counter);
                // Compute target: restart the accumulator at every cluster
                // boundary and record the running value for the cluster.
                Ty tr = mx[ii];
                for (int i = 0; i < n; ++i)
                {
                    if (i % CLUSTER_SIZE == 0)
                        tr = mx[ii + i];
                    else
                        tr = calculate<Ty>(tr, mx[ii + i], operation);
                    clusters_results[i / CLUSTER_SIZE] = tr;
                }
                // Check result: every work item must hold its cluster's value.
                for (int i = 0; i < n; ++i)
                {
                    Ty rr = my[ii + i];
                    tr = clusters_results[i / CLUSTER_SIZE];
                    if (!compare(rr, tr))
                    {
                        log_error(
                            "ERROR: sub_group_clustered_reduce_%s(%s) mismatch "
                            "for local id %d in sub group %d in group %d\n",
                            operation_names(operation), TypeManager<Ty>::name(),
                            i, j, k);
                        return TEST_FAIL;
                    }
                }
            }
            x += nw;
            y += nw;
            m += 4 * nw;
        }
        log_info(" sub_group_clustered_reduce_%s(%s, %d bytes) ... passed\n",
                 operation_names(operation), TypeManager<Ty>::name(),
                 (int)sizeof(Ty));
        return TEST_PASS;
    }
};
// Runs the clustered reduce add/max/min/mul tests for type T.
// Error codes are OR-combined so every sub-test executes.
template <typename T>
int run_cluster_red_add_max_min_mul_for_type(RunTestForType rft)
{
    int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::add_>>(
        "test_redadd_clustered", redadd_clustered_source);
    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::max_>>(
        "test_redmax_clustered", redmax_clustered_source);
    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::min_>>(
        "test_redmin_clustered", redmin_clustered_source);
    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::mul_>>(
        "test_redmul_clustered", redmul_clustered_source);
    return error;
}
// Runs the bitwise clustered reduce and/or/xor tests for (integer) type T.
template <typename T> int run_cluster_and_or_xor_for_type(RunTestForType rft)
{
    int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::and_>>(
        "test_redand_clustered", redand_clustered_source);
    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::or_>>(
        "test_redor_clustered", redor_clustered_source);
    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::xor_>>(
        "test_redxor_clustered", redxor_clustered_source);
    return error;
}
// Runs the logical clustered reduce and/or/xor tests for type T
// (truth-value semantics rather than bitwise).
template <typename T>
int run_cluster_logical_and_or_xor_for_type(RunTestForType rft)
{
    int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_and>>(
        "test_redand_clustered_logical", redand_clustered_logical_source);
    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_or>>(
        "test_redor_clustered_logical", redor_clustered_logical_source);
    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_xor>>(
        "test_redxor_clustered_logical", redxor_clustered_logical_source);
    return error;
}
}
// Entry point: runs all clustered-reduce tests. The harness skips the whole
// suite if the device lacks the listed extension.
int test_subgroup_functions_clustered_reduce(cl_device_id device,
                                             cl_context context,
                                             cl_command_queue queue,
                                             int num_elements)
{
    std::vector<std::string> required_extensions = {
        "cl_khr_subgroup_clustered_reduce"
    };
    // Work sizes shared by all sub-cases; adjust here if needed.
    constexpr size_t global_work_size = 2000;
    constexpr size_t local_work_size = 200;
    WorkGroupParams test_params(global_work_size, local_work_size,
                                required_extensions);
    RunTestForType rft(device, context, queue, num_elements, test_params);
    // add/max/min/mul: all scalar integer and floating-point types.
    int error = run_cluster_red_add_max_min_mul_for_type<cl_int>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_uint>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_long>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_ulong>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_short>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_ushort>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_char>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_uchar>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_float>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_double>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<subgroups::cl_half>(rft);
    // Bitwise and/or/xor: integer types only.
    error |= run_cluster_and_or_xor_for_type<cl_int>(rft);
    error |= run_cluster_and_or_xor_for_type<cl_uint>(rft);
    error |= run_cluster_and_or_xor_for_type<cl_long>(rft);
    error |= run_cluster_and_or_xor_for_type<cl_ulong>(rft);
    error |= run_cluster_and_or_xor_for_type<cl_short>(rft);
    error |= run_cluster_and_or_xor_for_type<cl_ushort>(rft);
    error |= run_cluster_and_or_xor_for_type<cl_char>(rft);
    error |= run_cluster_and_or_xor_for_type<cl_uchar>(rft);
    // Logical variants are exercised for cl_int only.
    error |= run_cluster_logical_and_or_xor_for_type<cl_int>(rft);
    return error;
}

View File

@@ -0,0 +1,138 @@
//
// Copyright (c) 2021 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "procs.h"
#include "subhelpers.h"
#include "subgroup_common_kernels.h"
#include "subgroup_common_templates.h"
#include "harness/typeWrappers.h"
namespace {
// Runs the broadcast test for one extended (vector or half) type T and
// returns its error code directly.
template <typename T> int run_broadcast_for_extended_type(RunTestForType rft)
{
    return rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>(
        "test_bcast", bcast_source);
}
// Runs every (reduce / inclusive-scan / exclusive-scan) x (add / max / min)
// combination for type T, OR-combining the error codes.
template <typename T> int run_scan_reduction_for_type(RunTestForType rft)
{
    int error = rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("test_redadd",
                                                               redadd_source);
    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("test_redmax",
                                                            redmax_source);
    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("test_redmin",
                                                            redmin_source);
    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>("test_scinadd",
                                                             scinadd_source);
    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>("test_scinmax",
                                                             scinmax_source);
    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>("test_scinmin",
                                                             scinmin_source);
    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>("test_scexadd",
                                                             scexadd_source);
    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>("test_scexmax",
                                                             scexmax_source);
    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>("test_scexmin",
                                                             scexmin_source);
    return error;
}
}
// Entry point: runs broadcast for every vector width (2/3/4/8/16) of every
// base type, plus scan/reduction for the newly supported 8- and 16-bit
// scalar types. Requires the extended-types subgroup extension.
int test_subgroup_functions_extended_types(cl_device_id device,
                                           cl_context context,
                                           cl_command_queue queue,
                                           int num_elements)
{
    std::vector<std::string> required_extensions = {
        "cl_khr_subgroup_extended_types"
    };
    constexpr size_t global_work_size = 2000;
    constexpr size_t local_work_size = 200;
    WorkGroupParams test_params(global_work_size, local_work_size,
                                required_extensions);
    RunTestForType rft(device, context, queue, num_elements, test_params);
    // 32-bit integer vectors.
    int error = run_broadcast_for_extended_type<cl_uint2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_uint3>(rft);
    error |= run_broadcast_for_extended_type<cl_uint4>(rft);
    error |= run_broadcast_for_extended_type<cl_uint8>(rft);
    error |= run_broadcast_for_extended_type<cl_uint16>(rft);
    error |= run_broadcast_for_extended_type<cl_int2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_int3>(rft);
    error |= run_broadcast_for_extended_type<cl_int4>(rft);
    error |= run_broadcast_for_extended_type<cl_int8>(rft);
    error |= run_broadcast_for_extended_type<cl_int16>(rft);
    // 64-bit integer vectors.
    error |= run_broadcast_for_extended_type<cl_ulong2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_ulong3>(rft);
    error |= run_broadcast_for_extended_type<cl_ulong4>(rft);
    error |= run_broadcast_for_extended_type<cl_ulong8>(rft);
    error |= run_broadcast_for_extended_type<cl_ulong16>(rft);
    error |= run_broadcast_for_extended_type<cl_long2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_long3>(rft);
    error |= run_broadcast_for_extended_type<cl_long4>(rft);
    error |= run_broadcast_for_extended_type<cl_long8>(rft);
    error |= run_broadcast_for_extended_type<cl_long16>(rft);
    // Floating-point vectors.
    error |= run_broadcast_for_extended_type<cl_float2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_float3>(rft);
    error |= run_broadcast_for_extended_type<cl_float4>(rft);
    error |= run_broadcast_for_extended_type<cl_float8>(rft);
    error |= run_broadcast_for_extended_type<cl_float16>(rft);
    error |= run_broadcast_for_extended_type<cl_double2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_double3>(rft);
    error |= run_broadcast_for_extended_type<cl_double4>(rft);
    error |= run_broadcast_for_extended_type<cl_double8>(rft);
    error |= run_broadcast_for_extended_type<cl_double16>(rft);
    // 16-bit integer vectors.
    error |= run_broadcast_for_extended_type<cl_ushort2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_ushort3>(rft);
    error |= run_broadcast_for_extended_type<cl_ushort4>(rft);
    error |= run_broadcast_for_extended_type<cl_ushort8>(rft);
    error |= run_broadcast_for_extended_type<cl_ushort16>(rft);
    error |= run_broadcast_for_extended_type<cl_short2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_short3>(rft);
    error |= run_broadcast_for_extended_type<cl_short4>(rft);
    error |= run_broadcast_for_extended_type<cl_short8>(rft);
    error |= run_broadcast_for_extended_type<cl_short16>(rft);
    // 8-bit integer vectors.
    error |= run_broadcast_for_extended_type<cl_uchar2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_uchar3>(rft);
    error |= run_broadcast_for_extended_type<cl_uchar4>(rft);
    error |= run_broadcast_for_extended_type<cl_uchar8>(rft);
    error |= run_broadcast_for_extended_type<cl_uchar16>(rft);
    error |= run_broadcast_for_extended_type<cl_char2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_char3>(rft);
    error |= run_broadcast_for_extended_type<cl_char4>(rft);
    error |= run_broadcast_for_extended_type<cl_char8>(rft);
    error |= run_broadcast_for_extended_type<cl_char16>(rft);
    // Half-precision vectors (project-provided wrappers).
    error |= run_broadcast_for_extended_type<subgroups::cl_half2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_half3>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_half4>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_half8>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_half16>(rft);
    // Scans and reductions for the new 8/16-bit scalar types.
    error |= run_scan_reduction_for_type<cl_uchar>(rft);
    error |= run_scan_reduction_for_type<cl_char>(rft);
    error |= run_scan_reduction_for_type<cl_ushort>(rft);
    error |= run_scan_reduction_for_type<cl_short>(rft);
    return error;
}

View File

@@ -0,0 +1,473 @@
//
// Copyright (c) 2021 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "procs.h"
#include "subhelpers.h"
#include "harness/typeWrappers.h"
#include "subgroup_common_templates.h"
namespace {
// Non-uniform inclusive-scan kernels. Each work item tests its lane bit
// against WORK_ITEMS_MASK (a compile-time constant injected by the test
// harness at build time); only the selected work items execute the
// non-uniform built-in, so out[] is written only for active lanes.
static const char *scinadd_non_uniform_source = R"(
__kernel void test_scinadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_add(in[gid]);
}
}
)";
static const char *scinmax_non_uniform_source = R"(
__kernel void test_scinmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_max(in[gid]);
}
}
)";
static const char *scinmin_non_uniform_source = R"(
__kernel void test_scinmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_min(in[gid]);
}
}
)";
static const char *scinmul_non_uniform_source = R"(
__kernel void test_scinmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_mul(in[gid]);
}
}
)";
static const char *scinand_non_uniform_source = R"(
__kernel void test_scinand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_and(in[gid]);
}
}
)";
static const char *scinor_non_uniform_source = R"(
__kernel void test_scinor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_or(in[gid]);
}
}
)";
static const char *scinxor_non_uniform_source = R"(
__kernel void test_scinxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_xor(in[gid]);
}
}
)";
static const char *scinand_non_uniform_logical_source = R"(
__kernel void test_scinand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_logical_and(in[gid]);
}
}
)";
static const char *scinor_non_uniform_logical_source = R"(
__kernel void test_scinor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_logical_or(in[gid]);
}
}
)";
static const char *scinxor_non_uniform_logical_source = R"(
__kernel void test_scinxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_logical_xor(in[gid]);
}
}
)";
// Kernels for the sub_group_non_uniform_scan_exclusive_* built-ins.
// WORK_ITEMS_MASK is a 32-bit pattern supplied at kernel build time; a work
// item takes part in the non-uniform call only when its lane bit is set, so
// inactive lanes leave their out[] slot untouched.
// The lane bit is computed with unsigned arithmetic (uint / 1u) so that lane
// 31 (mask 0x80000000 is used by the host) does not left-shift into the sign
// bit of a signed int, mirroring the vote-test kernels which use uint.
// exclusive scan: add
static const char *scexadd_non_uniform_source = R"(
__kernel void test_scexadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_add(in[gid]);
}
}
)";
// exclusive scan: max
static const char *scexmax_non_uniform_source = R"(
__kernel void test_scexmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_max(in[gid]);
}
}
)";
// exclusive scan: min
static const char *scexmin_non_uniform_source = R"(
__kernel void test_scexmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_min(in[gid]);
}
}
)";
// exclusive scan: mul
static const char *scexmul_non_uniform_source = R"(
__kernel void test_scexmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_mul(in[gid]);
}
}
)";
// exclusive scan: bitwise and
static const char *scexand_non_uniform_source = R"(
__kernel void test_scexand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_and(in[gid]);
}
}
)";
// exclusive scan: bitwise or
static const char *scexor_non_uniform_source = R"(
__kernel void test_scexor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_or(in[gid]);
}
}
)";
// exclusive scan: bitwise xor
static const char *scexxor_non_uniform_source = R"(
__kernel void test_scexxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_xor(in[gid]);
}
}
)";
// exclusive scan: logical and
static const char *scexand_non_uniform_logical_source = R"(
__kernel void test_scexand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_logical_and(in[gid]);
}
}
)";
// exclusive scan: logical or
static const char *scexor_non_uniform_logical_source = R"(
__kernel void test_scexor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_logical_or(in[gid]);
}
}
)";
// exclusive scan: logical xor
static const char *scexxor_non_uniform_logical_source = R"(
__kernel void test_scexxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_logical_xor(in[gid]);
}
}
)";
// Kernels for the sub_group_non_uniform_reduce_* built-ins.
// WORK_ITEMS_MASK is a 32-bit pattern supplied at kernel build time; a work
// item takes part in the non-uniform call only when its lane bit is set, so
// inactive lanes leave their out[] slot untouched.
// The lane bit is computed with unsigned arithmetic (uint / 1u) so that lane
// 31 (mask 0x80000000 is used by the host) does not left-shift into the sign
// bit of a signed int, mirroring the vote-test kernels which use uint.
// reduce: add
static const char *redadd_non_uniform_source = R"(
__kernel void test_redadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_add(in[gid]);
}
}
)";
// reduce: max
static const char *redmax_non_uniform_source = R"(
__kernel void test_redmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_max(in[gid]);
}
}
)";
// reduce: min
static const char *redmin_non_uniform_source = R"(
__kernel void test_redmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_min(in[gid]);
}
}
)";
// reduce: mul
static const char *redmul_non_uniform_source = R"(
__kernel void test_redmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_mul(in[gid]);
}
}
)";
// reduce: bitwise and
static const char *redand_non_uniform_source = R"(
__kernel void test_redand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_and(in[gid]);
}
}
)";
// reduce: bitwise or
static const char *redor_non_uniform_source = R"(
__kernel void test_redor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_or(in[gid]);
}
}
)";
// reduce: bitwise xor
static const char *redxor_non_uniform_source = R"(
__kernel void test_redxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_xor(in[gid]);
}
}
)";
// reduce: logical and
static const char *redand_non_uniform_logical_source = R"(
__kernel void test_redand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_logical_and(in[gid]);
}
}
)";
// reduce: logical or
static const char *redor_non_uniform_logical_source = R"(
__kernel void test_redor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_logical_or(in[gid]);
}
}
)";
// reduce: logical xor
static const char *redxor_non_uniform_logical_source = R"(
__kernel void test_redxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_logical_xor(in[gid]);
}
}
)";
// Exercises the non-uniform inclusive-scan, exclusive-scan and reduce
// built-ins for the add/mul/max/min operations with element type T.
// Returns 0 when every sub-test passes, non-zero otherwise.
template <typename T>
int run_functions_add_mul_max_min_for_type(RunTestForType rft)
{
    int err = 0;
    // Inclusive scans.
    err |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>(
        "test_scinadd_non_uniform", scinadd_non_uniform_source);
    err |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::mul_>>(
        "test_scinmul_non_uniform", scinmul_non_uniform_source);
    err |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>(
        "test_scinmax_non_uniform", scinmax_non_uniform_source);
    err |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>(
        "test_scinmin_non_uniform", scinmin_non_uniform_source);
    // Exclusive scans.
    err |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>(
        "test_scexadd_non_uniform", scexadd_non_uniform_source);
    err |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::mul_>>(
        "test_scexmul_non_uniform", scexmul_non_uniform_source);
    err |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>(
        "test_scexmax_non_uniform", scexmax_non_uniform_source);
    err |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>(
        "test_scexmin_non_uniform", scexmin_non_uniform_source);
    // Reductions.
    err |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>(
        "test_redadd_non_uniform", redadd_non_uniform_source);
    err |= rft.run_impl<T, RED_NU<T, ArithmeticOp::mul_>>(
        "test_redmul_non_uniform", redmul_non_uniform_source);
    err |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>(
        "test_redmax_non_uniform", redmax_non_uniform_source);
    err |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>(
        "test_redmin_non_uniform", redmin_non_uniform_source);
    return err;
}
// Exercises the non-uniform scan and reduce built-ins for the bitwise
// and/or/xor operations with element type T. Returns 0 on full success.
template <typename T> int run_functions_and_or_xor_for_type(RunTestForType rft)
{
    int err = 0;
    // Inclusive scans.
    err |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::and_>>(
        "test_scinand_non_uniform", scinand_non_uniform_source);
    err |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::or_>>(
        "test_scinor_non_uniform", scinor_non_uniform_source);
    err |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::xor_>>(
        "test_scinxor_non_uniform", scinxor_non_uniform_source);
    // Exclusive scans.
    err |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::and_>>(
        "test_scexand_non_uniform", scexand_non_uniform_source);
    err |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::or_>>(
        "test_scexor_non_uniform", scexor_non_uniform_source);
    err |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::xor_>>(
        "test_scexxor_non_uniform", scexxor_non_uniform_source);
    // Reductions.
    err |= rft.run_impl<T, RED_NU<T, ArithmeticOp::and_>>(
        "test_redand_non_uniform", redand_non_uniform_source);
    err |= rft.run_impl<T, RED_NU<T, ArithmeticOp::or_>>(
        "test_redor_non_uniform", redor_non_uniform_source);
    err |= rft.run_impl<T, RED_NU<T, ArithmeticOp::xor_>>(
        "test_redxor_non_uniform", redxor_non_uniform_source);
    return err;
}
// Exercises the non-uniform scan and reduce built-ins for the logical
// and/or/xor operations with element type T. Returns 0 on full success.
template <typename T>
int run_functions_logical_and_or_xor_for_type(RunTestForType rft)
{
    int err = 0;
    // Inclusive scans.
    err |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_and>>(
        "test_scinand_non_uniform_logical", scinand_non_uniform_logical_source);
    err |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_or>>(
        "test_scinor_non_uniform_logical", scinor_non_uniform_logical_source);
    err |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_xor>>(
        "test_scinxor_non_uniform_logical", scinxor_non_uniform_logical_source);
    // Exclusive scans.
    err |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_and>>(
        "test_scexand_non_uniform_logical", scexand_non_uniform_logical_source);
    err |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_or>>(
        "test_scexor_non_uniform_logical", scexor_non_uniform_logical_source);
    err |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_xor>>(
        "test_scexxor_non_uniform_logical", scexxor_non_uniform_logical_source);
    // Reductions.
    err |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_and>>(
        "test_redand_non_uniform_logical", redand_non_uniform_logical_source);
    err |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_or>>(
        "test_redor_non_uniform_logical", redor_non_uniform_logical_source);
    err |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_xor>>(
        "test_redxor_non_uniform_logical", redxor_non_uniform_logical_source);
    return err;
}
}
// Entry point for the cl_khr_subgroup_non_uniform_arithmetic test suite.
// Builds the work-group parameters (including the set of work-item masks that
// select which lanes enter the non-uniform call) and runs every operation
// variant for all supported element types. Returns 0 on success.
int test_subgroup_functions_non_uniform_arithmetic(cl_device_id device,
                                                   cl_context context,
                                                   cl_command_queue queue,
                                                   int num_elements)
{
    std::vector<std::string> required_extensions = {
        "cl_khr_subgroup_non_uniform_arithmetic"
    };
    // 32-bit patterns; bit N set means work item N of the subgroup is active.
    std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555,
                                 0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00,
                                 0x00ffff00, 0x80000000, 0xaaaaaaaa };
    constexpr size_t global_work_size = 2000;
    constexpr size_t local_work_size = 200;
    WorkGroupParams test_params(global_work_size, local_work_size,
                                required_extensions, masks);
    RunTestForType rft(device, context, queue, num_elements, test_params);

    int err = 0;
    // add/mul/max/min are defined for every integer and floating type.
    err |= run_functions_add_mul_max_min_for_type<cl_int>(rft);
    err |= run_functions_add_mul_max_min_for_type<cl_uint>(rft);
    err |= run_functions_add_mul_max_min_for_type<cl_long>(rft);
    err |= run_functions_add_mul_max_min_for_type<cl_ulong>(rft);
    err |= run_functions_add_mul_max_min_for_type<cl_short>(rft);
    err |= run_functions_add_mul_max_min_for_type<cl_ushort>(rft);
    err |= run_functions_add_mul_max_min_for_type<cl_char>(rft);
    err |= run_functions_add_mul_max_min_for_type<cl_uchar>(rft);
    err |= run_functions_add_mul_max_min_for_type<cl_float>(rft);
    err |= run_functions_add_mul_max_min_for_type<cl_double>(rft);
    err |= run_functions_add_mul_max_min_for_type<subgroups::cl_half>(rft);
    // Bitwise and/or/xor only apply to the integer types.
    err |= run_functions_and_or_xor_for_type<cl_int>(rft);
    err |= run_functions_and_or_xor_for_type<cl_uint>(rft);
    err |= run_functions_and_or_xor_for_type<cl_long>(rft);
    err |= run_functions_and_or_xor_for_type<cl_ulong>(rft);
    err |= run_functions_and_or_xor_for_type<cl_short>(rft);
    err |= run_functions_and_or_xor_for_type<cl_ushort>(rft);
    err |= run_functions_and_or_xor_for_type<cl_char>(rft);
    err |= run_functions_and_or_xor_for_type<cl_uchar>(rft);
    // Logical and/or/xor are run for cl_int only.
    err |= run_functions_logical_and_or_xor_for_type<cl_int>(rft);
    return err;
}

View File

@@ -0,0 +1,303 @@
//
// Copyright (c) 2021 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "procs.h"
#include "subhelpers.h"
#include "harness/typeWrappers.h"
#include <set>
namespace {
// Host-side data generator (gen) and result checker (chk) for the
// sub_group_elect() and sub_group_non_uniform_{all,any,all_equal}() tests.
// T is the element type; operation selects which built-in is verified.
template <typename T, NonUniformVoteOp operation> struct VOTE
{
    // Fill the host input buffer for one run.
    //   x - flattened per-work-group input data handed to the device
    //   t - scratch buffer holding one work-group worth of values
    //   m - xy map buffer from the device (advanced here but never read)
    // For elect there is no input to prepare, so gen only logs and returns.
    static void gen(T *x, T *t, cl_int *m, const WorkGroupParams &test_params)
    {
        int i, ii, j, k, n;
        int nw = test_params.local_workgroup_size;
        int ns = test_params.subgroup_size;
        int ng = test_params.global_workgroup_size;
        uint32_t work_items_mask = test_params.work_items_mask;
        int nj = (nw + ns - 1) / ns; // subgroups per work-group
        int non_uniform_size = ng % nw; // non-zero => last group is smaller
        ng = ng / nw;
        int last_subgroup_size = 0;
        ii = 0;
        log_info(" sub_group_%s%s... \n",
                 (operation == NonUniformVoteOp::elect) ? "" : "non_uniform_",
                 operation_names(operation));
        log_info(" test params: global size = %d local size = %d subgroups "
                 "size = %d work item mask = 0x%x data type (%s)\n",
                 test_params.global_workgroup_size, nw, ns, work_items_mask,
                 TypeManager<T>::name());
        if (non_uniform_size)
        {
            log_info(" non uniform work group size mode ON\n");
        }
        // sub_group_elect() takes no input argument: nothing to generate.
        if (operation == NonUniformVoteOp::elect) return;
        for (k = 0; k < ng; ++k)
        { // for each work_group
            if (non_uniform_size && k == ng - 1)
            {
                // Re-derive nj/last_subgroup_size for the smaller trailing
                // work-group.
                set_last_workgroup_params(non_uniform_size, nj, ns, nw,
                                          last_subgroup_size);
            }
            for (j = 0; j < nj; ++j)
            { // for each subgroup
                ii = j * ns;
                if (last_subgroup_size && j == nj - 1)
                {
                    n = last_subgroup_size;
                }
                else
                {
                    n = ii + ns > nw ? nw - ii : ns;
                }
                // e selects this subgroup's fill pattern:
                //   0 -> all zeros, 1 -> all ones, 2 -> alternating 0/1.
                int e = genrand_int32(gMTdata) % 3;
                for (i = 0; i < n; i++)
                {
                    if (e == 2)
                    { // set once 0 and once 1 alternately
                        int value = i % 2;
                        set_value(t[ii + i], value);
                    }
                    else
                    { // set 0/1 for all work items in subgroup
                        set_value(t[ii + i], e);
                    }
                }
            }
            // Now map into work group using map from device
            for (j = 0; j < nw; ++j)
            {
                x[j] = t[j];
            }
            x += nw;
            m += 4 * nw;
        }
    }

    // Verify device output y against the expected value computed on the host
    // from input x. Returns TEST_PASS or TEST_FAIL.
    //   x  - host-generated input data    y  - device output data
    //   mx - scratch copy of one work-group of x
    //   my - scratch copy of one work-group of y
    //   m  - xy map buffer (advanced here but never read)
    static int chk(T *x, T *y, T *mx, T *my, cl_int *m,
                   const WorkGroupParams &test_params)
    {
        int ii, i, j, k, n;
        int nw = test_params.local_workgroup_size;
        int ns = test_params.subgroup_size;
        int ng = test_params.global_workgroup_size;
        uint32_t work_items_mask = test_params.work_items_mask;
        int nj = (nw + ns - 1) / ns;
        cl_int tr, rr; // tr = expected value, rr = normalized device value
        int non_uniform_size = ng % nw;
        ng = ng / nw;
        if (non_uniform_size) ng++; // account for the smaller trailing group
        int last_subgroup_size = 0;
        for (k = 0; k < ng; ++k)
        { // for each work_group
            if (non_uniform_size && k == ng - 1)
            {
                set_last_workgroup_params(non_uniform_size, nj, ns, nw,
                                          last_subgroup_size);
            }
            for (j = 0; j < nw; ++j)
            { // inside the work_group
                mx[j] = x[j]; // read host inputs for work_group
                my[j] = y[j]; // read device outputs for work_group
            }
            for (j = 0; j < nj; ++j)
            { // for each subgroup
                ii = j * ns;
                if (last_subgroup_size && j == nj - 1)
                {
                    n = last_subgroup_size;
                }
                else
                {
                    n = ii + ns > nw ? nw - ii : ns;
                }
                rr = 0;
                // Seed the expected value: all/all_equal use AND semantics
                // (start true); any uses OR semantics (start false).
                if (operation == NonUniformVoteOp::all
                    || operation == NonUniformVoteOp::all_equal)
                    tr = 1;
                if (operation == NonUniformVoteOp::any) tr = 0;
                std::set<int> active_work_items;
                for (i = 0; i < n; ++i)
                {
                    // NOTE(review): for i % 32 == 31 this shifts a signed int
                    // literal into the sign bit; `1u <<` would be safer —
                    // confirm intended.
                    uint32_t check_work_item = 1 << (i % 32);
                    if (work_items_mask & check_work_item)
                    {
                        active_work_items.insert(i);
                        switch (operation)
                        {
                            case NonUniformVoteOp::elect: break;
                            case NonUniformVoteOp::all:
                                tr &=
                                    !compare_ordered<T>(mx[ii + i], 0) ? 1 : 0;
                                break;
                            case NonUniformVoteOp::any:
                                tr |=
                                    !compare_ordered<T>(mx[ii + i], 0) ? 1 : 0;
                                break;
                            case NonUniformVoteOp::all_equal:
                                // Compare against the lowest active item's
                                // input value.
                                tr &= compare_ordered<T>(
                                          mx[ii + i],
                                          mx[ii + *active_work_items.begin()])
                                    ? 1
                                    : 0;
                                break;
                            default:
                                log_error("Unknown operation\n");
                                return TEST_FAIL;
                        }
                    }
                }
                if (active_work_items.empty())
                {
                    log_info(" no one workitem acitve... in workgroup id = %d "
                             "subgroup id = %d\n",
                             k, j);
                }
                else
                {
                    auto lowest_active = active_work_items.begin();
                    for (const int &active_work_item : active_work_items)
                    {
                        i = active_work_item;
                        // elect: only the lowest active work item is expected
                        // to observe 1; every other active item observes 0.
                        if (operation == NonUniformVoteOp::elect)
                        {
                            i == *lowest_active ? tr = 1 : tr = 0;
                        }
                        // normalize device values on host, non zero set 1.
                        rr = compare_ordered<T>(my[ii + i], 0) ? 0 : 1;
                        if (rr != tr)
                        {
                            log_error("ERROR: sub_group_%s() \n",
                                      operation_names(operation));
                            log_error(
                                "mismatch for work item %d sub group %d in "
                                "work group %d. Expected: %d Obtained: %d\n",
                                i, j, k, tr, rr);
                            return TEST_FAIL;
                        }
                    }
                }
            }
            x += nw;
            y += nw;
            m += 4 * nw;
        }
        log_info(" sub_group_%s%s... passed\n",
                 (operation == NonUniformVoteOp::elect) ? "" : "non_uniform_",
                 operation_names(operation));
        return TEST_PASS;
    }
};
// Vote-test kernels. WORK_ITEMS_MASK is a 32-bit pattern supplied at build
// time; only work items whose lane bit is set enter the call, so inactive
// lanes leave their out[] slot untouched.
// Only the lowest active lane of each subgroup gets a non-zero result.
static const char *elect_source = R"(
__kernel void test_elect(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_elect();
}
}
)";
// True when any active lane's input is non-zero.
static const char *non_uniform_any_source = R"(
__kernel void test_non_uniform_any(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_any(in[gid]);
}
}
)";
// True when every active lane's input is non-zero.
static const char *non_uniform_all_source = R"(
__kernel void test_non_uniform_all(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_all(in[gid]);
}
}
)";
// True when every active lane holds the same input value.
static const char *non_uniform_all_equal_source = R"(
__kernel void test_non_uniform_all_equal(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_all_equal(in[gid]);
}
}
)";
// Runs the sub_group_non_uniform_all_equal test for element type T.
// Returns 0 on success, non-zero on failure.
template <typename T> int run_vote_all_equal_for_type(RunTestForType rft)
{
    return rft.run_impl<T, VOTE<T, NonUniformVoteOp::all_equal>>(
        "test_non_uniform_all_equal", non_uniform_all_equal_source);
}
}
// Entry point for the cl_khr_subgroup_non_uniform_vote test suite.
// all_equal is exercised for every supported element type; all, elect and
// any use cl_int as a representative type. Returns 0 on success.
int test_subgroup_functions_non_uniform_vote(cl_device_id device,
                                             cl_context context,
                                             cl_command_queue queue,
                                             int num_elements)
{
    std::vector<std::string> required_extensions = {
        "cl_khr_subgroup_non_uniform_vote"
    };
    // 32-bit patterns; bit N set means work item N of the subgroup is active.
    std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555,
                                 0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00,
                                 0x00ffff00, 0x80000000 };
    constexpr size_t global_work_size = 170;
    constexpr size_t local_work_size = 64;
    WorkGroupParams test_params(global_work_size, local_work_size,
                                required_extensions, masks);
    RunTestForType rft(device, context, queue, num_elements, test_params);

    int err = 0;
    err |= run_vote_all_equal_for_type<cl_int>(rft);
    err |= run_vote_all_equal_for_type<cl_uint>(rft);
    err |= run_vote_all_equal_for_type<cl_long>(rft);
    err |= run_vote_all_equal_for_type<cl_ulong>(rft);
    err |= run_vote_all_equal_for_type<cl_float>(rft);
    err |= run_vote_all_equal_for_type<cl_double>(rft);
    err |= run_vote_all_equal_for_type<subgroups::cl_half>(rft);
    err |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::all>>(
        "test_non_uniform_all", non_uniform_all_source);
    err |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::elect>>(
        "test_elect", elect_source);
    err |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::any>>(
        "test_non_uniform_any", non_uniform_any_source);
    return err;
}

View File

@@ -0,0 +1,78 @@
//
// Copyright (c) 2021 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "procs.h"
#include "subhelpers.h"
#include "subgroup_common_templates.h"
#include "harness/typeWrappers.h"
#include <bitset>
namespace {
// Kernel: each work item fetches the value held by the lane whose index is
// its own sub-group local id XOR'd with xy[gid].z.
static const char* shuffle_xor_source =
    "__kernel void test_sub_group_shuffle_xor(const __global Type *in, "
    "__global int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " Type x = in[gid];\n"
    " out[gid] = sub_group_shuffle_xor(x, xy[gid].z);"
    "}\n";
// Kernel: each work item fetches the value held by the lane whose index is
// xy[gid].z.
static const char* shuffle_source =
    "__kernel void test_sub_group_shuffle(const __global Type *in, __global "
    "int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " Type x = in[gid];\n"
    " out[gid] = sub_group_shuffle(x, xy[gid].z);"
    "}\n";
// Runs sub_group_shuffle and sub_group_shuffle_xor for element type T.
// Returns 0 on success, non-zero on failure.
template <typename T> int run_shuffle_for_type(RunTestForType rft)
{
    int err = 0;
    err |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle>>("test_sub_group_shuffle",
                                                       shuffle_source);
    err |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_xor>>(
        "test_sub_group_shuffle_xor", shuffle_xor_source);
    return err;
}
}
// Entry point for the cl_khr_subgroup_shuffle test suite. Runs the shuffle
// and shuffle_xor built-ins for every supported element type.
// Returns 0 on success.
int test_subgroup_functions_shuffle(cl_device_id device, cl_context context,
                                    cl_command_queue queue, int num_elements)
{
    std::vector<std::string> required_extensions{ "cl_khr_subgroup_shuffle" };
    constexpr size_t global_work_size = 2000;
    constexpr size_t local_work_size = 200;
    WorkGroupParams test_params(global_work_size, local_work_size,
                                required_extensions);
    RunTestForType rft(device, context, queue, num_elements, test_params);

    int err = 0;
    err |= run_shuffle_for_type<cl_int>(rft);
    err |= run_shuffle_for_type<cl_uint>(rft);
    err |= run_shuffle_for_type<cl_long>(rft);
    err |= run_shuffle_for_type<cl_ulong>(rft);
    err |= run_shuffle_for_type<cl_short>(rft);
    err |= run_shuffle_for_type<cl_ushort>(rft);
    err |= run_shuffle_for_type<cl_char>(rft);
    err |= run_shuffle_for_type<cl_uchar>(rft);
    err |= run_shuffle_for_type<cl_float>(rft);
    err |= run_shuffle_for_type<cl_double>(rft);
    err |= run_shuffle_for_type<subgroups::cl_half>(rft);
    return err;
}

View File

@@ -0,0 +1,81 @@
//
// Copyright (c) 2021 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "procs.h"
#include "subhelpers.h"
#include "subgroup_common_templates.h"
#include "harness/conversions.h"
#include "harness/typeWrappers.h"
namespace {
// Kernel: each work item fetches the value held by the lane xy[gid].z
// positions below its own sub-group local id.
static const char* shuffle_down_source =
    "__kernel void test_sub_group_shuffle_down(const __global Type *in, "
    "__global int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " Type x = in[gid];\n"
    " out[gid] = sub_group_shuffle_down(x, xy[gid].z);"
    "}\n";
// Kernel: each work item fetches the value held by the lane xy[gid].z
// positions above its own sub-group local id.
static const char* shuffle_up_source =
    "__kernel void test_sub_group_shuffle_up(const __global Type *in, __global "
    "int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " Type x = in[gid];\n"
    " out[gid] = sub_group_shuffle_up(x, xy[gid].z);"
    "}\n";
// Runs sub_group_shuffle_up and sub_group_shuffle_down for element type T.
// Returns 0 on success, non-zero on failure.
template <typename T> int run_shuffle_relative_for_type(RunTestForType rft)
{
    int err = 0;
    err |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_up>>(
        "test_sub_group_shuffle_up", shuffle_up_source);
    err |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_down>>(
        "test_sub_group_shuffle_down", shuffle_down_source);
    return err;
}
}
// Entry point for the cl_khr_subgroup_shuffle_relative test suite. Runs the
// shuffle_up and shuffle_down built-ins for every supported element type.
// Returns 0 on success.
int test_subgroup_functions_shuffle_relative(cl_device_id device,
                                             cl_context context,
                                             cl_command_queue queue,
                                             int num_elements)
{
    std::vector<std::string> required_extensions = {
        "cl_khr_subgroup_shuffle_relative"
    };
    constexpr size_t global_work_size = 2000;
    constexpr size_t local_work_size = 200;
    WorkGroupParams test_params(global_work_size, local_work_size,
                                required_extensions);
    RunTestForType rft(device, context, queue, num_elements, test_params);

    int err = 0;
    err |= run_shuffle_relative_for_type<cl_int>(rft);
    err |= run_shuffle_relative_for_type<cl_uint>(rft);
    err |= run_shuffle_relative_for_type<cl_long>(rft);
    err |= run_shuffle_relative_for_type<cl_ulong>(rft);
    err |= run_shuffle_relative_for_type<cl_short>(rft);
    err |= run_shuffle_relative_for_type<cl_ushort>(rft);
    err |= run_shuffle_relative_for_type<cl_char>(rft);
    err |= run_shuffle_relative_for_type<cl_uchar>(rft);
    err |= run_shuffle_relative_for_type<cl_float>(rft);
    err |= run_shuffle_relative_for_type<cl_double>(rft);
    err |= run_shuffle_relative_for_type<subgroups::cl_half>(rft);
    return err;
}

View File

@@ -1,727 +0,0 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "procs.h"
#include "subhelpers.h"
#include "harness/conversions.h"
#include "harness/typeWrappers.h"
// Legacy core-subgroup kernels (uniform work-group variants; this file is
// being removed in favour of the extension-specific tests).
// sub_group_any: non-zero for the whole subgroup if any input is non-zero.
static const char *any_source = "__kernel void test_any(const __global Type "
                                "*in, __global int2 *xy, __global Type *out)\n"
                                "{\n"
                                " int gid = get_global_id(0);\n"
                                " XY(xy,gid);\n"
                                " out[gid] = sub_group_any(in[gid]);\n"
                                "}\n";
// sub_group_all: non-zero only if every input in the subgroup is non-zero.
static const char *all_source = "__kernel void test_all(const __global Type "
                                "*in, __global int2 *xy, __global Type *out)\n"
                                "{\n"
                                " int gid = get_global_id(0);\n"
                                " XY(xy,gid);\n"
                                " out[gid] = sub_group_all(in[gid]);\n"
                                "}\n";
// sub_group_broadcast: broadcast from the lane encoded in the input value
// (x % 100 picks the source local id).
static const char *bcast_source =
    "__kernel void test_bcast(const __global Type *in, __global int2 *xy, "
    "__global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " Type x = in[gid];\n"
    " size_t loid = (size_t)((int)x % 100);\n"
    " out[gid] = sub_group_broadcast(x, loid);\n"
    "}\n";
// sub_group_reduce_add
static const char *redadd_source =
    "__kernel void test_redadd(const __global Type *in, __global int2 *xy, "
    "__global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " out[gid] = sub_group_reduce_add(in[gid]);\n"
    "}\n";
// sub_group_reduce_max
static const char *redmax_source =
    "__kernel void test_redmax(const __global Type *in, __global int2 *xy, "
    "__global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " out[gid] = sub_group_reduce_max(in[gid]);\n"
    "}\n";
// sub_group_reduce_min
static const char *redmin_source =
    "__kernel void test_redmin(const __global Type *in, __global int2 *xy, "
    "__global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " out[gid] = sub_group_reduce_min(in[gid]);\n"
    "}\n";
// sub_group_scan_inclusive_add
static const char *scinadd_source =
    "__kernel void test_scinadd(const __global Type *in, __global int2 *xy, "
    "__global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " out[gid] = sub_group_scan_inclusive_add(in[gid]);\n"
    "}\n";
// sub_group_scan_inclusive_max
static const char *scinmax_source =
    "__kernel void test_scinmax(const __global Type *in, __global int2 *xy, "
    "__global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " out[gid] = sub_group_scan_inclusive_max(in[gid]);\n"
    "}\n";
// sub_group_scan_inclusive_min
static const char *scinmin_source =
    "__kernel void test_scinmin(const __global Type *in, __global int2 *xy, "
    "__global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " out[gid] = sub_group_scan_inclusive_min(in[gid]);\n"
    "}\n";
// sub_group_scan_exclusive_add
static const char *scexadd_source =
    "__kernel void test_scexadd(const __global Type *in, __global int2 *xy, "
    "__global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " out[gid] = sub_group_scan_exclusive_add(in[gid]);\n"
    "}\n";
// sub_group_scan_exclusive_max
static const char *scexmax_source =
    "__kernel void test_scexmax(const __global Type *in, __global int2 *xy, "
    "__global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " out[gid] = sub_group_scan_exclusive_max(in[gid]);\n"
    "}\n";
// sub_group_scan_exclusive_min
static const char *scexmin_source =
    "__kernel void test_scexmin(const __global Type *in, __global int2 *xy, "
    "__global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " out[gid] = sub_group_scan_exclusive_min(in[gid]);\n"
    "}\n";
// Any/All test functions
template <int Which> struct AA
{
static void gen(cl_int *x, cl_int *t, cl_int *m, int ns, int nw, int ng)
{
int i, ii, j, k, n;
int nj = (nw + ns - 1) / ns;
int e;
ii = 0;
for (k = 0; k < ng; ++k)
{
for (j = 0; j < nj; ++j)
{
ii = j * ns;
n = ii + ns > nw ? nw - ii : ns;
e = (int)(genrand_int32(gMTdata) % 3);
// Initialize data matrix indexed by local id and sub group id
switch (e)
{
case 0: memset(&t[ii], 0, n * sizeof(cl_int)); break;
case 1:
memset(&t[ii], 0, n * sizeof(cl_int));
i = (int)(genrand_int32(gMTdata) % (cl_uint)n);
t[ii + i] = 41;
break;
case 2: memset(&t[ii], 0xff, n * sizeof(cl_int)); break;
}
}
// Now map into work group using map from device
for (j = 0; j < nw; ++j)
{
i = m[2 * j + 1] * ns + m[2 * j];
x[j] = t[i];
}
x += nw;
m += 2 * nw;
}
}
static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m,
int ns, int nw, int ng)
{
int ii, i, j, k, n;
int nj = (nw + ns - 1) / ns;
cl_int taa, raa;
log_info(" sub_group_%s...\n", Which == 0 ? "any" : "all");
for (k = 0; k < ng; ++k)
{
// Map to array indexed to array indexed by local ID and sub group
for (j = 0; j < nw; ++j)
{
i = m[2 * j + 1] * ns + m[2 * j];
mx[i] = x[j];
my[i] = y[j];
}
for (j = 0; j < nj; ++j)
{
ii = j * ns;
n = ii + ns > nw ? nw - ii : ns;
// Compute target
if (Which == 0)
{
taa = 0;
for (i = 0; i < n; ++i) taa |= mx[ii + i] != 0;
}
else
{
taa = 1;
for (i = 0; i < n; ++i) taa &= mx[ii + i] != 0;
}
// Check result
for (i = 0; i < n; ++i)
{
raa = my[ii + i] != 0;
if (raa != taa)
{
log_error("ERROR: sub_group_%s mismatch for local id "
"%d in sub group %d in group %d\n",
Which == 0 ? "any" : "all", i, j, k);
return -1;
}
}
}
x += nw;
y += nw;
m += 2 * nw;
}
return 0;
}
};
// Reduce functions.
// Which selects the operation: 0 = add, 1 = max, 2 = min.
template <typename Ty, int Which> struct RED
{
    // Generate random input values in [1, ns] for each sub group, then
    // scatter into device (work group) order using the id map m.
    // Small positive values keep the add reduction overflow free.
    static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
    {
        int nj = (nw + ns - 1) / ns;
        for (int k = 0; k < ng; ++k)
        {
            for (int j = 0; j < nj; ++j)
            {
                int ii = j * ns;
                // The last sub group of a work group may be partial.
                int n = ii + ns > nw ? nw - ii : ns;
                for (int i = 0; i < n; ++i)
                    t[ii + i] = (Ty)(
                        (int)(genrand_int32(gMTdata) & 0x7fffffff) % ns + 1);
            }
            // Scatter into work group order using the map from the device.
            for (int j = 0; j < nw; ++j)
            {
                x[j] = t[m[2 * j + 1] * ns + m[2 * j]];
            }
            x += nw;
            m += 2 * nw;
        }
    }
    // Compare device output y against a host-side reduction over each sub
    // group. Returns 0 on success, -1 on the first mismatch.
    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw,
                   int ng)
    {
        int nj = (nw + ns - 1) / ns;
        log_info(" sub_group_reduce_%s(%s)...\n",
                 Which == 0 ? "add" : (Which == 1 ? "max" : "min"),
                 TypeName<Ty>::val());
        for (int k = 0; k < ng; ++k)
        {
            // Gather input and output into (sub group, lane) layout.
            for (int j = 0; j < nw; ++j)
            {
                int i = m[2 * j + 1] * ns + m[2 * j];
                mx[i] = x[j];
                my[i] = y[j];
            }
            for (int j = 0; j < nj; ++j)
            {
                int ii = j * ns;
                int n = ii + ns > nw ? nw - ii : ns;
                // Host-side reference reduction over lanes [0, n).
                Ty tr = mx[ii];
                for (int i = 1; i < n; ++i)
                {
                    Ty v = mx[ii + i];
                    if (Which == 0)
                        tr += v; // add
                    else if (Which == 1)
                        tr = tr > v ? tr : v; // max
                    else
                        tr = tr > v ? v : tr; // min
                }
                // Every lane must carry the full reduction result.
                for (int i = 0; i < n; ++i)
                {
                    if (my[ii + i] != tr)
                    {
                        log_error("ERROR: sub_group_reduce_%s(%s) mismatch for "
                                  "local id %d in sub group %d in group %d\n",
                                  Which == 0 ? "add"
                                             : (Which == 1 ? "max" : "min"),
                                  TypeName<Ty>::val(), i, j, k);
                        return -1;
                    }
                }
            }
            x += nw;
            y += nw;
            m += 2 * nw;
        }
        return 0;
    }
};
// Scan Inclusive functions.
// Which selects the operation: 0 = add, 1 = max, 2 = min.
template <typename Ty, int Which> struct SCIN
{
    // Fill the input buffer for sub_group_scan_inclusive_{add,max,min}.
    // Each sub group gets the deterministic ramp 0,1,2,... so the host-side
    // reference scan stays exact for every tested element type.
    //   x  - output: data laid out in device (work group) order
    //   t  - scratch: data indexed by (sub group id, lane)
    //   m  - per work item (local id, sub group id) map read from the device
    //   ns - sub group size; nw - work group size; ng - number of groups
    static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
    {
        int i, ii, j, k, n;
        int nj = (nw + ns - 1) / ns;
        for (k = 0; k < ng; ++k)
        {
            for (j = 0; j < nj; ++j)
            {
                ii = j * ns;
                // The last sub group of a work group may be partial.
                n = ii + ns > nw ? nw - ii : ns;
                for (i = 0; i < n; ++i) t[ii + i] = (Ty)i;
            }
            // Now map into work group using map from device
            for (j = 0; j < nw; ++j)
            {
                i = m[2 * j + 1] * ns + m[2 * j];
                x[j] = t[i];
            }
            x += nw;
            m += 2 * nw;
        }
    }
    // Verify device output y against a host-side inclusive scan: lane i must
    // hold the operation applied over lanes [0, i] of its sub group.
    // Returns 0 on success, -1 on the first mismatch.
    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw,
                   int ng)
    {
        int ii, i, j, k, n;
        int nj = (nw + ns - 1) / ns;
        Ty tr, rr;
        log_info(" sub_group_scan_inclusive_%s(%s)...\n",
                 Which == 0 ? "add" : (Which == 1 ? "max" : "min"),
                 TypeName<Ty>::val());
        for (k = 0; k < ng; ++k)
        {
            // Gather input and output into (sub group, lane) layout.
            for (j = 0; j < nw; ++j)
            {
                i = m[2 * j + 1] * ns + m[2 * j];
                mx[i] = x[j];
                my[i] = y[j];
            }
            for (j = 0; j < nj; ++j)
            {
                ii = j * ns;
                n = ii + ns > nw ? nw - ii : ns;
                // Running inclusive scan: tr accumulates lanes [0, i].
                for (i = 0; i < n; ++i)
                {
                    if (Which == 0)
                    {
                        tr = i == 0 ? mx[ii] : tr + mx[ii + i];
                    }
                    else if (Which == 1)
                    {
                        tr = i == 0 ? mx[ii]
                                    : (tr > mx[ii + i] ? tr : mx[ii + i]);
                    }
                    else
                    {
                        tr = i == 0 ? mx[ii]
                                    : (tr > mx[ii + i] ? mx[ii + i] : tr);
                    }
                    rr = my[ii + i];
                    if (rr != tr)
                    {
                        log_error(
                            "ERROR: sub_group_scan_inclusive_%s(%s) mismatch "
                            "for local id %d in sub group %d in group %d\n",
                            Which == 0 ? "add" : (Which == 1 ? "max" : "min"),
                            TypeName<Ty>::val(), i, j, k);
                        return -1;
                    }
                }
            }
            x += nw;
            y += nw;
            m += 2 * nw;
        }
        return 0;
    }
};
// Scan Exclusive functions.
// Which selects the operation: 0 = add, 1 = max, 2 = min.
template <typename Ty, int Which> struct SCEX
{
    // Fill the input buffer with random values in [1, ns] per sub group,
    // then scatter into device (work group) order via the id map m.
    //   x  - output: data laid out in device (work group) order
    //   t  - scratch: data indexed by (sub group id, lane)
    //   m  - per work item (local id, sub group id) map read from the device
    //   ns - sub group size; nw - work group size; ng - number of groups
    static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
    {
        int i, ii, j, k, n;
        int nj = (nw + ns - 1) / ns;
        ii = 0;
        for (k = 0; k < ng; ++k)
        {
            for (j = 0; j < nj; ++j)
            {
                ii = j * ns;
                // The last sub group of a work group may be partial.
                n = ii + ns > nw ? nw - ii : ns;
                for (i = 0; i < n; ++i)
                    t[ii + i] = (Ty)(
                        (int)(genrand_int32(gMTdata) & 0x7fffffff) % ns + 1);
            }
            // Now map into work group using map from device
            for (j = 0; j < nw; ++j)
            {
                i = m[2 * j + 1] * ns + m[2 * j];
                x[j] = t[i];
            }
            x += nw;
            m += 2 * nw;
        }
    }
    // Verify device output y against a host-side exclusive scan: lane i must
    // hold the operation applied over lanes [0, i), with lane 0 receiving
    // the operation's identity element. Returns 0 on success, -1 on mismatch.
    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw,
                   int ng)
    {
        int ii, i, j, k, n;
        int nj = (nw + ns - 1) / ns;
        // tr is the running exclusive-scan value; trt carries the previous
        // lane's input (written each iteration, read on the next one, and
        // deliberately unused when i == 0).
        Ty tr, trt, rr;
        log_info(" sub_group_scan_exclusive_%s(%s)...\n",
                 Which == 0 ? "add" : (Which == 1 ? "max" : "min"),
                 TypeName<Ty>::val());
        for (k = 0; k < ng; ++k)
        {
            // Gather input and output into (sub group, lane) layout.
            for (j = 0; j < nw; ++j)
            {
                i = m[2 * j + 1] * ns + m[2 * j];
                mx[i] = x[j];
                my[i] = y[j];
            }
            for (j = 0; j < nj; ++j)
            {
                ii = j * ns;
                n = ii + ns > nw ? nw - ii : ns;
                // Check result
                for (i = 0; i < n; ++i)
                {
                    // Lane 0 expects TypeIdentity (0 for add, type min/max
                    // for max/min); later lanes fold in the previous input.
                    if (Which == 0)
                    {
                        tr = i == 0 ? TypeIdentity<Ty, Which>::val() : tr + trt;
                    }
                    else if (Which == 1)
                    {
                        tr = i == 0 ? TypeIdentity<Ty, Which>::val()
                                    : (trt > tr ? trt : tr);
                    }
                    else
                    {
                        tr = i == 0 ? TypeIdentity<Ty, Which>::val()
                                    : (trt > tr ? tr : trt);
                    }
                    trt = mx[ii + i];
                    rr = my[ii + i];
                    if (rr != tr)
                    {
                        log_error(
                            "ERROR: sub_group_scan_exclusive_%s(%s) mismatch "
                            "for local id %d in sub group %d in group %d\n",
                            Which == 0 ? "add" : (Which == 1 ? "max" : "min"),
                            TypeName<Ty>::val(), i, j, k);
                        return -1;
                    }
                }
            }
            x += nw;
            y += nw;
            m += 2 * nw;
        }
        return 0;
    }
};
// Broadcast functions.
template <typename Ty> struct BC
{
    // Generate broadcast inputs. Every element of a sub group encodes the
    // same randomly chosen broadcast lane l in its two low decimal digits
    // (value % 100 == l), which the kernel extracts as the source lane.
    static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
    {
        int nj = (nw + ns - 1) / ns;
        // At most 100 lanes are addressable through the 2-digit encoding.
        int d = ns > 100 ? 100 : ns;
        for (int k = 0; k < ng; ++k)
        {
            for (int j = 0; j < nj; ++j)
            {
                int ii = j * ns;
                // The last sub group of a work group may be partial.
                int n = ii + ns > nw ? nw - ii : ns;
                // Broadcast lane must be valid for this sub group's size.
                int l = (int)(genrand_int32(gMTdata) & 0x7fffffff)
                    % (d > n ? n : d);
                for (int i = 0; i < n; ++i)
                {
                    int payload =
                        (int)(genrand_int32(gMTdata) & 0x7fffffff) % 100;
                    t[ii + i] = (Ty)(payload * 100 + l);
                }
            }
            // Scatter into work group order using the map from the device.
            for (int j = 0; j < nw; ++j)
            {
                x[j] = t[m[2 * j + 1] * ns + m[2 * j]];
            }
            x += nw;
            m += 2 * nw;
        }
    }
    // Verify device output y: every lane of a sub group must observe the
    // value held by the encoded broadcast lane. Returns 0 on success,
    // -1 on the first mismatch.
    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw,
                   int ng)
    {
        int nj = (nw + ns - 1) / ns;
        log_info(" sub_group_broadcast(%s)...\n", TypeName<Ty>::val());
        for (int k = 0; k < ng; ++k)
        {
            // Gather input and output into (sub group, lane) layout.
            for (int j = 0; j < nw; ++j)
            {
                int i = m[2 * j + 1] * ns + m[2 * j];
                mx[i] = x[j];
                my[i] = y[j];
            }
            for (int j = 0; j < nj; ++j)
            {
                int ii = j * ns;
                int n = ii + ns > nw ? nw - ii : ns;
                // Decode the broadcast lane and look up the expected value.
                int l = (int)mx[ii] % 100;
                Ty tr = mx[ii + l];
                for (int i = 0; i < n; ++i)
                {
                    if (my[ii + i] != tr)
                    {
                        log_error("ERROR: sub_group_broadcast(%s) mismatch for "
                                  "local id %d in sub group %d in group %d\n",
                                  TypeName<Ty>::val(), i, j, k);
                        return -1;
                    }
                }
            }
            x += nw;
            y += nw;
            m += 2 * nw;
        }
        return 0;
    }
};
// Launch dimensions passed as the last two template arguments of test<>.
// G presumably is the global/group count and L the work group size used by
// the test<> harness — TODO confirm against the test<> template definition.
#define G 2000
#define L 200
// Helper that runs the broadcast/reduce/scan kernel suite for one element
// type. Captures the device/context/queue once so run<T>() can be invoked
// repeatedly for each tested type.
struct run_for_type
{
    // Members are set via the initializer list (idiomatic; avoids
    // default-construct-then-assign in the constructor body).
    run_for_type(cl_device_id device, cl_context context,
                 cl_command_queue queue, int num_elements,
                 bool useCoreSubgroups)
        : device_(device), context_(context), queue_(queue),
          num_elements_(num_elements), useCoreSubgroups_(useCoreSubgroups)
    {}

    // Runs every sub group collective test for element type T.
    // Returns 0 on success; individual error codes are OR-combined, so any
    // non-zero result means at least one test failed.
    template <typename T> cl_int run()
    {
        cl_int error;
        error = test<T, BC<T>, G, L>::run(device_, context_, queue_,
                                          num_elements_, "test_bcast",
                                          bcast_source, 0, useCoreSubgroups_);
        error |= test<T, RED<T, 0>, G, L>::run(
            device_, context_, queue_, num_elements_, "test_redadd",
            redadd_source, 0, useCoreSubgroups_);
        error |= test<T, RED<T, 1>, G, L>::run(
            device_, context_, queue_, num_elements_, "test_redmax",
            redmax_source, 0, useCoreSubgroups_);
        error |= test<T, RED<T, 2>, G, L>::run(
            device_, context_, queue_, num_elements_, "test_redmin",
            redmin_source, 0, useCoreSubgroups_);
        error |= test<T, SCIN<T, 0>, G, L>::run(
            device_, context_, queue_, num_elements_, "test_scinadd",
            scinadd_source, 0, useCoreSubgroups_);
        error |= test<T, SCIN<T, 1>, G, L>::run(
            device_, context_, queue_, num_elements_, "test_scinmax",
            scinmax_source, 0, useCoreSubgroups_);
        error |= test<T, SCIN<T, 2>, G, L>::run(
            device_, context_, queue_, num_elements_, "test_scinmin",
            scinmin_source, 0, useCoreSubgroups_);
        error |= test<T, SCEX<T, 0>, G, L>::run(
            device_, context_, queue_, num_elements_, "test_scexadd",
            scexadd_source, 0, useCoreSubgroups_);
        error |= test<T, SCEX<T, 1>, G, L>::run(
            device_, context_, queue_, num_elements_, "test_scexmax",
            scexmax_source, 0, useCoreSubgroups_);
        error |= test<T, SCEX<T, 2>, G, L>::run(
            device_, context_, queue_, num_elements_, "test_scexmin",
            scexmin_source, 0, useCoreSubgroups_);
        return error;
    }
private:
    cl_device_id device_;
    cl_context context_;
    cl_command_queue queue_;
    int num_elements_;
    bool useCoreSubgroups_;
};
// Entry point from main: runs the whole sub group function suite.
int test_work_group_functions(cl_device_id device, cl_context context,
                              cl_command_queue queue, int num_elements,
                              bool useCoreSubgroups)
{
    // any/all are exercised on int data only; the remaining collectives run
    // for each element type below through run_for_type.
    int error = test<int, AA<0>, G, L>::run(device, context, queue,
                                            num_elements, "test_any",
                                            any_source, 0, useCoreSubgroups);
    error |= test<int, AA<1>, G, L>::run(device, context, queue, num_elements,
                                         "test_all", all_source, 0,
                                         useCoreSubgroups);
    run_for_type rft(device, context, queue, num_elements, useCoreSubgroups);
    error |= rft.run<cl_uint>();
    error |= rft.run<cl_int>();
    error |= rft.run<cl_ulong>();
    error |= rft.run<cl_long>();
    error |= rft.run<float>();
    error |= rft.run<double>();
    // cl_half is intentionally not exercised here.
    // error |= rft.run<cl_half>();
    return error;
}
// Core entry point: runs the suite with core (non-extension) subgroups.
int test_work_group_functions_core(cl_device_id device, cl_context context,
                                   cl_command_queue queue, int num_elements)
{
    const bool useCoreSubgroups = true;
    return test_work_group_functions(device, context, queue, num_elements,
                                     useCoreSubgroups);
}
// Extension entry point: runs the suite through cl_khr_subgroups, skipping
// when the device does not expose the extension.
int test_work_group_functions_ext(cl_device_id device, cl_context context,
                                  cl_command_queue queue, int num_elements)
{
    if (!is_extension_available(device, "cl_khr_subgroups"))
    {
        log_info(
            "Device does not support 'cl_khr_subgroups'. Skipping the test.\n");
        return TEST_SKIPPED_ITSELF;
    }
    return test_work_group_functions(device, context, queue, num_elements,
                                     false);
}